In [1]:
from collections import Counter

import pandas as pd
import xgboost as xgb
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
    scale,
)
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the raw dataset; "nasa.csv" is resolved relative to the notebook's working directory.
df_raw = pd.read_csv("nasa.csv")

In [3]:
# Work on a copy so df_raw keeps the data exactly as loaded; later cells modify df in place.
df = df_raw.copy()

In [4]:
def encoding(data, column_name, function_name):
    """Encode a single categorical column of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place.
    column_name : str
        Name of the one column to encode.
    function_name : class, callable or str
        Encoder to apply. Pass the object itself or its name as a string:

        * ``LabelEncoder``   - replace the column with integer labels
        * ``OneHotEncoder``  - replace the column with one indicator column
          per category except the first (``drop='first'``)
        * ``pd.get_dummies`` / ``'get_dummies'`` - same layout as above,
          built with pandas (``drop_first=True``)
        * ``OrdinalEncoder`` - replace the column with ordinal codes

    Returns
    -------
    None
        Any other ``function_name`` prints a message and leaves ``data``
        unchanged.
    """
    # Dispatch on the encoder's name so classes, functions and plain strings
    # are all accepted and no undefined bare name has to be evaluated.
    if isinstance(function_name, str):
        method = function_name
    else:
        method = getattr(function_name, '__name__', None)

    if method == 'LabelEncoder':
        data[column_name] = LabelEncoder().fit_transform(data[column_name])
    elif method == 'OneHotEncoder':
        values = data[[column_name]].to_numpy()  # 2-D input, shape (n_rows, 1)
        try:
            encoder = OneHotEncoder(drop='first', sparse_output=False)
        except TypeError:
            # Older scikit-learn releases only know the `sparse` keyword.
            encoder = OneHotEncoder(drop='first', sparse=False)
        encoded = encoder.fit_transform(values)
        # drop='first' removes the first category, so output column k
        # belongs to categories_[0][k + 1].
        kept_categories = encoder.categories_[0][1:]
        for position, category in enumerate(kept_categories):
            # Assign the raw array (positional) so a non-default index on
            # `data` cannot misalign the values.
            data[f"{column_name}_{category}"] = encoded[:, position]
        data.drop(columns=column_name, inplace=True)
    elif method == 'get_dummies':
        dummies = pd.get_dummies(data[column_name], prefix=column_name, drop_first=True)
        # Write the result back so this branch mutates `data` like the others.
        data.drop(columns=column_name, inplace=True)
        for dummy_name in dummies.columns:
            data[dummy_name] = dummies[dummy_name]
    elif method == 'OrdinalEncoder':
        values = data[[column_name]].to_numpy()  # 2-D input, shape (n_rows, 1)
        data[column_name] = OrdinalEncoder().fit_transform(values)[:, 0]
    else:
        print('Some data must have been populated incorrectly, please check again')

In [5]:
def standardization(data, column_name, function_name):
    """Standardize a single numeric column of ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place.
    column_name : str
        Name of the one column to standardize.
    function_name : class, callable or str
        Method to apply. Pass the object itself or its name as a string:

        * ``'standardize'``  - ``(column - column.mean()) / column.std()``
          computed with pandas
        * ``StandardScaler`` - ``sklearn.preprocessing.StandardScaler``
        * ``scale``          - ``sklearn.preprocessing.scale``

        Note that pandas' ``std()`` defaults to the sample standard
        deviation, so ``'standardize'`` can differ slightly from the two
        scikit-learn options.

    Returns
    -------
    None
        Any other ``function_name`` prints a message and leaves ``data``
        unchanged.
    """
    # Dispatch on the method's name: there is no `standardize` object to
    # compare against, and strings work for the scikit-learn options too.
    if isinstance(function_name, str):
        method = function_name
    else:
        method = getattr(function_name, '__name__', None)

    if method == 'standardize':
        column = data[column_name]
        data[column_name] = (column - column.mean()) / column.std()
    elif method == 'StandardScaler':
        scaler = StandardScaler()
        # fit_transform returns shape (n_rows, 1); flatten it before
        # assigning it to a single column.
        data[column_name] = scaler.fit_transform(data[[column_name]]).ravel()
    elif method == 'scale':
        data[column_name] = scale(data[column_name])
    else:
        print('Some data must have been populated incorrectly, please check again')

In [6]:
def drop_data(data, columns):
    """Remove columns from ``data`` in place.

    ``columns`` is normally a list of column names; the value is handed
    straight to ``DataFrame.drop``. Nothing is returned.
    """
    data.drop(columns=columns, inplace=True)

In [7]:
# Drop columns from df in place. The metre/mile/feet diameter columns and the
# astronomical/lunar/mile miss-distance columns are removed, leaving the
# KM / kilometers versions (see the df.info() output further down).
# Presumably the dropped unit columns are redundant - confirm against the data.
drop_data(df, ['Orbiting Body', 'Equinox', 'Est Dia in M(min)', 'Est Dia in M(max)', 'Close Approach Date', 
           'Est Dia in Miles(min)','Est Dia in Miles(max)', 'Est Dia in Feet(min)', 'Est Dia in Feet(max)',
           'Miss Dist.(Astronomical)', 'Miss Dist.(lunar)', 'Miss Dist.(miles)', 'Orbit Determination Date'])

In [8]:
# def datasets(data, column_name, test_size, random_state):
#     target = data[column_name]
#     new_data = data.copy()
#     drop_data(new_data, column_name)
#     X = new_data
#     Y = target
#     X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = test_size, random_state = random_state)

In [9]:
# Replace the 'Hazardous' column of df (in place) with integer labels from LabelEncoder.
encoding(df, 'Hazardous', LabelEncoder)

In [10]:
# datasets(df, 'Hazardous', 0.2, 42)

In [11]:
# Settings for the split.
column_name = 'Hazardous'
test_size = 0.2
random_state = 42

# Separate the label column from the features; df itself is left untouched
# because the column is dropped from a copy.
data = df
Y = target = data[column_name]
X = new_data = data.copy()
drop_data(X, column_name)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=random_state,
)

In [12]:
def model_basics(model_name, features=None, target=None):
    """Build one of the preset models, fit it and return it.

    Parameters
    ----------
    model_name : str
        One of ``'lg'`` (LogisticRegression), ``'dt'``
        (DecisionTreeClassifier), ``'rf'`` (RandomForestClassifier),
        ``'knn'`` (KNeighborsClassifier), ``'kmeans'`` (KMeans),
        ``'xgboost'`` (XGBRegressor) or ``'svc'`` (SVC).
    features, target : optional
        Data to fit on. They default to the notebook-level ``X`` and ``Y``;
        pass ``X_train`` / ``y_train`` to keep the test split unseen.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # Factories are lambdas so an estimator is only constructed once a
    # valid name has been chosen.
    factories = {
        'lg': lambda: LogisticRegression(random_state=42),
        'dt': lambda: DecisionTreeClassifier(max_depth=4, min_samples_leaf=3),
        'rf': lambda: RandomForestClassifier(oob_score=True),
        'knn': lambda: KNeighborsClassifier(n_neighbors=2, leaf_size=20, algorithm='kd_tree', p=1),
        'kmeans': lambda: KMeans(n_clusters=2),
        # NOTE(review): a regressor is used although the notebook's target
        # ('Hazardous') is a label - confirm whether XGBClassifier was meant.
        'xgboost': lambda: xgb.XGBRegressor(seed=123, n_estimators=5, max_depth=7),
        'svc': lambda: svm.SVC(kernel='rbf'),
    }
    if model_name not in factories:
        # Fail with a clear error instead of reaching fit() with no model.
        raise ValueError("You chose wrong name. Please check again")

    if features is None:
        features = X
    if target is None:
        target = Y

    model = factories[model_name]()
    model.fit(features, target)
    return model

In [13]:
# Print df's column names, non-null counts and dtypes after the drops/encoding above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4687 entries, 0 to 4686
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Neo Reference ID              4687 non-null   int64  
 1   Name                          4687 non-null   int64  
 2   Absolute Magnitude            4687 non-null   float64
 3   Est Dia in KM(min)            4687 non-null   float64
 4   Est Dia in KM(max)            4687 non-null   float64
 5   Epoch Date Close Approach     4687 non-null   int64  
 6   Relative Velocity km per sec  4687 non-null   float64
 7   Relative Velocity km per hr   4687 non-null   float64
 8   Miles per hour                4687 non-null   float64
 9   Miss Dist.(kilometers)        4687 non-null   float64
 10  Orbit ID                      4687 non-null   int64  
 11  Orbit Uncertainity            4687 non-null   int64  
 12  Minimum Orbit Intersection    4687 non-null   float64
 13  Jup

In [14]:
def model_hyper(model_name, cv=None, features=None, target=None):
    """Grid-search hyper-parameters for one of the supported models.

    Parameters
    ----------
    model_name : str
        'lg' - LogisticRegression
        'dt' - DecisionTreeClassifier
        'rf' - RandomForestClassifier
        'knn' - KNeighborsClassifier
        'kmeans' - KMeans
        'xgboost' - XGBRegressor
        'svc' -  SVC
    cv : optional
        Passed to ``GridSearchCV(cv=...)``. ``None`` selects scikit-learn's
        default cross-validation.
    features, target : optional
        Data to search on. They default to the notebook-level ``X`` and ``Y``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. Models that have no grid
        defined here ('lg', 'dt', 'kmeans', 'svc') are cross-validated with
        their default settings and yield an empty dict.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    # (factory, parameter grid) per supported name. Factories are lambdas so
    # an estimator is only constructed once a valid name has been chosen.
    search_space = {
        'lg': (lambda: LogisticRegression(), {}),
        'dt': (lambda: DecisionTreeClassifier(), {}),
        'rf': (lambda: RandomForestClassifier(),
               {'max_depth': [3, None],
                'max_features': list(range(1, 6)),
                'n_estimators': [10, 50, 100, 200],
                'min_samples_leaf': [1, 3, 5]}),
        'knn': (lambda: KNeighborsClassifier(),
                {"n_neighbors": list(range(2, 20, 2))}),
        'kmeans': (lambda: KMeans(), {}),
        # NOTE(review): 'gpu_hist' presumably needs a GPU-enabled xgboost
        # build - confirm for the target environment.
        'xgboost': (lambda: xgb.XGBRegressor(),
                    {'n_estimators': [50, 100, 200, 500],
                     'max_depth': [4, 5, 6, 7],
                     'learning_rate': [0.05, 0.1, 0.25, 0.5],
                     'subsample': [0.75, 1.00],
                     'tree_method': ['gpu_hist']}),
        'svc': (lambda: svm.SVC(), {}),
    }
    if model_name not in search_space:
        # Fail with a clear error instead of reaching GridSearchCV with no model.
        raise ValueError("You chose wrong name. Please check again")

    if features is None:
        features = X
    if target is None:
        target = Y

    build_model, param_grid = search_space[model_name]
    grid_search = GridSearchCV(build_model(), param_grid=param_grid, cv=cv, verbose=0)
    grid_search.fit(features, target)
    return grid_search.best_params_

In [15]:
def model_sampling(sample):
    """Resample the notebook-level ``X`` / ``Y`` and report class counts.

    Parameters
    ----------
    sample : str
        'rus' = RandomUnderSampler
        'ros' = RandomOverSampler

    Returns
    -------
    tuple or None
        ``(x_resampled, y_resampled)`` for a supported name. Any other
        value prints a message and returns ``None``.
    """
    # NOTE(review): RandomUnderSampler / RandomOverSampler are not imported
    # in the visible cells (presumably from imbalanced-learn) - make sure
    # they are in scope before calling this function.
    if sample == 'rus':
        sampler = RandomUnderSampler(random_state=42, replacement=True)
        original_label = 'original dataset shape:'
    elif sample == 'ros':
        sampler = RandomOverSampler(random_state=42)
        original_label = 'Original dataset shape'
    else:
        print("You chose wrong name. Please check again")
        return None

    x_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Class counts before and after resampling.
    print(original_label, Counter(Y))
    print('Resample dataset shape', Counter(y_resampled))

    return x_resampled, y_resampled

In [16]:
def feature_importance(model, features=None, target=None):
    """Fit ``model`` and plot its feature importances with xgboost.

    Parameters
    ----------
    model :
        Estimator accepted by ``xgb.plot_importance`` (an xgboost model).
    features, target : optional
        Data to fit on. They default to the notebook-level ``X`` and ``Y``.

    Returns
    -------
    The matplotlib Axes returned by ``xgb.plot_importance``. In a notebook
    the figure is rendered inline once the cell finishes, so no explicit
    ``show()`` call is made here.
    """
    if features is None:
        features = X
    if target is None:
        target = Y

    model.fit(features, target)
    # Use the function through the already imported `xgb` module; the bare
    # names `plot_importance` / `pyplot` are not defined in the visible cells.
    axes = xgb.plot_importance(model)
    return axes

In [17]:
def test_results(X, Y, model, model_name, average='micro'):
    """Score a fitted model on the given data.

    Parameters
    ----------
    X, Y :
        Features and true labels to evaluate on.
    model :
        Fitted estimator exposing ``predict``.
    model_name : str
        Label stored in the ``'model'`` column of the result frame.
    average : str, optional
        Averaging mode forwarded to ``f1_score``. The default ``'micro'``
        keeps the previous behaviour.
        NOTE(review): for single-label targets micro-averaged F1 equals
        accuracy, so ``'binary'`` or ``'macro'`` may be more informative.

    Returns
    -------
    tuple
        ``(result, acc, f1)`` where ``result`` is a one-row DataFrame with
        columns ``model``, ``accuracy`` and ``f1``.
    """
    predictions = model.predict(X)
    acc = accuracy_score(Y, predictions)
    f1 = f1_score(Y, predictions, average=average)
    result = pd.DataFrame({'model': [model_name], 'accuracy': [acc], 'f1': [f1]})
    return result, acc, f1