In [1]:
import pandas as pd
import pandas_profiling
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, make_scorer

from sklearnext.over_sampling import SMOTE, GeometricSMOTE #scikit-learn extensions from IMS-ML-Lab
from sklearnext.model_selection import ModelSearchCV
from sklearnext.tools import report_model_search_results

from imblearn.pipeline import Pipeline

from collections import Counter



In [2]:
# Load the semicolon-delimited export.
df = pd.read_csv('tabexport2.csv', delimiter=';')

# Shorten every column name except the first one: remove the
# 'LC08_L1TP_204032_2015' substring, keep the next four characters and append
# the last character of the original name (e.g. '0218_2').
# NOTE(review): presumably '<MMDD>_<band>' - confirm against the export.
new_columns = {
    col: col.replace('LC08_L1TP_204032_2015', '')[:4] + '_' + col[-1:]
    for col in df.columns[1:]
}
df = df.rename(columns=new_columns)

# The profiling report is kept in `report`; it is not displayed in this cell.
report = pandas_profiling.ProfileReport(df)

In [3]:
# Remove the highly correlated variables, i.e. the ones the profiling report
# rejected (alternative: PCA and what else?).
rejected_columns = report.get_rejected_variables()
df2 = df.drop(columns=rejected_columns).copy()
df2.head()


Unnamed: 0,class,0218_2,0218_5,0218_6,0423_2,0423_5,0423_6,0525_2,0525_5,0525_6,0728_2,0728_5,0829_2,0829_5,0930_2,0930_5,0930_6
0,B,579,1875,2468,888,2291,3191,1184,2755,3924,224,4503,259,4162,368,3221,1901
1,B,379,1910,2020,695,3396,2765,513,3262,2363,508,2992,450,1674,387,2129,2249
2,C,460,2688,2034,507,2237,1986,756,2981,2858,1103,3325,631,2570,588,2530,2441
3,D,422,2028,1865,442,2575,2388,444,2703,2391,1195,2598,532,2033,4599,5547,5143
4,C,121,1091,728,370,2155,1755,330,2269,1914,348,2053,327,1839,235,1534,1257


In [4]:
# Select target and features by column name instead of by position, so the
# split does not depend on 'class' being the first column and X is built from
# the feature columns only (not sliced out of a mixed object-dtype array).
y = df2['class'].values
X = df2.drop(columns='class').values

# Stratified, seeded split: the class sizes are very uneven, and a fixed seed
# makes the split identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, stratify=y, random_state=0
)

# Number of samples per class
df2.groupby('class').size()

class
A    131
B    270
C    761
D    296
E    185
F     37
G     10
H      4
dtype: int64

In [5]:
# Baseline: share of the samples that belongs to the most frequent class
class_counts = df2.groupby('class').size()
class_counts.max() / len(df2)

0.449232585596222

In [6]:
def _seed_estimator(estimator, random_state):
    """Set `random_state` on `estimator` if it exposes that parameter, then return it."""
    if (
        estimator is not None
        and random_state is not None
        and 'random_state' in estimator.get_params()
    ):
        estimator.set_params(random_state=random_state)
    return estimator


def _step_param_grid(pipeline_name, step_name, pre_params):
    """Return the grid of `step_name` with keys spelled `<pipeline>__<step>__<param>`."""
    return {
        f'{pipeline_name}__{step_name}__{key}': values
        for key, values in pre_params.get(step_name, {}).items()
    }


def model_search(X, y, random_state=None):
    """
    Cross-validate every oversampler/classifier pairing with ModelSearchCV.

    Function built for convenience purposes: the oversamplers, classifiers and
    parameter grids are defined in the function itself and must be edited
    here if necessary.

    Parameters
    ----------
    X : features, passed unchanged to `ModelSearchCV.fit`.
    y : target labels, passed unchanged to `ModelSearchCV.fit`.
    random_state : int or None, default None
        Seed for the StratifiedKFold shuffle and for every oversampler and
        classifier that has a `random_state` parameter. None leaves all of
        them unseeded.

    Returns
    -------
    The fitted ModelSearchCV object (created with `refit=False`).

    NOTE(review): the result tables in this notebook show `{}` as params for
    every model - confirm that ModelSearchCV expects grid keys of the form
    `<pipeline>__<step>__<param>`.
    """
    oversamplers = [
        ('none', None),
        ('smote', SMOTE()),
        ('gsmote', GeometricSMOTE()),
    ]

    classifiers = [
        ('MLP', MLPClassifier(activation='logistic',
                              solver='lbfgs',
                              alpha=0.01,
                              max_iter=1000000,
                              verbose=True)),
        ('DT', DecisionTreeClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('RF', RandomForestClassifier()),
    ]

    # Estimators without a `random_state` parameter (and the None step) are skipped.
    for _, estimator in oversamplers + classifiers:
        _seed_estimator(estimator, random_state)

    # missing parameters for DT, KNN and RF --> doing it later
    pre_params = {
        # the parameter is spelled `k_neighbors`, exactly as in the gsmote grid below
        'smote': {'k_neighbors': [2, 3, 4, 5]},
        'MLP': {
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            # single hidden layers are written as 1-tuples; `(150)` is just the int 150
            'hidden_layer_sizes': [(64, 64), (150,), (75,), (150, 150)],
            'activation': ['relu', 'tanh', 'logistic'],
        },
        'gsmote': {
            'k_neighbors': [2, 3, 4, 5],
            'deformation_factor': [0.25, 0.50, 0.75],
            'truncation_factor': [-0.5, 0.0, 0.5]
        }
    }

    estimators = []
    param_grids = []
    for oversampler in oversamplers:
        for classifier in classifiers:
            # sets up pipeline with name
            name = f'{oversampler[0]}+{classifier[0]}'
            estimators.append((name, Pipeline([oversampler, classifier])))

            # sets up param grid for the estimator: oversampler keys first,
            # then classifier keys; steps without an entry contribute nothing
            param_grids.append({
                **_step_param_grid(name, oversampler[0], pre_params),
                **_step_param_grid(name, classifier[0], pre_params),
            })

    # No `scoring` is passed, so ModelSearchCV falls back to its default.
    model_search_cv = ModelSearchCV(
        estimators=estimators,
        param_grids=param_grids,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        refit=False,
        n_jobs=-1
    )

    model_search_cv.fit(X, y)

    return model_search_cv


In [7]:
# Run the search on the current X, y; the bare name on the last line displays
# the fitted search object.
model_search_cv = model_search(X, y)
model_search_cv



ModelSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
              error_score='raise',
              estimators=[('none+MLP',
                           Pipeline(memory=None,
                                    steps=[('none', None),
                                           ('MLP',
                                            MLPClassifier(activation='logistic',
                                                          alpha=0.01,
                                                          batch_size='auto',
                                                          beta_1=0.9,
                                                          beta_2=0.999,
                                                          early_stopping=False,
                                                          epsilon=1e-08,
                                                          hidden_layer_sizes=(100,),
                                                          learning_rate='constan

## Why is model_search_cv not including scores for different parameters?

In [9]:
# Tabulate the results of the search stored in `model_search_cv`
report_model_search_results(model_search_cv)

Unnamed: 0,models,params,mean_fit_time,mean_test_score
0,none+DT,{},0.049922,0.455726
1,smote+MLP,{},731.875886,0.40673
2,smote+RF,{},0.319202,0.489374
3,gsmote+DT,{},0.761707,0.403778
4,gsmote+KNN,{},0.628223,0.468123
5,smote+DT,{},0.195622,0.40732
6,none+RF,{},0.072104,0.541322
7,smote+KNN,{},0.075224,0.406139
8,none+KNN,{},0.010239,0.540732
9,gsmote+MLP,{},586.13492,0.397875


## Adopting a different procedure

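Next, try a one-vs-all approach: first separate class `C` from all other classes, then classify the remaining (non-`C`) samples among themselves.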

In [10]:
def remapper(x, y):
    """Return `x` when it equals `y`; any other value becomes the label 'all'."""
    if x != y:
        return 'all'
    return x


# Class sizes with every label other than 'C' renamed to 'all', then summed
# per resulting label.
g = df2.groupby('class').size()
g.index = g.index.map(lambda label: remapper(label, 'C'))
g.groupby('class').sum()

class
C      761
all    933
dtype: int64

In [11]:
# baseline: share of the samples that falls into the 'all' group
binary_counts = g.groupby('class').sum()
binary_counts.loc['all'] / g.sum()

0.5507674144037781

In [12]:
# One-vs-all target: 'C' keeps its label, every other class becomes 'all'.
df3 = df2.copy()
df3['class2'] = df3['class'].map(lambda x: remapper(x, 'C'))

# Feature columns are all columns except the two label columns.
cols = [column for column in df3.columns if column not in ('class', 'class2')]
y = df3['class2'].values
X = df3[cols].values

# Stratified, seeded split so the C/all ratio is kept in both parts and the
# split is identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, stratify=y, random_state=0
)

In [13]:
# Same search, now on the current (one-vs-all) X, y; show the result table.
model_search_cv2 = model_search(X, y)
report_model_search_results(model_search_cv2)

Unnamed: 0,models,params,mean_fit_time,mean_test_score
0,none+DT,{},0.026892,0.661747
1,smote+MLP,{},170.528188,0.735537
2,smote+RF,{},0.088411,0.745573
3,gsmote+DT,{},0.082608,0.675325
4,gsmote+KNN,{},0.059384,0.755018
5,smote+DT,{},0.048869,0.691854
6,none+RF,{},0.057513,0.75856
7,smote+KNN,{},0.02304,0.750295
8,none+KNN,{},0.003589,0.757969
9,gsmote+MLP,{},147.182878,0.730224


In [14]:
# Keep only the samples whose class is not 'C' and go back to the original
# labels (the helper column 'class2' is dropped).
ndf3 = df3.loc[df3['class'] != 'C'].drop(columns=['class2']).copy()
y = ndf3['class'].values
X = ndf3[cols].values

# 0.8 / 0.2 so that every row ends up in either the train or the test part
# (0.8 / 0.1 would leave 10% of the rows in neither); stratified and seeded
# like the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, stratify=y, random_state=0
)

In [15]:
# Baseline for the reduced data: share of the most frequent remaining class
remaining_counts = ndf3.groupby(['class']).size()
remaining_counts.max() / len(ndf3)

0.3172561629153269

In [16]:
# Search on the current X, y (the non-'C' subset when cells run in order).
# Note: this rebinds `model_search_cv2`, replacing the earlier search result.
model_search_cv2 = model_search(X, y)
report_model_search_results(model_search_cv2)



Unnamed: 0,models,params,mean_fit_time,mean_test_score
0,none+DT,{},0.013206,0.401929
1,smote+MLP,{},236.044768,0.30761
2,smote+RF,{},0.098747,0.446945
3,gsmote+DT,{},0.240358,0.365488
4,gsmote+KNN,{},0.207485,0.410504
5,smote+DT,{},0.06908,0.38478
6,none+RF,{},0.049109,0.49089
7,smote+KNN,{},0.039121,0.344051
8,none+KNN,{},0.002969,0.461951
9,gsmote+MLP,{},182.58524,0.287245
