In [None]:
import pandas as pd
import pandas_profiling
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, make_scorer

from sklearnext.over_sampling import SMOTE, GeometricSMOTE #scikit-learn extensions from IMS-ML-Lab
from sklearnext.model_selection import ModelSearchCV
from sklearnext.tools import report_model_search_results

from imblearn.pipeline import Pipeline

from collections import Counter

In [None]:
# Load the semicolon-separated export.
df = pd.read_csv('tabexport2.csv', sep=';')

# Shorten every column name except the first one: strip the scene prefix, keep
# the first four remaining characters and append the last character of the
# original name after an underscore.
scene_prefix = 'LC08_L1TP_204032_2015'
new_columns = {
    col: f"{col.replace(scene_prefix, '')[:4]}_{col[-1:]}"
    for col in df.columns[1:]
}
df = df.rename(columns=new_columns)

# Profile the renamed frame; the report object is queried in a later cell.
report = pandas_profiling.ProfileReport(df)
#report

In [None]:
# removing highly correlated data (alternative: PCA and what else?)
# The columns to discard are the ones the profiling report marks as rejected.
rejected_columns = report.get_rejected_variables()
df2 = df.drop(columns=rejected_columns).copy()
df2.head()


In [None]:
# Separate the target ('class') from the features by column name instead of by
# position, matching how the later cells build X and y, so the result does not
# depend on column order.
y = df2['class'].values
X = df2.drop(columns=['class']).values

# Fixed seed so the 80/20 hold-out split is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Number of rows per class.
df2.groupby('class').size()

In [None]:
# Baseline: share of rows that belong to the most frequent class.
class_counts = df2.groupby('class').size()
class_counts.max() / len(df2)

In [None]:
def model_search(X, y, random_state=None):
    """
    Cross-validate every oversampler/classifier combination with ModelSearchCV.

    Function built for convenience purposes: the oversamplers, classifiers and
    hyperparameter candidates are defined inside the function itself and must
    be edited here if necessary.

    Every (oversampler, classifier) pair is wrapped in a two-step imblearn
    ``Pipeline`` named ``'<oversampler>+<classifier>'``. Hyperparameter keys
    are built as ``'<pipeline name>__<step name>__<parameter>'``.

    Parameters
    ----------
    X : array-like
        Features, passed unchanged to ``ModelSearchCV.fit``.
    y : array-like
        Targets, passed unchanged to ``ModelSearchCV.fit``.
    random_state : int or None, default=None
        Seed for the shuffled ``StratifiedKFold`` splitter only (the
        estimators themselves are not seeded). ``None`` leaves the folds
        unseeded.

    Returns
    -------
    ModelSearchCV
        The fitted search object (created with ``refit=False``).
    """
    oversamplers = [
        ('none', None),
        ('smote', SMOTE()),
        ('gsmote', GeometricSMOTE()),
    ]

    classifiers = [
        ('MLP', MLPClassifier(activation='logistic',
                              solver='lbfgs',
                              alpha=0.01,
                              max_iter=1000000,
                              verbose=True)),
        ('DT', DecisionTreeClassifier()),
        ('KNN', KNeighborsClassifier()),
        ('RF', RandomForestClassifier()),
    ]

    # Candidate values, keyed by pipeline step name.
    # missing parameters for DT, KNN and RF --> doing it later
    pre_params = {
        # Spelled 'k_neighbors' (not 'k_neighbours'), the same parameter name
        # the 'gsmote' entry below uses.
        'smote': {'k_neighbors': [2, 3, 4, 5]},
        'MLP': {
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            # One-element tuples need the trailing comma: (150) is the int 150.
            'hidden_layer_sizes': [(64, 64), (150,), (75,), (150, 150)],
            'activation': ['relu', 'tanh', 'logistic'],
        },
        'gsmote': {
            'k_neighbors': [2, 3, 4, 5],
            'deformation_factor': [0.25, 0.50, 0.75],
            'truncation_factor': [-0.5, 0.0, 0.5],
        },
    }

    estimators = []
    param_grids = []
    for sampler_name, sampler in oversamplers:
        for clf_name, clf in classifiers:
            # sets up pipeline with name
            name = f'{sampler_name}+{clf_name}'
            pipeline = Pipeline([(sampler_name, sampler), (clf_name, clf)])
            estimators.append((name, pipeline))

            # sets up param grid for the estimator: oversampler parameters
            # first, then classifier parameters. Steps without an entry in
            # pre_params contribute nothing, so the grid may be empty.
            param_grids.append({
                f'{name}__{step_name}__{key}': values
                for step_name in (sampler_name, clf_name)
                for key, values in pre_params.get(step_name, {}).items()
            })

    # Custom multi-metric scoring is currently disabled, so ModelSearchCV is
    # created without a `scoring` argument.
    #auc = make_scorer(roc_auc_score,  greater_is_better=True, average='micro')
    #acc = make_scorer(accuracy_score, greater_is_better=True, average='micro')
    #rec = make_scorer(recall_score,   greater_is_better=True, average='micro')

    model_search_cv = ModelSearchCV(
        estimators=estimators,
        param_grids=param_grids,
    #    scoring=[acc, rec, auc],
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state),
        refit=False,
        n_jobs=-1
    )

    model_search_cv.fit(X, y)

    return model_search_cv


In [None]:
# Run the full search on the X, y currently in scope and show the fitted object.
model_search_cv = model_search(X, y)
model_search_cv

## Why is model_search_cv not including scores for different parameters?

In [None]:
# Summarise the search results; the bare name on the last line displays them.
search_report = report_model_search_results(model_search_cv)
search_report
#model_search_cv.best_score_

## Adopting a different procedure

Next, try a one-vs-all approach: class 'C' against all other classes combined.

In [None]:
def remapper(x, y):
    """Return `x` unchanged when it equals `y`; otherwise return 'all'."""
    return x if x == y else 'all'


# Per-class row counts, with every class other than 'C' relabelled as 'all'
# and the relabelled counts summed.
g = df2.groupby('class').size()
g.index = g.index.map(lambda label: remapper(label, 'C'))
g.groupby('class').sum()

In [None]:
# baseline: share of rows that fall in the merged 'all' group
merged_counts = g.groupby('class').sum()
merged_counts.loc['all'] / g.sum()

In [None]:
# One-vs-all target: keep 'C' and relabel every other class as 'all'.
df3 = df2.copy()
df3['class2'] = df3['class'].map(lambda label: remapper(label, 'C'))

# Feature columns are everything except the two label columns.
cols = [col for col in df3.columns if col not in ('class', 'class2')]
y = df3['class2'].values
X = df3[cols].values

# Fixed seed so the 80/20 hold-out split is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [None]:
# Repeat the search on the X, y currently in scope and summarise the results.
model_search_cv2 = model_search(X, y)
report_model_search_results(model_search_cv2)

In [None]:
# Keep only the rows whose class is not 'C' and drop the one-vs-all label.
ndf3 = df3.loc[df3['class'] != 'C'].drop(columns=['class2']).copy()
y = ndf3['class'].values
X = ndf3[cols].values

# test_size is 0.2 (was 0.1): with train_size=0.8 the previous value left 10%
# of the rows in neither partition, unlike the other splits in this notebook.
# Fixed seed so the split is reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [None]:
# Baseline for the subset: share of rows that belong to the most frequent class.
ndf3['class'].value_counts().max() / len(ndf3)

In [None]:
# Repeat the search on the X, y currently in scope and summarise the results.
# Note that this rebinds the name model_search_cv2.
model_search_cv2 = model_search(X, y)
report_model_search_results(model_search_cv2)