# LUCAS Dataset imbalanced class prediction problem

## Imports

In [None]:
import pandas as pd
import sqlite3
import pandas_profiling
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# preprocessing
from sklearn.preprocessing import StandardScaler

# oversamplers
from sklearnext.over_sampling import SMOTE, GeometricSMOTE, RandomOverSampler

# classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# utils
from sklearnext.model_selection import ModelSearchCV
from sklearnext.tools import report_model_search_results
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold


## Hyperparams

In [None]:
# Ask the IPython inline backend to render figures in 'retina' format.
%config InlineBackend.figure_format = 'retina'

# Forwarded as `correlation_threshold` to pandas_profiling.ProfileReport further down.
correlation_threshold = 0.9

## Reading pre-treated data

In [None]:
# Selects the on-disk source: 'long' reads the `lucas` table from the SQLite
# database, 'wide' reads the CSV export.
data_version = 'wide'

if data_version == 'long':
    conn = sqlite3.connect('../data/interim/remote_sensing_data.db')
    try:
        df = pd.read_sql_query('SELECT * FROM lucas', conn)
    finally:
        # sqlite3 connections are not closed automatically once the query is done
        conn.close()
elif data_version == 'wide':
    df = pd.read_csv('../data/interim/data.csv')
    # Drop the first CSV column and place 'class' last, so that later cells can
    # take "all columns but the last" as features and the last one as target.
    # Filtering 'class' out of the feature list guarantees it appears exactly once.
    # NOTE(review): assumes the first CSV column is either 'class' itself or a
    # column that is meant to be discarded -- confirm against the CSV writer.
    feature_cols = [col for col in df.columns[1:] if col != 'class']
    df = df[feature_cols + ['class']]
else:
    # Fail here rather than with a NameError on `df` in a later cell
    raise ValueError(
        f"Unknown data_version {data_version!r}; expected 'long' or 'wide'."
    )


## General data exploration

In [None]:
# Number of rows for each value of the 'class' column.
df.groupby('class').size()

![class labels](https://ec.europa.eu/eurostat/statistics-explained/images/9/99/LUCAS_-_classification_of_land_cover.png)

In [None]:
# Degree of Dimensionality: number of samples per feature, reported per class
def DoD(sample_size, features):
    """Return the ratio between a sample count and a feature count."""
    return sample_size / features


ft = df.shape[1] - 1  # the "class" column is not a feature, hence the -1
df.groupby('class').size().apply(DoD, args=(ft,))

In [None]:
# Imbalance Ratio: size of the largest class relative to the size of each class
def IR(majority_class, class_label):
    """Return `majority_class` divided by `class_label` (both are class sizes)."""
    return majority_class / class_label


majority_class = df.groupby('class').size().max()
df.groupby('class').size().apply(lambda n_rows: IR(majority_class, n_rows))

In [None]:
# Build a pandas_profiling report over the whole frame, passing the notebook-level
# `correlation_threshold` through; the bare `report` expression displays it in the notebook.
report = pandas_profiling.ProfileReport(df, correlation_threshold=correlation_threshold)
report

In [None]:
def make_corr_table(df, method='spearman', fig_size=(15,15)):
    """
    Plot the lower triangle of the pairwise correlation matrix of `df` as a heatmap.

    `method` is forwarded to `DataFrame.corr`. `fig_size` holds the figure
    width and height in inches. The function draws the figure and returns nothing.
    """
    correlations = df.corr(method=method)
    # mask the diagonal and everything above it so only the lower triangle is drawn
    hidden_cells = np.triu(np.ones_like(correlations))

    with sns.axes_style("white"):
        figure, axis = plt.subplots()
        figure.set_size_inches(fig_size[0], fig_size[1])
        sns.heatmap(
            correlations,
            mask=hidden_cells,
            vmin=-1,
            vmax=1,
            square=True,
            ax=axis,
            cmap='RdBu_r',
        )


make_corr_table(df)

## Preprocessing stage

In [None]:
# See notebook 2.0 if further preprocessing is necessary
sc = StandardScaler()
_X = df.values[:,:-1]
X = sc.fit_transform(_X)
y = df.values[:,-1]






## Modelling

In [None]:
def model_search(X, y, approach='standard'):
    """
    Run a cross-validated search over every oversampler/classifier combination.

    Function built for convenience purposes: the oversamplers, classifiers and
    parameter grids are hard-coded below and must be edited in the function
    itself, if necessary.

    Parameters
    ----------
    X
        Feature matrix, passed unchanged to ``ModelSearchCV.fit``.
    y
        Target labels, passed unchanged to ``ModelSearchCV.fit``.
    approach : {'standard', 'onevsrest', 'onevsone'}
        'standard' searches over the bare oversampler+classifier pipelines;
        'onevsrest' / 'onevsone' wrap each pipeline in ``OneVsRestClassifier`` /
        ``OneVsOneClassifier`` respectively.

    Returns
    -------
    The ``ModelSearchCV`` instance after ``fit(X, y)`` has been called on it.

    Raises
    ------
    ValueError
        If ``approach`` is not one of the supported values.

    Notes
    -----
    ``oversamplers``, ``classifiers``, ``grid``, ``param_grids`` and
    ``estimators`` are deliberately published as module-level globals so that
    other notebook cells can inspect them after a call.
    """
    global oversamplers, classifiers, grid, param_grids, estimators

    # Resolve the multiclass wrapper first so a typo in `approach` fails
    # immediately instead of producing an empty estimator list further down.
    if approach == 'standard':
        wrap = None
    elif approach == 'onevsrest':
        wrap = OneVsRestClassifier
    elif approach == 'onevsone':
        wrap = OneVsOneClassifier
    else:
        raise ValueError(
            f"Unknown approach {approach!r}; "
            "expected 'standard', 'onevsrest' or 'onevsone'."
        )

    configs = {
        'scoring': ['f1_weighted', 'accuracy'],
        'n_splits': 5,
        'n_runs': 3,  # NOTE(review): not consumed anywhere in this function
        'random_state': 0,
        'n_jobs': -1,
        'verbose': 1,
    }

    oversamplers = [
        ('none', None),
        #('RandomOverSampler', RandomOverSampler()),
        #('smote', SMOTE()),
        ('gsmote', GeometricSMOTE()),
    ]

    classifiers = [
        #('GBC', GradientBoostingClassifier()),
        ('DT', DecisionTreeClassifier()),
        ('KNN', KNeighborsClassifier()),
        #('LR', LogisticRegression(solver='lbfgs', penalty='l2', max_iter=1e4)),
    ]

    # Hyper-parameter values per pipeline step, keyed by the step name used above
    grid = {
        #'smote': {'k_neighbors': [2, 3, 4, 5]},
        'gsmote': {
            'k_neighbors': [2, 3, 4, 5],
            #'truncation_factor': [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
            #'deformation_factor': [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
            #'selection_strategy': ['combined', 'minority', 'majority']
        },
        'DT': {'max_depth': [3, 6]},
        'KNN': {'n_neighbors': [3, 4, 5, 6, 7]},
        #'GBC':{
        #    'max_depth': [3, 6],
        #    'n_estimators': [50, 100]
        #}
    }

    param_grids = []
    estimators = []

    for oversampler in oversamplers:
        for classifier in classifiers:

            # sets up pipeline with name
            name = f'{oversampler[0]}+{classifier[0]}'
            pipeline = Pipeline([oversampler, classifier])
            estimators.append((name, pipeline if wrap is None else wrap(pipeline)))

            # When the pipeline is wrapped, its steps are addressed through the
            # wrapper's `estimator` parameter, hence the extra path component.
            if wrap is None:
                key_prefix = f'{name}__'
            else:
                key_prefix = f'{name}__estimator__'

            # sets up param grid for the estimator (oversampler keys first,
            # then classifier keys); steps without a grid entry contribute nothing
            param_grid = {}
            for step_name in (oversampler[0], classifier[0]):
                for key, value in grid.get(step_name, {}).items():
                    param_grid[f'{key_prefix}{step_name}__{key}'] = value
            if param_grid:
                param_grids.append(param_grid)

    model_search_cv = ModelSearchCV(
        estimators=estimators,
        param_grids=param_grids,
        scoring=configs['scoring'],
        # Seed the shuffling so that fold assignment is reproducible across runs
        cv=StratifiedKFold(
            n_splits=configs['n_splits'],
            shuffle=True,
            random_state=configs['random_state'],
        ),
        refit=False,
        n_jobs=configs['n_jobs'],
        verbose=configs['verbose'],
    )

    model_search_cv.fit(X, y)

    return model_search_cv

In [None]:
# `param_grids` is a global assigned inside model_search(); it does not exist until
# model_search() has been called at least once, so run the Experiments cell first.
param_grids



## Experiments

In [None]:

# Run the model search once per multiclass strategy and keep each results table.
approaches = ['standard', 'onevsrest', 'onevsone']
results = {}
for approach in approaches:
    model_search_cv = model_search(X, y, approach=approach)
    results[approach] = report_model_search_results(model_search_cv)

# Tag every results table with the approach that produced it ...
for key, df_pre_results in results.items():
    df_pre_results['method'] = key

# ... and stack them with a single concat. Growing a frame inside a loop from an
# empty seed frame copies the accumulated rows on every iteration and relies on
# pandas' deprecated handling of empty frames when inferring result dtypes.
df_results = pd.concat(list(results.values()))

# Model names have the form '<oversampler>+<classifier>'; split them into two columns.
model_name_parts = df_results['models'].str.split('+')
df_results['oversampler'] = model_name_parts.str[0]
df_results['classifier'] = model_name_parts.str[1]
df_results.shape