In [1]:
# imports
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from joblib import dump
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import RandomForestClassifier  # duplicate of the import above, kept as-is
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier

from moduleLoading import LoadingMethods
from modulePreProcessing import ScalingMethods, FeatureMethods, boxplot_features
from moduleModelTraining import TrainingMethods
from moduleMetrics import MetricsMethods

In [2]:
# open Jupyter QtConsole
# Development aid only: no later cell in this notebook reads anything produced
# here.
# NOTE(review): this line needs the qtconsole package and a desktop session, so
# it will presumably fail under a headless "Restart & Run All" — confirm whether
# it should stay in the shared version of the notebook.
%qtconsole

In [3]:
# class instances
# One instance of each project helper class, created in a single statement.
# The constructors still run in the same left-to-right order as before
# (loading, scaling, feature methods, training, metrics).
load, scale, fm, train, evaluate = (
    LoadingMethods(),
    ScalingMethods(),
    FeatureMethods(),
    TrainingMethods(),
    MetricsMethods(),
)

IndentationError: unexpected indent (<ipython-input-3-9ca66c7bcfe3>, line 4)

In [None]:
# final features
# Accumulator for the names of the columns picked by the feature-selection
# cells; it starts out empty and is later used to index df for training.
ff = list()

In [None]:
# connect to db and fetch data
# Connection settings can be overridden through environment variables so real
# credentials never have to be committed with the notebook. The fallbacks are
# the values this cell used to hard-code, so an unconfigured local run behaves
# exactly as before. Positional argument order is unchanged.
# NOTE(review): the db_* names assume the order host, database, user, password —
# confirm against LoadingMethods.connect_and_fetch.
db_host = os.environ.get("MCI_DB_HOST", "127.0.0.1")
db_name = os.environ.get("MCI_DB_NAME", "mci_db")
db_user = os.environ.get("MCI_DB_USER", "root")
# TODO: drop this fallback once MCI_DB_PASSWORD is set in every environment.
db_password = os.environ.get("MCI_DB_PASSWORD", "toor")

df = load.connect_and_fetch(db_host, db_name, db_user, db_password, "SELECT * FROM v7")
df.head()

In [None]:
# define target class (14 available, 6 moca, 6 mmse, 2 diffs)
# target_class is the name of the column passed to load.separate_target_class
# as the label for this run; df is replaced by that helper's return value.
# NOTE(review): later cells read the label from a column literally named
# 'target_class', so the helper presumably renames/moves the chosen column and
# drops the other candidates — confirm in moduleLoading.
target_class = "moca_pre_binary_binned"
df = load.separate_target_class(df, target_class)
df.head()

In [None]:
# discretize
# Bin 'age' into 4 quantile-based buckets, encoded as ordinal bin ids.
# TODO (from the original note): check whether the totals and averages columns
# should be discretized as well.
discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
# fit_transform returns a 2-D (n_samples, 1) array; flatten it so the column
# assignment does not rely on pandas accepting a 2-D value for a single column.
df['age'] = discretizer.fit_transform(df[['age']]).ravel()
df.head()

In [None]:
# Encoding of categorical features to numerical values:
# no explicit encoding step is needed here,
# since their ids are already fetched from the database view.

In [None]:
# remove low variance features
# The cut-off is written as p * (1 - p) with p = 0.8 and passed as the second
# argument of the helper.
# NOTE(review): presumably the Bernoulli-variance threshold from scikit-learn's
# VarianceThreshold guide — confirm how remove_low_variance_features uses it.
variance_threshold = .8 * (1 - .8)
df = fm.remove_low_variance_features(df, variance_threshold)

In [None]:
# handle outliers
# df is replaced by whatever scale.handle_outliers returns.
# NOTE(review): the outlier strategy (clip / drop / impute) lives in
# modulePreProcessing and is not visible here — confirm before relying on the
# row count or value ranges after this cell.
df = scale.handle_outliers(df)

In [None]:
# scaling
# columnsToIgnore is handed to scale.use_min_max together with df.
# NOTE(review): presumably these columns (session id, start time, label) are
# left unscaled by the helper — confirm in modulePreProcessing.
columnsToIgnore = ['gsId', 'gsStartTime', 'target_class']
df = scale.use_min_max(df, columnsToIgnore)

In [None]:
# feature importance inspection using a classifier
# MDI feature importance and feature values permutation importance

# user related independent variables
user_features = [
    'age', 'education', 'laptop_usage', 'smartphone_usage',
    'family_med_history', 'exercising', 'marital_status_1',
    'marital_status_3', 'hypertension',
]

# session data related independent variables
session_features = [
    'total_gr_in_gs', 'total_success_rounds_in_session',
    'total_win_gr_points_in_gs', 'avg_gr_time_in_gs',
    'avg_gr_time_win_gr_in_gs',
]

# first inspection: user related group
features_to_inspect = user_features
fm.inspection_using_classifier(df, features_to_inspect)

# second inspection: session related group; it stays the last expression of
# the cell, and features_to_inspect ends up bound to the session list as before
features_to_inspect = session_features
fm.inspection_using_classifier(df, features_to_inspect)

In [None]:
# feature correlation inspection
# Hierarchical clustering of the features on their Spearman rank correlations:
# left panel = dendrogram, right panel = correlation heatmap with rows/columns
# reordered to follow the dendrogram leaves.

# example
# https://scikit-learn.org/stable/auto_examples/inspection/plot_permutation_importance_multicollinear.html#handling-multicollinear-features
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
X = df[['age', 'education','laptop_usage', 'smartphone_usage', 
        'family_med_history', 'exercising', 'marital_status_1', 
        'marital_status_3', 'hypertension', 
        'total_gr_in_gs', 'total_success_rounds_in_session', 
        'total_win_gr_points_in_gs', 'avg_gr_time_in_gs', 
        'avg_gr_time_win_gr_in_gs']]
corr = spearmanr(X).correlation

# Force exact symmetry and a unit diagonal so the matrix converts cleanly to a
# distance matrix (floating-point noise can otherwise break squareform's checks).
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# ward() interprets a 2-D input as raw observation vectors, so passing the
# correlation matrix directly clusters its rows as if they were samples.
# Convert correlation to a distance (1 - |rho|) and pass it in condensed form.
distance_matrix = 1 - np.abs(corr)
corr_linkage = hierarchy.ward(squareform(distance_matrix))

feature_names = X.columns.tolist()
dendro = hierarchy.dendrogram(corr_linkage, labels=feature_names, ax=ax1, leaf_rotation=0, orientation='right')
dendro_idx = np.arange(0, len(dendro['ivl']))
ax1.set_title('Feature clustering (Ward linkage)')

# reorder rows and columns of the correlation matrix to the dendrogram order
ax2.imshow(corr[dendro['leaves'], :][:, dendro['leaves']])
ax2.set_xticks(dendro_idx)
ax2.set_yticks(dendro_idx)
ax2.set_xticklabels(dendro['ivl'], rotation='vertical')
ax2.set_yticklabels(dendro['ivl'])
ax2.set_title('Spearman correlation')
fig.tight_layout()
plt.show()

In [None]:
# https://nbviewer.jupyter.org/github/justmarkham/scikit-learn-tips/blob/master/notebooks/23_linear_model_coefficients.ipynb
#https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py

# plt.figure(1)
# plt.clf()
# X_indices = np.arange(X.shape[-1])
# selector = SelectKBest(f_classif, k=4)
# selector.fit(X_train, y_train)
# scores = -np.log10(selector.pvalues_)
# scores /= scores.max()
# plt.bar(X_indices - .45, scores, width=.2,
#         label=r'Univariate score ($-Log(p_{value})$)')
# plt.title("Comparing feature selection")
# plt.xlabel('Feature number')
# plt.yticks(())
# plt.axis('tight')
# plt.legend(loc='upper right')
# plt.show()

In [None]:
# feature selection for user demographics, medical profile, technology familiarity related features

# independent variables
X = df[['age', 'education','laptop_usage', 'smartphone_usage', 
         'family_med_history', 'exercising', 'marital_status_1', 
         'marital_status_3', 'hypertension']]

# target class
y = df['target_class']

# feature selection: keep the 2 columns of X with the highest chi2 score
# NOTE(review): the selector is fitted on the full dataset, i.e. before the
# train/test split done in the training cell — confirm this is acceptable.
# TODO (from the original notes): check for coefficients exposed by the selector.
selector = SelectKBest(chi2, k=2)
selector.fit(X.values, y.values)

# positions of the selected columns within X, e.g. array([3, 4])
selectedFeaturesIndices = selector.get_support(indices=True)

# the same selection as a plain list of column names, e.g. ['age', 'education']
selectedFeatures = X.columns[selectedFeaturesIndices].tolist()
print(selectedFeatures)

# Add to the final feature list only what is not already there, so re-running
# this cell does not append the same feature names a second time.
ff += [name for name in selectedFeatures if name not in ff]

In [None]:
# feature selection for MCI Rehab recorded game performance related features

# independent variables
X = df[['total_gr_in_gs', 'total_success_rounds_in_session', 
        'total_win_gr_points_in_gs', 'avg_gr_time_in_gs', 
        'avg_gr_time_win_gr_in_gs']]

# target class
y = df['target_class']

# feature selection: keep the 2 columns of X with the highest chi2 score
# NOTE(review): the selector is fitted on the full dataset, i.e. before the
# train/test split done in the training cell — confirm this is acceptable.
selector = SelectKBest(chi2, k=2)
selector.fit(X.values, y.values)

# positions of the selected columns within X, e.g. array([3, 4])
selectedFeaturesIndices = selector.get_support(indices=True)

# the same selection as a plain list of column names,
# e.g. ['avg_gr_time_in_gs', 'avg_gr_time_win_gr_in_gs']
selectedFeatures = X.columns[selectedFeaturesIndices].tolist()
print(selectedFeatures)

# Add to the final feature list only what is not already there, so re-running
# this cell does not append the same feature names a second time.
ff += [name for name in selectedFeatures if name not in ff]

In [None]:
# train models

# Guard against duplicated names in ff (e.g. after re-running a selection cell):
# keep the first occurrence of each feature, in order.
feature_columns = list(dict.fromkeys(ff))

# With shuffle=False the first 70% of the rows train and the last 30% test.
# Either shuffle=True, random_state=7 Or stratify=y
# NOTE(review): an unshuffled, unstratified split depends on the row order
# returned by the database view — confirm this is the intended hold-out.
X_train, X_test, y_train, y_test = train_test_split(df[feature_columns], y, test_size=0.3, shuffle=False)

# Every estimator gets the same fixed seed so repeated runs produce the same
# models; the decision tree previously had none and could differ between runs.
clfs = {
    'lr': LogisticRegression(random_state=7),
    'dt': DecisionTreeClassifier(max_depth=4, random_state=7),
    'rf': RandomForestClassifier(max_depth=4, random_state=7)
}

# soft-voting ensemble over the three base classifiers
custom_ensemble = VotingClassifier([('clf1', clfs['lr']),
                                    ('clf2', clfs['dt']),
                                    ('clf3', clfs['rf'])], voting='soft')
clfs['ce'] = custom_ensemble

trained_models = train.train_models(clfs, X_train, y_train)

In [None]:
# evaluate models
# Metrics requested for every trained model on the held-out split.
metric_names = ['accuracy', 'precision', 'recall', 'f1']
evaluate.generate_metrics(
    trained_models,
    X_test,
    y_test,
    metric_names,
    show_raw_data=False,
)