In [None]:
# imports
from sklearn.model_selection import train_test_split
from moduleDatabase import DatabaseMethods
from moduleUtilities import UtilityMethods
from modulePreProcessing import TransformationMethods, FeatureMethods
from moduleModelTraining import TrainingMethods
from moduleMetrics import MetricsMethods

In [None]:
# class instances
# One helper object per project module; the short names are used by every later cell.
# NOTE(review): `tf` is the conventional alias for TensorFlow — harmless here, but easy to misread.
db = DatabaseMethods()  # data access: fetch, separate_target_class
ut = UtilityMethods()  # inspection helper: inspect_target_distribution
tf = TransformationMethods()  # handle_outliers, use_min_max
fm = FeatureMethods()  # discretization, variance filter, feature inspection
train = TrainingMethods()  # train_models
evaluate = MetricsMethods()  # generate_metrics

In [None]:
# connect to db and fetch data
# Selects every column of `v7` (referred to as a view in a later comment).
# The result is used as a pandas DataFrame by the rest of the notebook
# (presumably what db.fetch returns — confirm).
df = db.fetch("SELECT * FROM v7")
# Last expression of the cell: the first rows are rendered as the cell output.
df.head()

In [None]:
# inspect the distribution of potential target classes
# Done before a target is chosen in the next cell, so all candidate target columns are still in df.
ut.inspect_target_distribution(df)

In [None]:
# define target class (14 available, 6 moca, 6 mmse, 2 diffs)
# `moca_pre_binary_binned` is the chosen label. Later cells read the label from a
# column named `target_class` (presumably created by separate_target_class — confirm).
df = db.separate_target_class(df, "moca_pre_binary_binned")

In [None]:
# handling outliers. ignoring outliers for user data.
# The whole process should avoid outliers in the first place.
# df is rebound to the returned frame, so re-running this cell applies the handling again.
df = tf.handle_outliers(df)

In [None]:
# Encoding of categorical features to numerical values.
# This encoding is not needed here, because the view already returns the category ids.
# If the Service is implemented, every transformation should be done in Python instead.

In [None]:
# Discretize 1) to calculate variance in same scale 2) because I need to convert any feature of float type to int early.
# Each column is listed exactly once ('avg_gr_time_win_gr_in_gs' used to appear twice).
columns_to_discretize = ['age', 'avg_gr_time_win_gr_in_gs', 'avg_gr_time_in_gs',
                         'total_win_gr_points_in_gs', 'total_gr_in_gs', 'total_success_rounds_in_session']
# NOTE(review): the return value is not assigned, unlike the other fm/tf calls —
# this assumes discretize_features modifies df in place; confirm.
fm.discretize_features(df, columns_to_discretize)

In [None]:
# remove low variance features. VarX = p(1-p). Where p is the probability of a value of a feature.
# The threshold is that formula evaluated at p = 0.8, i.e. about 0.16.
# ddof_val=1 is presumably the delta degrees of freedom used for the variance — confirm.
df = fm.remove_low_variance_features(df, (.8 * (1 - .8)), ddof_val=1)

In [None]:
# scaling
# TODO try to use StandardScaler for user data.
# Identifier, timestamp and label columns are passed separately, presumably so that
# use_min_max leaves them unscaled — confirm.
columnsToIgnore = ['userId', 'gsId', 'gsStartTime', 'target_class']
df = tf.use_min_max(df, columnsToIgnore)

In [None]:
# feature correlation inspection. Select features correlated to target class not between them.
# `fs` holds the 14 candidate feature columns plus `target_class`, so the label is
# part of the inspected columns.
fs = ['age', 'education','laptop_usage', 'smartphone_usage', 'family_med_history', 'exercising', 'marital_status_1',
      'marital_status_3', 'hypertension', 'total_gr_in_gs', 'total_success_rounds_in_session', 'total_win_gr_points_in_gs', 
      'avg_gr_time_in_gs', 'avg_gr_time_win_gr_in_gs', 'target_class']
fm.correlation_inspection(df, fs)

In [None]:
# Feature groups reused by the inspection cells below.

# Columns measured per game session.
session_features = [
    'total_gr_in_gs',
    'total_success_rounds_in_session',
    'total_win_gr_points_in_gs',
    'avg_gr_time_in_gs',
    'avg_gr_time_win_gr_in_gs',
]

# Every candidate feature: the non-session columns followed by the session columns.
non_session_features = [
    'age', 'education', 'laptop_usage', 'smartphone_usage', 'family_med_history',
    'exercising', 'marital_status_1', 'marital_status_3', 'hypertension',
]
all_features = non_session_features + session_features

# cl1..cl3 split all_features into three disjoint groups
# (presumably clusters read off the correlation inspection — confirm).
cl1_features = [
    'laptop_usage', 'age', 'avg_gr_time_win_gr_in_gs', 'education', 'avg_gr_time_in_gs',
]
cl2_features = [
    'marital_status_3', 'family_med_history',
]
cl3_features = [
    'exercising', 'smartphone_usage', 'total_win_gr_points_in_gs',
    'total_success_rounds_in_session', 'total_gr_in_gs', 'marital_status_1', 'hypertension',
]

In [None]:
# feature importance inspection using a classifier
# MDI feature importance and feature values permutation importance
# The same inspection is run once per feature group: all candidates, the three
# cl* groups, and the session-only columns.

fm.inspection_using_classifier(df, all_features)
fm.inspection_using_classifier(df, cl1_features)
fm.inspection_using_classifier(df, cl2_features)
fm.inspection_using_classifier(df, cl3_features)
fm.inspection_using_classifier(df, session_features)

In [None]:
# Feature importance inspection using Univariate Feature Selection
# More specifically, ANOVA and some traditional Regressors
# TODO check also for the target as continuous
# Run on the same five feature groups as the classifier-based inspection.

fm.inspection_using_regressors(df, all_features)
fm.inspection_using_regressors(df, cl1_features)
fm.inspection_using_regressors(df, cl2_features)
fm.inspection_using_regressors(df, cl3_features)
fm.inspection_using_regressors(df, session_features)

In [None]:
# Settings shared by every split / metrics cell below:
#   test_size     -> fraction of samples held out by each train_test_split call
#   cross_val_num -> value passed as cv_num to evaluate.generate_metrics
test_size, cross_val_num = 0.25, 3

In [None]:
# split dataframe samples for the training and evaluation process
# Only the four selected features go into the model; the label is the `target_class` column.
selected_features = ['education', 'avg_gr_time_in_gs',
                     'family_med_history',
                     'total_gr_in_gs']
x = df[selected_features]
y = df['target_class']
# Fixed random_state keeps the split reproducible across runs.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (SMOTE is applied below) — consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=7, test_size=test_size)
print('x:',x.shape,' y:',y.shape)
# The last value is the shape of y_test (y_train.shape used to be printed twice).
print('x_train:',x_train.shape,' x_test:',x_test.shape,' y_train:',y_train.shape,'y_test:',y_test.shape)

In [None]:
# apply oversampling
from imblearn.over_sampling import SMOTE

# Class counts of the training split before resampling (label 2 = NC, label 1 = AD-MCI).
print("Before SMOTE OverSampling. NC: {}".format(sum(y_train==2)))
print("Before SMOTE OverSampling. AD-MCI: {} \n".format(sum(y_train==1)))

smote_tf = SMOTE(random_state=2)
# fit_resample is the current name of the resampling method; the legacy fit_sample
# alias is no longer available in recent imbalanced-learn releases.
# Labels are passed as a 1-D numpy array (Series.ravel() is deprecated in pandas),
# so smote_y is an ndarray and can be used as a boolean mask on numpy arrays later.
smote_x, smote_y = smote_tf.fit_resample(x_train, y_train.to_numpy())
# NOTE(review): this second split is taken from the oversampled data, so smote_test_*
# can contain synthetic samples; x_test / y_test remain the untouched hold-out set.
smote_train_x, smote_test_x, smote_train_y, smote_test_y = train_test_split(smote_x, smote_y, random_state=7, test_size=test_size)

print('smote_x:',smote_x.shape,' smote_y:',smote_y.shape)
print('smote_train_x:',smote_train_x.shape,' smote_test_x:',smote_test_x.shape,' smote_train_y:',smote_train_y.shape,' smote_test_y:',smote_test_y.shape, '\n')

# Class counts after resampling (these lines used to be labelled "Before").
print("After SMOTE OverSampling. NC: {}".format(sum(smote_y==2)))
print("After SMOTE OverSampling. AD-MCI: {} \n".format(sum(smote_y==1)))

In [None]:
# train models before oversampling
# Uses the original (non-resampled) split. The test split is passed in as well;
# how train_models uses it is not visible from this notebook.
trained_models = train.train_models(x_train, y_train, x_test, y_test)

In [None]:
# evaluate models before oversampling
# Reports accuracy, precision, recall and f1 for the models trained on the original
# split, evaluated against the untouched x_test / y_test hold-out.
evaluate.generate_metrics(trained_models, x_test, y_test, ['accuracy', 'precision', 'recall', 'f1'], 
                          cv_num=cross_val_num, show_raw_data=False)

In [None]:
# train models after oversampling
# Same training call as before, but on the split taken from the SMOTE-resampled data.
trained_models_smote = train.train_models(smote_train_x, smote_train_y, smote_test_x, smote_test_y)

In [None]:
# evaluate models after sampling
# Same metrics as the pre-oversampling evaluation, computed on smote_test_x / smote_test_y
# (the test part of the resampled data, not the original x_test / y_test).
evaluate.generate_metrics(trained_models_smote, smote_test_x, smote_test_y, ['accuracy', 'precision', 'recall', 'f1'], 
                          cv_num=cross_val_num, show_raw_data=False)

In [None]:
# apply dimensionality reduction using PCA
# example https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_lda.html
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Fit on the oversampled features, then project them onto the first two components.
pca = PCA(n_components=2)
pca.fit(smote_x)
pca_x = pca.transform(smote_x)
print('pca_x:',pca_x.shape)

# Share of the total variance captured by each of the two components
print('explained variance ratio (first two components): %s' % str(pca.explained_variance_ratio_))

# Scatter plot of the projected samples, one series per class label (1 = AD-MCI, 2 = NC).
plt.figure()
colors = ['red', 'blue']
lw = 2
for i, target_name, color in [(1, 'AD-MCI', colors[0]), (2, 'NC', colors[1])]:
    class_mask = smote_y == i
    plt.scatter(pca_x[class_mask, 0], pca_x[class_mask, 1],
                color=color, alpha=.8, lw=lw, label=target_name)

plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of data set')
plt.show();

In [None]:
# split of pca_x and smote_y to train test data set
# Same random_state and test_size as the SMOTE split, applied to the PCA-projected features.
pca_train_x, pca_test_x, pca_train_y, pca_test_y = train_test_split(pca_x, smote_y, random_state=7, test_size=test_size)

In [None]:
# train and evaluate models using the 2 components from PCA
# All four arguments come from the PCA split, so the training and test matrices both
# have the 2 PCA components and the labels belong to the same split. The 4-feature
# smote_test_x does not match models fitted on pca_train_x.
trained_models_using_pca_comp = train.train_models(pca_train_x, pca_train_y, pca_test_x, pca_test_y)
evaluate.generate_metrics(trained_models_using_pca_comp, pca_test_x, pca_test_y,
                          ['accuracy', 'precision', 'recall', 'f1'], cv_num=cross_val_num, show_raw_data=False)

In [None]:
# apply dimensionality reduction using LDA

# creating two components separately due to the known restriction: n_components cannot be larger than min(n_features, n_classes - 1)
import numpy as np  # needed for np.concatenate below; not imported anywhere else in the notebook
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=1)

# Column order of smote_x (same as selected_features):
#   0 education, 1 avg_gr_time_in_gs, 2 family_med_history, 3 total_gr_in_gs
# Selecting (avg_gr_time_in_gs, family_med_history) and (education, total_gr_in_gs) based on Spearman correlation coefficient
# NOTE(review): the positional .iloc indices silently pick other columns if selected_features changes.

# First component: education + total_gr_in_gs.
lda_x_one = lda.fit_transform(smote_x.iloc[:, [0,3]], smote_y)

# Second component: avg_gr_time_in_gs + family_med_history (the same estimator object is re-fitted).
lda_x_two = lda.fit_transform(smote_x.iloc[:, [1,2]], smote_y)

# Put the two single-column projections side by side -> one row per sample, two columns.
lda_x = np.concatenate((lda_x_one, lda_x_two),axis=1)

# Scatter plot of the two LDA components, one series per class label (1 = AD-MCI, 2 = NC).
# `plt` comes from the matplotlib import in the PCA cell.
plt.figure()
colors = ['red', 'blue']
lw = 2
for color, i, target_name in zip(colors, [1, 2], ['AD-MCI','NC']):
    plt.scatter(lda_x[smote_y == i, 0], lda_x[smote_y == i, 1], color=color, alpha=.8, lw=lw, label=target_name)

plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('LDA of data set')
plt.show();

In [None]:
# split of lda_x and smote_y to train test data set
# Same random_state and test_size as the SMOTE and PCA splits, applied to the LDA-projected features.
lda_train_x, lda_test_x, lda_train_y, lda_test_y = train_test_split(lda_x, smote_y, random_state=7, test_size=test_size)

In [None]:
# train and evaluate models using the 2 components from LDA
# All four arguments come from the LDA split, so the training and test matrices both
# have the 2 LDA components and the labels belong to the same split. The 4-feature
# smote_test_x does not match models fitted on lda_train_x.
trained_models_using_lda = train.train_models(lda_train_x, lda_train_y, lda_test_x, lda_test_y)
evaluate.generate_metrics(trained_models_using_lda, lda_test_x, lda_test_y,
                          ['accuracy', 'precision', 'recall', 'f1'], cv_num=cross_val_num, show_raw_data=False)