In [None]:
from ds_libs import *
from ds_helper import *

## Import Train & Test Sets

In [None]:
# Directory holding the pre-built train / test feature tables.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
# Named `data_dir` (not `dir`) so the builtin `dir()` is not shadowed for the
# rest of the notebook session.
data_dir = "/Users/joshuaighalo/Documents/GitHub/eegDementia/MLOps/dementiaStages/dataframes/"

# Training table plus the two test tables (6-month and 12-month files).
train = pd.read_csv(data_dir + "train_10.csv")
test_6m = pd.read_csv(data_dir + "test_6m_10.csv")
test_12m = pd.read_csv(data_dir + "test_12m_10.csv")

In [None]:
# Column names of the training table as a plain Python list.
features_train = train.columns.tolist()

# Summary statistics of the training table (rendered as the cell output).
train.describe()

In [None]:
# Keep the 'filenames' column of each test table as a plain Python list
# before the column is removed from the feature tables.
filenames_test_6m = test_6m['filenames'].tolist()
filenames_test_12m = test_12m['filenames'].tolist()

In [None]:
# Remove the non-feature 'filenames' column from both test tables.
test_6m = test_6m.drop(columns=['filenames'])
test_12m = test_12m.drop(columns=['filenames'])

## Replace Categorical Classes with Integer Labels

In [None]:
# Map the three class strings to integer codes in a single pass.
# Like the per-label calls it replaces, this applies to every column of `train`.
class_codes = {"ND": 0, "MMD": 1, "SD": 2}
train = train.replace(class_codes)

## Check & Replace Null Values

In [None]:
# Per-column means of the training table (`mean()` skips NaNs by default).
train_means = train.mean()

# Impute every table with the TRAINING means. Filling the test tables with
# their own means would base the test preprocessing on test-set statistics,
# whereas the scalers / PCA / LDA further down are all fitted on train only.
# `fillna` aligns on column names, so entries of `train_means` that have no
# matching test column are simply ignored.
# Reassignment is used instead of `inplace=True` so the cell can be re-run safely.
train = train.fillna(train_means)
test_6m = test_6m.fillna(train_means)
test_12m = test_12m.fillna(train_means)

## Class Counter

In [None]:
# Bar chart of how many training rows fall into each class.
sns.set(style="darkgrid")
class_palette = sns.xkcd_palette(["azure", "light red","light green"])
ax = sns.countplot(data=train, x="Class", palette=class_palette)
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

# Exact per-class row counts (rendered as the cell output).
train['Class'].value_counts()

## Apply SMOTE

In [None]:
# Select the predictor columns by name instead of assuming that 'Class' is the
# last column of the table; when it is last, this is identical to
# `features_train[:-1]`.
feature_cols = [col for col in features_train if col != 'Class']

# Oversample so the classes are balanced; seeded for reproducibility.
sm = SMOTE(random_state=0)
xtrain, ytrain = sm.fit_resample(train[feature_cols], train['Class'])

# Bar chart of the class counts after resampling.
sns.set(style="darkgrid")
resampled_palette = sns.xkcd_palette(["azure", "light red","light green"])
ax = sns.countplot(x=ytrain, palette=resampled_palette)
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

## Separate Features & Targets

In [None]:
# The test tables are used as-is as feature matrices (no target column is
# split off here); these names are aliases, not copies.
xtest_6m = test_6m
xtest_12m = test_12m

## Normalization

In [None]:
# Fit the min-max scaler on the training features only, then apply the
# same fitted transform to the training set and both test sets.
scaler = MinMaxScaler().fit(xtrain)
xtrain_norm = scaler.transform(xtrain)
xtest_6m_norm = scaler.transform(xtest_6m)
xtest_12m_norm = scaler.transform(xtest_12m)

## Standardization

In [None]:
# Fit the standard scaler on the training features only, then apply the
# same fitted transform to the training set and both test sets.
# Note: this rebinds `scaler`, replacing the min-max scaler object.
scaler = StandardScaler().fit(xtrain)
xtrain_std = scaler.transform(xtrain)
xtest_6m_std = scaler.transform(xtest_6m)
xtest_12m_std = scaler.transform(xtest_12m)

## PCA

In [None]:
# Keep as many principal components as needed to explain 99% of the variance
# of the min-max-normalised training features; the test sets are projected
# with the same fitted PCA.
pca = PCA(n_components=0.99,random_state=0)
xtrain_pca = pca.fit_transform(xtrain_norm)
xtest_6m_pca = pca.transform(xtest_6m_norm)
xtest_12m_pca = pca.transform(xtest_12m_norm)
print("No. PCA features train: ", xtrain_pca.shape[1])
print("No. PCA features test_6m: ", xtest_6m_pca.shape[1])
print("No. PCA features test_12m: ", xtest_12m_pca.shape[1])

# A principal component is a linear combination of ALL input features, so the
# index of a component cannot be used to look up an original column name (the
# previous code did that and then discarded the first name). Instead, report
# for each component — already ordered by explained variance — the input
# feature with the largest absolute loading.
feature_names = list(getattr(xtrain, "columns",
                             [col for col in features_train if col != 'Class']))
indices = np.abs(pca.components_).argmax(axis=1)
names = [feature_names[i] for i in indices]
print("PCA features:",*names, sep = ", ")

# Scatter plot of the first two principal components, coloured by class label
fig, ax = plt.subplots(figsize=(20, 7))
ax.scatter(
    xtrain_pca[:, 0],
    xtrain_pca[:, 1],
    c=ytrain,
    cmap='rainbow',
    alpha=0.5,
    edgecolors='b',
)
ax.set(
    xlabel='First Principal Component',
    ylabel='Second Principal Component',
    title='PCA of EEG data',
)
plt.show()

## Linear Discriminant Analysis

In [None]:
# Fit LDA on the standardised training features and project the test sets
# with the same fitted model.
lda = LDA()
xtrain_lda = lda.fit_transform(xtrain_std, ytrain)
xtest_6m_lda = lda.transform(xtest_6m_std)
xtest_12m_lda = lda.transform(xtest_12m_std)
print("No. LDA features train: ", xtrain_lda.shape[1])
print("No. LDA features test_6m: ", xtest_6m_lda.shape[1])
print("No. LDA features test_12m: ", xtest_12m_lda.shape[1])

# Each discriminant is a linear combination of ALL input features, so the index
# of a discriminant cannot be used to look up an original column name (the
# previous code did that and then discarded the first name). Instead, report
# for each discriminant the input feature with the largest absolute scaling
# coefficient.
# NOTE(review): assumes `LDA` is scikit-learn's LinearDiscriminantAnalysis with
# a solver that exposes `scalings_` (rows = input features) — confirm.
feature_names = list(getattr(xtrain, "columns",
                             [col for col in features_train if col != 'Class']))
n_discriminants = xtrain_lda.shape[1]
indices = np.abs(lda.scalings_[:, :n_discriminants]).argmax(axis=0)
names = [feature_names[i] for i in indices]
print("LDA features:",*names, sep = ", ")

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names; use whichever one this install has.
whitegrid_style = ('seaborn-v0_8-whitegrid'
                   if 'seaborn-v0_8-whitegrid' in plt.style.available
                   else 'seaborn-whitegrid')

# Training samples in the plane of the first two linear discriminants,
# one colour per class code.
with plt.style.context(whitegrid_style):
    plt.figure(figsize=(6, 4))
    for lab, col in zip((0, 1, 2),
                        ('blue', 'red', 'green')):
        class_mask = np.asarray(ytrain) == lab
        plt.scatter(xtrain_lda[class_mask, 0],
                    xtrain_lda[class_mask, 1],
                    label=lab,
                    c=col)
    plt.xlabel('Linear Discriminant 1')
    plt.ylabel('Linear Discriminant 2')
    plt.legend(loc='lower right')
    # Title is set before tight_layout so the layout pass accounts for it.
    plt.title('Train')
    plt.tight_layout()
    plt.show()

## Tuning Models

In [None]:
# Candidate values sampled by the randomized hyper-parameter searches below.
param_grid = dict(
    max_depth=list(range(3, 11)),
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    n_estimators=list(range(100, 1001, 100)),
    gamma=[0, 0.25, 0.5, 1.0],
    min_child_weight=[1, 3, 5, 7],
    subsample=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    reg_alpha=[0, 0.25, 0.5, 0.75, 1.0],
    reg_lambda=[0, 0.25, 0.5, 0.75, 1.0],
)

# Three-class XGBoost classifier shared (as a template) by every search.
xgb = XGBClassifier(objective='multi:softmax', num_class=3, random_state=0)

# Keyword arguments forwarded to every RandomizedSearchCV:
# 100 sampled candidates, 3-fold CV, silent, seeded, all cores.
# NOTE(review): the name `args` is rebound later in the notebook for the ERP
# pipeline settings; re-running a search cell after that cell will not work.
args = dict(n_iter=100, cv=3, verbose=0, random_state=0, n_jobs=-1)

In [None]:
# Randomized hyper-parameter search on the raw (unscaled) training features.
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, **args)
random_search.fit(xtrain, ytrain)

# Report the winning configuration, its CV score and the refitted estimator.
for label, value in (
    ("Best parameters: ", random_search.best_params_),
    ("Best score: ", random_search.best_score_),
    ("Best estimator: ", random_search.best_estimator_),
):
    print(label, value)

In [None]:
# Randomized hyper-parameter search on the min-max-normalised training features.
random_search_norm = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, **args)
random_search_norm.fit(xtrain_norm, ytrain)

# Report the winning configuration, its CV score and the refitted estimator.
for label, value in (
    ("Best parameters: ", random_search_norm.best_params_),
    ("Best score: ", random_search_norm.best_score_),
    ("Best estimator: ", random_search_norm.best_estimator_),
):
    print(label, value)

In [None]:
# Randomized hyper-parameter search on the standardised training features.
random_search_std = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, **args)
random_search_std.fit(xtrain_std, ytrain)

# Report the winning configuration, its CV score and the refitted estimator.
for label, value in (
    ("Best parameters: ", random_search_std.best_params_),
    ("Best score: ", random_search_std.best_score_),
    ("Best estimator: ", random_search_std.best_estimator_),
):
    print(label, value)

In [None]:
# Randomized hyper-parameter search on the PCA-projected training features.
random_search_pca = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, **args)
random_search_pca.fit(xtrain_pca, ytrain)

# Report the winning configuration, its CV score and the refitted estimator.
for label, value in (
    ("Best parameters: ", random_search_pca.best_params_),
    ("Best score: ", random_search_pca.best_score_),
    ("Best estimator: ", random_search_pca.best_estimator_),
):
    print(label, value)

In [None]:
# Randomized hyper-parameter search on the LDA-projected training features.
random_search_lda = RandomizedSearchCV(estimator=xgb, param_distributions=param_grid, **args)
random_search_lda.fit(xtrain_lda, ytrain)

# Report the winning configuration, its CV score and the refitted estimator.
for label, value in (
    ("Best parameters: ", random_search_lda.best_params_),
    ("Best score: ", random_search_lda.best_score_),
    ("Best estimator: ", random_search_lda.best_estimator_),
):
    print(label, value)

## Validation 

<b>1. Cross Validation</b>

In [None]:
# Best estimator found by the search on the raw features.
model = random_search.best_estimator_

# Leave-one-out cross-validation; the reported value is the mean accuracy
# over all held-out samples. (The former bare `loocv.get_n_splits(...)` call
# was removed: its return value was discarded, so it had no effect.)
loocv = LeaveOneOut()
scores = cross_val_score(model, xtrain, ytrain, cv=loocv, scoring='accuracy').mean()
print("LOOCV score: ", scores)


In [None]:
# Best estimator found by the search on the min-max-normalised features.
model_norm = random_search_norm.best_estimator_

# Leave-one-out cross-validation; the reported value is the mean accuracy
# over all held-out samples. (The former bare `loocv.get_n_splits(...)` call
# was removed: its return value was discarded, so it had no effect.)
loocv = LeaveOneOut()
scores = cross_val_score(model_norm, xtrain_norm, ytrain, cv=loocv, scoring='accuracy').mean()
print("LOOCV score: ", scores)

In [None]:
# Best estimator found by the search on the standardised features.
model_std = random_search_std.best_estimator_

# Leave-one-out cross-validation; the reported value is the mean accuracy
# over all held-out samples. (The former bare `loocv.get_n_splits(...)` call
# was removed: its return value was discarded, so it had no effect.)
loocv = LeaveOneOut()
scores = cross_val_score(model_std, xtrain_std, ytrain, cv=loocv, scoring='accuracy').mean()
print("LOOCV score: ", scores)

In [None]:
# Best estimator found by the search on the PCA-projected features.
model_pca = random_search_pca.best_estimator_

# Leave-one-out cross-validation; the reported value is the mean accuracy
# over all held-out samples. (The former bare `loocv.get_n_splits(...)` call
# was removed: its return value was discarded, so it had no effect.)
loocv = LeaveOneOut()
scores = cross_val_score(model_pca, xtrain_pca, ytrain, cv=loocv, scoring='accuracy').mean()
print("LOOCV score: ", scores)

In [None]:
# Best estimator found by the search on the LDA-projected features.
model_lda = random_search_lda.best_estimator_

# Leave-one-out cross-validation; the reported value is the mean accuracy
# over all held-out samples. (The former bare `loocv.get_n_splits(...)` call
# was removed: its return value was discarded, so it had no effect.)
loocv = LeaveOneOut()
scores = cross_val_score(model_lda, xtrain_lda, ytrain, cv=loocv, scoring='accuracy').mean()
print("LOOCV score: ", scores)

<b>2. Bootstrapping</b>

In [None]:
# Shared random-forest settings: bootstrap sampling with out-of-bag scoring.
# `random_state=0` is added so the OOB scores are reproducible across runs,
# matching the seeding used for SMOTE, PCA, XGBoost and the searches.
rf_params = dict(n_estimators=100, bootstrap=True, oob_score=True, random_state=0)

# One forest per feature representation.
rf = RandomForestClassifier(**rf_params)
rf_pca = RandomForestClassifier(**rf_params)
rf_norm = RandomForestClassifier(**rf_params)
rf_std = RandomForestClassifier(**rf_params)
rf_lda = RandomForestClassifier(**rf_params)

rf.fit(xtrain, ytrain)
rf_pca.fit(xtrain_pca, ytrain)
rf_norm.fit(xtrain_norm, ytrain)
rf_std.fit(xtrain_std, ytrain)
rf_lda.fit(xtrain_lda, ytrain)

# Out-of-bag score of each forest, printed as the generalization estimate.
print("Generalization Score:",rf.oob_score_)
print("Generalization Score PCA:",rf_pca.oob_score_)
print("Generalization Score Normalized:",rf_norm.oob_score_)
print("Generalization Score Standardized:",rf_std.oob_score_)
print("Generalization Score LDA:",rf_lda.oob_score_)
 

## Test 

In [None]:


# Refit each tuned model on its full training representation, then predict
# the 6-month and 12-month test sets in that same representation.

# Raw features
model.fit(xtrain, ytrain)
ypred_6m, ypred_12m = [model.predict(features) for features in (xtest_6m, xtest_12m)]

# Min-max-normalised features
model_norm.fit(xtrain_norm, ytrain)
ypred_6m_norm, ypred_12m_norm = [
    model_norm.predict(features) for features in (xtest_6m_norm, xtest_12m_norm)
]

# Standardised features
model_std.fit(xtrain_std, ytrain)
ypred_6m_std, ypred_12m_std = [
    model_std.predict(features) for features in (xtest_6m_std, xtest_12m_std)
]

# PCA projection
model_pca.fit(xtrain_pca, ytrain)
ypred_6m_pca, ypred_12m_pca = [
    model_pca.predict(features) for features in (xtest_6m_pca, xtest_12m_pca)
]

# LDA projection
model_lda.fit(xtrain_lda, ytrain)
ypred_6m_lda, ypred_12m_lda = [
    model_lda.predict(features) for features in (xtest_6m_lda, xtest_12m_lda)
]

In [None]:
# Positions of the 6-month test rows predicted as class 0 / 1 / 2
# by the LDA-based model.
indices_0 = np.flatnonzero(ypred_6m_lda == 0)
indices_1 = np.flatnonzero(ypred_6m_lda == 1)
indices_2 = np.flatnonzero(ypred_6m_lda == 2)

# Filenames grouped by predicted class code (0, 1, 2).
names_6m = np.array(filenames_test_6m)
pred_ND_6m = names_6m[indices_0]
pred_MMD_6m = names_6m[indices_1]
pred_SD_6m = names_6m[indices_2]

In [None]:
# Positions of the 12-month test rows predicted as class 0 / 1 / 2
# by the LDA-based model.
indices_0 = np.flatnonzero(ypred_12m_lda == 0)
indices_1 = np.flatnonzero(ypred_12m_lda == 1)
indices_2 = np.flatnonzero(ypred_12m_lda == 2)

# Filenames grouped by predicted class code (0, 1, 2).
names_12m = np.array(filenames_test_12m)
pred_ND_12m = names_12m[indices_0]
pred_MMD_12m = names_12m[indices_1]
pred_SD_12m = names_12m[indices_2]

## Ground Truth: ERP Plots

In [None]:
# Location of the cleaned EEG recordings.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
dataPath = '/Users/joshuaighalo/Downloads/brainNet_datasets/laurel_place/cleaned_dataset/'

# Keyword arguments forwarded to every `pipeline(...)` call below.
# Note: this rebinds `args`, replacing the earlier search settings of the same name.
args = {
    'deviceVersion': 1.0,
    'path': dataPath,
    'sfreq': cfg.fs,
    'line': cfg.line,
    'highPass': 1,
    'lowPass': 10,
    'stimTriggers': cfg.stimTrig,
    'clip': 75,
    'channel_names': ['Fz', 'Cz', 'Pz'],
    'ERPs_GrandAverages': True,
    'erp_plots': True,
}

In [None]:
# Run the ERP pipeline once per predicted class for the 6-month recordings;
# each run is preceded by a banner naming the group.
print('No Dementia | 6 Months | Run 1')
erps6ND_1 = pipeline(filenames=pred_ND_6m, **args)

print('Mild Dementia | 6 Months | Run 1')
erps6MIDMOD_1 = pipeline(filenames=pred_MMD_6m, **args)

print('Severe Dementia | 6 Months | Run 1')
erps6SD_1 = pipeline(filenames=pred_SD_6m, **args)

In [None]:
# Run the ERP pipeline once per predicted class for the 12-month recordings;
# each run is preceded by a banner naming the group.
print('No Dementia | 12 Months | Run 1')
erps12ND_1 = pipeline(filenames=pred_ND_12m, **args)

print('Mild Dementia | 12 Months | Run 1')
erps12MIDMOD_1 = pipeline(filenames=pred_MMD_12m, **args)

print('Severe Dementia | 12 Months | Run 1')
erps12SD_1 = pipeline(filenames=pred_SD_12m, **args)