In [None]:
import pandas as pd
from joblib import load
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import GroupKFold
from src.features.modified_feature_selector import FeatureSelector


In [19]:
# Load the fitted grid-search object from disk.
# joblib.load unpickles arbitrary Python objects, so only load files you trust.
# NOTE(review): the file name says `rf_gridsearch`, yet later cells read
# logistic-regression parameters (`param_clf__penalty`, `param_clf__C`) from
# this object's results -- confirm this is the intended artifact.
# grid_results = pd.read_hdf('models/logreg_gridsearch_results_v2.h5')
grid_model = load("models/rf_gridsearch_ach-at-hex_6000_4_4_.joblib")

In [20]:
# Tabulate the cross-validation results and pick out the row that the search
# itself marked as best, then show that row's mean test accuracy.
grid_results = pd.DataFrame(grid_model.cv_results_)
best = grid_results.iloc[grid_model.best_index_]
best['mean_test_Accuracy']

0.5345679012345679

In [None]:
# Average the F1-score over splits 0-4 only (the splits on subjects where AT
# was applied) and store it as an extra column of the results table.
is_at_split = grid_results.columns.str.match('split[0-4]_test_F1-score')
at_columns = grid_results.columns[is_at_split]
grid_results['mean_AT_F1-score'] = grid_results.loc[:, at_columns].mean(axis=1)

In [None]:
# Visualise the best penalty: mean F1-score against C on a logarithmic x-axis,
# with one line per penalty type.
fg = sns.FacetGrid(
    data=grid_results,
    hue='param_clf__penalty',
    aspect=1.61,
    height=5,
)
fg.map(plt.semilogx, 'param_clf__C', 'mean_test_F1-score')
fg.add_legend(title='Penalty')
fg.set(
    xlabel='Inverse Regularisation Strength (C)',
    ylabel='Mean F1-Score',
    title='Tuning the Logistic Regression Model (n_quantiles = 10)',
)
# fg.savefig('reports/figures/logreg_grid_penalty_qt_10_AUC')

In [None]:
# Show the five parameter settings with the highest mean F1 on the AT splits.
grid_results.sort_values(by='mean_AT_F1-score', ascending=False).head(5)

In [None]:
# Refit the model using the best params (one fit for l2 and one for l1).
# This cell loads the data, labels every row with a subject id, and holds one
# subject out as the test set.
X = pd.read_hdf('data/features/filtered/filtered_0.05_.h5')
y = pd.read_hdf('data/ach_at_combined_y.h5', key='y')

# Row-label boundaries between consecutive subjects: five blocks of 90 labels
# (0..450) followed by six blocks of 60 labels (450..810).
# `.loc` label slices include BOTH endpoints, so each boundary row is written
# twice and ends up with the later subject's id; the final slice also covers
# label 810. These are the same ranges the original loops wrote.
# NOTE(review): assumes X carries a default 0..n-1 integer index -- confirm.
subject_edges = [90 * k for k in range(6)] + [450 + 60 * k for k in range(1, 7)]
for subject_id, (first, last) in enumerate(zip(subject_edges[:-1], subject_edges[1:]), start=1):
    X.loc[first:last, 'subject'] = subject_id
assert X['subject'].notna().all()

# USE SUBJECT 4 AS A TEST
held_out_subject = 4
is_test = (X['subject'] == held_out_subject).to_numpy()
# Integer row positions, so the `.iloc` calls below are correct whatever
# labels the index carries.
train = (~is_test).nonzero()[0]
test = is_test.nonzero()[0]

# 'subject' is only a grouping label; drop it so the subject id is not fed to
# the model as a feature.
features = X.drop(columns='subject')
X_train, X_test = features.iloc[train, :], features.iloc[test, :]
y_train, y_test = y.iloc[train], y.iloc[test]

In [None]:
# L1-penalised logistic regression behind a quantile transform.
# The hyper-parameters are spelled out in full; presumably they were copied
# from the grid search's best estimator -- verify before reuse.
quantile_step = QuantileTransformer(
    copy=True,
    ignore_implicit_zeros=False,
    n_quantiles=10,
    output_distribution='normal',
    random_state=None,
    subsample=100000,
)
l1_logreg_step = LogisticRegression(
    C=0.46415888336127775,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    l1_ratio=None,
    max_iter=10000,
    multi_class='auto',
    n_jobs=None,
    penalty='l1',
    random_state=None,
    solver='saga',
    tol=0.0001,
    verbose=0,
    warm_start=False,
)
best_l1_model = Pipeline(
    steps=[('qt', quantile_step), ('clf', l1_logreg_step)],
    verbose=False,
)
best_l1_model.fit(X_train, y_train)

In [None]:
# Score the refitted pipeline on the held-out rows: confusion matrix first,
# then the per-class precision/recall report.
y_pred = best_l1_model.predict(X_test)
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)



In [None]:
# Collect the per-split test scores of the best parameter setting and plot the
# F1-score of every cross-validation split as a bar chart.

# `best` may be a Series (a single row taken with `.iloc[i, :]`) or a one-row
# DataFrame; normalise to a Series whose index holds the cv_results_ keys.
best_row = best.iloc[0] if isinstance(best, pd.DataFrame) else best

n_splits = 11
records = []
for i in range(n_splits):
    prefix = 'split' + str(i) + '_test_'
    # Remove the prefix by length. str.lstrip() strips a *set of characters*,
    # not a prefix, so it would also eat the start of any metric name that
    # begins with one of s/p/l/i/t/e/_/digit.
    records.append({
        name[len(prefix):]: best_row[name]
        for name in best_row.index
        if isinstance(name, str) and name.startswith(prefix)
    })

# One row per split, one column per metric.
scores = pd.DataFrame(records, index=range(n_splits)).astype(float)
# NOTE(review): labels split i as test subject i+1 -- confirm that the CV
# splitter really held the subjects out in that order.
scores['test_subject'] = scores.index + 1

sns.set_style(style="whitegrid")
scores2 = pd.melt(
    scores,
    id_vars='test_subject',
    value_vars=['F1-score'],
    var_name='metric',
    value_name='score',
)
# `ci` is kept for compatibility with the seaborn version this notebook was
# written against; newer seaborn releases spell this `errorbar="sd"`.
g = sns.catplot(
    data=scores2, kind='bar',
    x='test_subject', y='score', hue='metric',
    ci="sd", palette="dark", alpha=.6, height=6
)
plt.show()

In [None]:
# try removing some of the subjects
# WORST - 2,8,1



In [None]:
# Load the three inputs in order: `sub`, the feature table `X`, and the
# target `y`.
sub, X, y = (
    pd.read_hdf(path)
    for path in (
        'data/processed/subject_6000.h5',
        'data/features/ach-at-hex_6000_eff_combined.h5',
        'data/processed/y_3_class_6000.h5',
    )
)

In [None]:
# Give `sub` and `y` a fresh 0..n-1 index, discarding the stored one, so that
# index labels and row positions coincide in the cells below.
sub, y = sub.reset_index(drop=True), y.reset_index(drop=True)

In [None]:
# Hold out two recordings as the test set; every other row is training data.
is_held_out = (sub == '02_0315_ach-at') | (sub == '06_0201_ach-hex')
train = sub[~is_held_out].index
test = sub[is_held_out].index

In [None]:
# Features are sliced by position and targets by label; the two agree because
# `y` and `sub` were re-indexed to 0..n-1 in an earlier cell.
X_train = X.iloc[train, :]
X_test = X.iloc[test, :]
y_train = y[train]
y_test = y[test]

In [None]:
# One fold per distinct training group, materialised as a list of
# (train_idx, test_idx) pairs.
train_groups = sub[train]
splitter = GroupKFold(n_splits=len(train_groups.unique()))
gkf = list(splitter.split(X_train, y_train, train_groups))


In [None]:
# Display the entries of `sub` that belong to the training rows.
sub[train]

In [None]:
# Small frame for trying out the feature selector: the first two columns of X
# plus the 'subject' column.
# `.assign` returns a new DataFrame, so nothing is written onto a slice of `X`
# (avoids SettingWithCopyWarning and any accidental write-through to `X`).
# NOTE(review): this rebinds `test`, which earlier cells use for the test-row
# index.
test = X.iloc[:, 0:2].assign(subject=X['subject'])

In [None]:
# Load another target file for inspection.
# NOTE(review): the variable is called `y3` but the file name says `4_class` -- confirm which is meant.
y3 = pd.read_hdf('data/processed/y_4_class_10000.h5')

In [None]:
# List the distinct values present in `y3`.
y3.unique()

In [None]:
# Fit the project's FeatureSelector on the small `test` frame against `y`.
# NOTE(review): `test` includes the 'subject' column; whether FeatureSelector
# expects or ignores it cannot be told from this notebook -- confirm.
fs = FeatureSelector(multiclass=True, n_significant=3)
fs.fit(test,y)

In [None]:
# Apply the fitted selector to the same frame it was fitted on.
test2 = fs.transform(test)