In [None]:
import glob
import os
import numpy as np
import pandas as pd
import seaborn as sns
import joblib
import dill
import dask_ml
import dask.dataframe as dd

from importlib import reload
from scipy import signal, stats
from tqdm.auto import tqdm
from sklearn import neighbors, linear_model, ensemble, decomposition #svm, neural_network
from sklearn import feature_selection, model_selection, metrics, dummy, pipeline, preprocessing, compose
from dask_ml.model_selection import RandomizedSearchCV
from matplotlib import pyplot as plt
from src import main, feature_model
from itertools import product

In [None]:
from dask_jobqueue import SLURMCluster
from distributed import Client, LocalCluster

In [None]:
# Tear down anything left over from a previous run of this cell before
# requesting new workers. The client is closed first so that it disconnects
# while its scheduler is still up, and each handle is guarded on its own so a
# missing `client` (e.g. an earlier run failed between the two assignments)
# does not leave the old cluster running.
try:
    try:
        client.close()
    except NameError:
        pass  # no client yet (first run of this cell)
    try:
        cluster.close()
    except NameError:
        pass  # no cluster yet (first run of this cell)
finally:
    # A fresh cluster is always created, even if the teardown above raised.
    cluster = SLURMCluster(queue='short', cores=4, memory='8gb', walltime='1:00:00', death_timeout=60)
    client = Client(cluster)
    # Let the cluster scale between 1 and 50 jobs depending on load.
    cluster.adapt(minimum=1, maximum=50)

In [None]:
# Name of the feature set to model; it is also embedded in the names of the
# result files written by the parameter sweep.
dataset = 'real_phone-tsfeatures'
features_path = f'/home/hy180/projects/beat_pd/extracted_features/{dataset}.parquet'
features_df = dd.read_parquet(features_path)

# Columns that hold targets (plus the subject identifier) rather than features.
label_cols = ['on_off', 'dyskinesia', 'tremor', 'subject_id']

# Stack the CIS-PD and REAL-PD training label tables row-wise and cast
# `subject_id` to str.
label_csvs = [
    '/home/hy180/projects/beat_pd/data/cis-pd/data_labels/CIS-PD_Training_Data_IDs_Labels.csv',
    '/home/hy180/projects/beat_pd/data/real-pd/data_labels/REAL-PD_Training_Data_IDs_Labels.csv',
]
label_frames = [pd.read_csv(csv_path) for csv_path in label_csvs]
labels = pd.concat(label_frames, axis=0).astype({'subject_id': str})

# Features that fail to compute for a number of observations.
drop_cols = [
    'rms__friedrich_coefficients__m_3__r_30__coeff_0',
    'rms__friedrich_coefficients__m_3__r_30__coeff_1',
    'rms__friedrich_coefficients__m_3__r_30__coeff_2',
    'rms__friedrich_coefficients__m_3__r_30__coeff_3',
    'rms__max_langevin_fixed_point__m_3__r_30',
]

# FFT coefficients 51-99 are null for the window size used here; one column
# name is generated per (coefficient, attribute) pair, coefficient-major.
fft_attrs = ['abs', 'angle', 'imag', 'real']
null_fft_cols = [
    'rms__fft_coefficient__coeff_%d__attr_"%s"' % (coeff, attr)
    for coeff in range(51, 100)
    for attr in fft_attrs
]

# Sample entropy can evaluate to inf, which breaks model fitting.
inf_cols = ['rms__sample_entropy']
# Remove every feature column listed in the three exclusion lists, then attach
# the labels by matching each feature row's `samp_id` to the label table's
# `measurement_id`.
excluded_cols = drop_cols + null_fft_cols + inf_cols
df = features_df.drop(columns=excluded_cols).merge(
    labels,
    left_on='samp_id',
    right_on='measurement_id',
)
# df = df.persist()

# Model

In [None]:
# Robust scaling between the 1st and 99th percentiles by default; the sweep
# also tries wider and narrower quantile ranges.
scaler = preprocessing.RobustScaler(quantile_range=(1, 99))
scaler_pg = {
    'scaler__quantile_range': [
        (.1, 99.9),
        (.5, 99.5),
        (1, 99),
        (5, 95),
        (10, 90),
    ],
}
# scaler = preprocessing.MinMaxScaler()

# Keep features w/ variance in top 95%ile 
def var(X, y):
    """Score each feature by its variance across samples.

    Has the ``score_func(X, y)`` call signature so it can be handed to a
    scikit-learn univariate feature selector.

    Parameters
    ----------
    X : array-like
        Data with samples along axis 0.
    y : object
        Ignored; present only to satisfy the ``score_func(X, y)`` signature.

    Returns
    -------
    The variance of ``X`` taken along axis 0 (one value per column).
    """
    return np.var(X, axis=0)
f_select = feature_selection.SelectPercentile(score_func=var, percentile=95)
# f_select_pg = {'f_select__percentile': [95, 80, 50, 25, 10],}
# During the sweep the retained percentile is drawn uniformly from [0, 100].
f_select_pg = dict(f_select__percentile=stats.uniform(loc=0, scale=100))
# f_select = feature_selection.SelectKBest(feature_selection.mutual_info_regression, k=30)

# Candidate estimators and their search spaces; only the un-commented pair is active.
# model = linear_model.Ridge()
# model_pg = {'model__regressor__alpha': [0.1, 0.5, 1, 2, 5],}
# model = svm.SVR()
# model_pg = {'model__regressor__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'model__regressor__C': stats.chi2(df=2)}
# model = linear_model.ElasticNet()
# model_pg = {'model__regressor__l1_ratio': stats.uniform(0, 1), 'model__regressor__alpha': stats.chi2(df=2), }
# model_pg = {'model__regressor__l1_ratio': [0.01, 0.1, 0.5, 0.8, 0.99], 'model__regressor__alpha': [0.1, 0.5, 1, 2, 5],}
# model = mord.OrdinalRidge()
# model_pg = {'model__regressor__alpha': stats.chi2(df=2), }
model = ensemble.RandomForestRegressor()
# `max_features=1.0` ("use every feature") replaces the former 'auto' entry:
# for RandomForestRegressor 'auto' meant n_features, and the string itself was
# deprecated in scikit-learn 1.1 and removed in 1.3, where sampling it would
# make the fit fail. The float form is accepted by old and new releases alike.
model_pg = {
    'model__regressor__n_estimators': stats.randint(3, 100),
    'model__regressor__max_depth': stats.randint(2, 20),
    'model__regressor__max_features': [.05, .25, 1.0, 'sqrt', 'log2'],
}
# model = neural_network.MLPRegressor(learning_rate='adaptive')
# model_pg = {'model__regressor__hidden_layer_sizes': [(100), (50, 50)]}

# Wrap the regressor so its raw predictions are passed through np.clip with
# bounds 0 and 4 on the way out. No forward `func` is supplied, so only the
# inverse (prediction-side) transform is customised here.
clip_out = preprocessing.FunctionTransformer(np.clip, kw_args=dict(a_min=0, a_max=4))
clipped_model = compose.TransformedTargetRegressor(
    regressor=model,
    inverse_func=clip_out.transform,
)

# scale -> select features -> clipped regressor
steps = [
    ('scaler', scaler),
    ('f_select', f_select),
    ('model', clipped_model),
]
pipe = pipeline.Pipeline(steps, verbose=1)

# Union of the per-step search spaces, keyed by `<step>__<param>`.
param_grid = {}
for step_space in (scaler_pg, f_select_pg, model_pg):
    param_grid.update(step_space)

# greater_is_better=False makes the scorer return negated MSE.
metric = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)

cv = model_selection.StratifiedKFold(shuffle=True)
search = RandomizedSearchCV(
    pipe,
    param_grid,
    n_iter=300,
    scoring=metric,
    cv=cv,
    refit=False,
    scheduler=client,
)

In [None]:
# Identifier columns are dropped from the model inputs together with the labels.
# Defined once here (not per iteration) since it never changes.
id_cols = ['measurement_id', 'samp_id']

# Make sure the output locations exist before a long sweep tries to write to them.
os.makedirs('performance/cv_paramsweeps', exist_ok=True)
os.makedirs('models/paramsweep_winners', exist_ok=True)

for label in ['on_off', 'dyskinesia', 'tremor']:
    # Restart the workers before each sweep.
    client.restart()

    # Only rows that actually carry this label are usable. X and y are both
    # taken from the same filtered frame so they are row-aligned by
    # construction, instead of looking the labels up again through the index
    # (an index-based `.loc` lookup on a lazy frame is fragile and would
    # duplicate rows if the index is not unique).
    labelled = df.dropna(subset=[label])
    features = labelled.drop(columns=[*label_cols, *id_cols])
    # features = features.persist()

    y = labelled[label].astype('int')
    X = features

    search = RandomizedSearchCV(pipe, param_grid, n_iter=300, scoring=metric, cv=cv, refit=False, scheduler=client)
    cv_fit = search.fit(X, y)
    cv_results = pd.DataFrame(cv_fit.cv_results_)

    # One result set per (dataset, estimator, label) combination.
    resultset_name = f'{dataset}_{type(model).__name__}_{label}'
    cv_results.to_csv(f'performance/cv_paramsweeps/{resultset_name}.csv')

    # The search runs with refit=False, so the best parameters are applied to
    # the pipeline by hand. Note that `set_params` configures `pipe` in place
    # and returns it: `winner` is the same object as `pipe`, and what is saved
    # below is a configured but not yet fitted pipeline.
    win_params = cv_results.loc[cv_results.rank_test_score == 1, 'params'].values[0]
    winner = pipe.set_params(**win_params)
    with open(f'models/paramsweep_winners/{resultset_name}.model', 'wb') as f:
        dill.dump(winner, f)

## Single train-test split for evaluation

In [None]:
# label = ''
# The parameter sweep saves its winner as `{dataset}_{estimator}_{label}.model`;
# the path is built the same way here so this cell loads the model that the
# sweep in this notebook actually wrote (the previous path omitted the
# dataset prefix and therefore pointed at a different file).
# NOTE: dill.load can execute arbitrary code from the file; only load model
# files that you produced yourself.
winner_path = f'models/paramsweep_winners/{dataset}_{type(model).__name__}_{label}.model'
with open(winner_path, 'rb') as f:
    winner = dill.load(f)

In [None]:
# Materialise the lazy X and y exactly once each; previously y was computed
# twice (once as the target and once more for `stratify`).
X_local = X.compute()
y_local = y.compute()
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X_local, y_local, test_size=.25, stratify=y_local,
)
# x_train, y_train = smote.fit_resample(x_train, y_train)

# Fit and predict with joblib's 'loky' backend active.
with joblib.parallel_backend('loky'):
    winner.fit(x_train, y_train)
    pred = winner.predict(x_test)

main.plot_performance(y_test, pred)

# Baseline for reference

In [None]:
# label = 'dyskinesia'
# Keep only rows that carry the current label, and derive both the feature
# frame and the target from that same filtered frame so they stay row-aligned
# without an index-based `.loc` lookup on the lazy frame.
labelled = df.dropna(subset=[label])
features = labelled.drop(columns=[*label_cols, *id_cols])

y = labelled[label].astype('int').compute()
# greater_is_better=False makes the scorer return negated MSE.
metric = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)

In [None]:
# Null model: always predicts the mean of the training targets.
baseline_model = dummy.DummyRegressor(strategy='mean')
# Pass in y for X because we don't actually care about X
baseline_cv = model_selection.cross_validate(baseline_model, y, y, scoring=metric)
baseline_scores = baseline_cv['test_score']
# `metric` was built with greater_is_better=False, so the scores are negated
# MSE values; flip the sign back so the title reports the actual MSE.
baseline_mse = -baseline_scores.mean()
# Pass the series by keyword: newer seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=y)
ax.set_title('mse of null model: %f' % baseline_mse)

In [None]:
# patient-specific mean predictor
# numeric_only=True: `labels` also carries `measurement_id` (presumably a
# string id; confirm), and pandas >= 2.0 raises instead of silently skipping
# non-numeric columns when averaging.
subj_means = labels.groupby('subject_id').mean(numeric_only=True)
# Subjects of exactly the rows that have the current label, i.e. the same row
# filter that produced `y`, rather than an index lookup through a possibly
# stale `X`.
X_subjs = df.dropna(subset=[label])[['subject_id']]
# Attach each subject's mean score for this label as that row's prediction.
naive_pred = X_subjs.merge(subj_means[[label]], left_on='subject_id', right_index=True).rename(columns={label: 'prediction'})
main.plot_performance(y, naive_pred.prediction)

# Predictions on test set

In [None]:
label = 'tremor'
# NOTE(review): this path carries no dataset prefix, unlike the file names the
# parameter sweep writes; confirm this is the intended model file.
winner_path = f'models/paramsweep_winners/RandomForestRegressor_{label}.model'
with open(winner_path, mode='rb') as model_file:
    winner = dill.load(model_file)

In [None]:
# TODO: only predict required measurements for each label
# The submission template's index defines which measurements need a
# prediction and in what order.
test_index = pd.read_csv(f'test_predictions/sub_template_{label}.csv', index_col=0).index
test_features_df = pd.concat([
    pd.read_csv('extracted_features/tsfeatures_cis_test.csv', index_col=0), 
    pd.read_csv('extracted_features/tsfeatures_real_test.csv', index_col=0)
]).drop(columns=drop_cols).reindex(test_index)

# `subject_id` is cast to str exactly as is done for the training labels, so
# that ids parsed as integers from one CSV still match the string-typed
# subject ids used as keys of the per-subject means. The cast happens before
# `reindex` so rows introduced by reindexing keep a real NaN.
test_subjs = pd.concat([
    pd.read_csv('data/test_set/cis-pd/cis-pd.CIS-PD_Test_Data_IDs.csv', index_col=0), 
    pd.read_csv('data/test_set/real-pd/real-pd.REAL-PD_Test_Data_IDs.csv', index_col=0)
]).astype({'subject_id': str}).reindex(test_index)

In [None]:
# Measurements with at least one missing feature cannot go through the model;
# they fall back to their subject's mean score for this label.
has_missing = test_features_df.isna().sum(axis=1) > 0
nodata_obs = test_subjs.loc[test_features_df[has_missing].index]
with_subject_means = nodata_obs.join(subj_means, on='subject_id')
nodata_predictions = with_subject_means[[label]].rename(columns={label: 'prediction'})

In [None]:
# Only rows with a complete feature vector are scored by the model.
X = test_features_df.dropna(axis='index')

test_predictions = winner.predict(X)
# Model predictions first, then the per-subject fallback rows.
model_predictions = pd.DataFrame({'prediction': test_predictions}, index=X.index)
test_predictions_df = pd.concat([model_predictions, nodata_predictions], axis=0)

In [None]:
# Write the combined predictions for this label, keeping the index as the first column.
predictions_path = f'test_predictions/test_predictions_{label}.csv'
test_predictions_df.to_csv(predictions_path, index=True)

# Dimensionality Reduction

In [None]:
# Colour the projection by subject rather than by a symptom score.
label = 'subject_id'

# Same preprocessing as the modelling pipeline: robust scaling followed by
# variance-based feature selection.
scaled_features = scaler.fit_transform(features)
X = f_select.fit_transform(scaled_features, y=y)

# NOTE: despite the variable name, this is a 2-component FastICA projection.
pca = decomposition.FastICA(n_components=2)
proj = pca.fit_transform(X)

fig = plt.figure(figsize=(8, 8))
hue_values = df.loc[features.index, label]
_ = sns.scatterplot(x=proj[:, 0], y=proj[:, 1], hue=hue_values, legend='full')

In [None]:
# Local cluster for debugging
# Tear down anything left over from a previous run of this cell. The client is
# closed first so that it disconnects while its scheduler is still up, and
# each handle is guarded on its own so a missing `local_client` does not leave
# the old local cluster running.
try:
    try:
        local_client.close()
    except NameError:
        pass  # no local client yet (first run of this cell)
    try:
        local_cluster.close()
    except NameError:
        pass  # no local cluster yet (first run of this cell)
finally:
    # A fresh local cluster is always created, even if the teardown above raised.
    local_cluster = LocalCluster(n_workers=4, threads_per_worker=1, dashboard_address='0.0.0.0:8786')
    local_client = Client(local_cluster)
    # Let the local cluster scale between 0 and 4 workers depending on load.
    local_cluster.adapt(minimum=0, maximum=4)