In [1]:
import tsfresh as tsf
import glob
import os
import numpy as np
import pandas as pd
import seaborn as sns
import mord
import joblib

from importlib import reload
from scipy import signal, stats
from tqdm.auto import tqdm
from sklearn import preprocessing, neighbors, linear_model, ensemble, decomposition, compose
from sklearn import feature_selection, metrics, dummy, pipeline, svm, neural_network, model_selection
from dask_ml.model_selection import RandomizedSearchCV
from matplotlib import pyplot as plt
from src import main, feature_model



In [2]:
features_df = pd.read_csv('tsfeatures.csv', index_col=0)

label_cols = ['on_off', 'dyskinesia', 'tremor', 'subject_id']
labels = pd.read_csv('data/cis-pd/data_labels/CIS-PD_Training_Data_IDs_Labels.csv', index_col=0)

# These features don't compute for a number of observations
drop_cols = ['rms__friedrich_coefficients__m_3__r_30__coeff_0',
       'rms__friedrich_coefficients__m_3__r_30__coeff_1',
       'rms__friedrich_coefficients__m_3__r_30__coeff_2',
       'rms__friedrich_coefficients__m_3__r_30__coeff_3',
       'rms__max_langevin_fixed_point__m_3__r_30']
df = labels[label_cols].merge(features_df.drop(columns=drop_cols), left_index=True, right_index=True)

In [3]:
from dask_jobqueue import SLURMCluster
from dask.distributed import Client, LocalCluster

In [23]:
scaler = preprocessing.RobustScaler(quantile_range=(1, 99))
scaler_pg= {'scaler__quantile_range': [(.1, 99.9), (.5, 99.5), (1, 99), (5, 95), (10, 90)],}

# Keep features w/ variance in top 95%ile 
var = lambda X, y: np.var(X, axis=0)
f_select = feature_selection.SelectPercentile(var, percentile=95)
# f_select_pg = {'f_select__percentile': [95, 80, 50, 25, 10],}
f_select_pg = {'f_select__percentile': stats.uniform(0, 100)}
# f_select = feature_selection.SelectKBest(feature_selection.mutual_info_regression, k=30)


model = ensemble.RandomForestRegressor()
model_pg = {'model__regressor__n_estimators': [30], 'model__regressor__max_depth': stats.uniform(0, 100)}
# model_pg = {'n_estimators': [300], 'max_depth': stats.uniform(0, 100)}

clip_out = preprocessing.FunctionTransformer(np.clip, kw_args={'a_min': 0, 'a_max': 4})
clipped_model = compose.TransformedTargetRegressor(regressor=model, inverse_func=clip_out.transform)

metric = metrics.make_scorer(metrics.mean_squared_error, greater_is_better=False)

pipe = pipeline.Pipeline([
    ('scaler', scaler), 
    ('f_select', f_select), 
    ('model', clipped_model),
], verbose=1)

param_grid = {
    **scaler_pg,
    **f_select_pg,
    **model_pg,
}

cv = model_selection.StratifiedKFold(shuffle=True)
search = RandomizedSearchCV(pipe, param_grid, n_iter=50, scoring=metric, cv=cv)

In [5]:
label = 'tremor'
features = df[df.index.isin(df.dropna(subset=[label]).index)].drop(columns=[*label_cols])

y = df.loc[features.index, label].astype('int')
X = features

In [17]:
try:
    local_cluster.close()
except NameError:
    pass
finally:
    local_cluster = LocalCluster(n_workers=4, threads_per_worker=1, dashboard_address='0.0.0.0:8786')
    local_client = Client(local_cluster)
    with joblib.parallel_backend('dask', client=client, scatter=[X, y]):
        cv_fit = search.fit(X, y)
    # cv_results = pd.DataFrame(cv_fit.cv_results_)

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError


KeyboardInterrupt: 

In [None]:
local_cluster.scheduler_info

In [None]:
local_client.status

In [6]:
cluster = SLURMCluster(queue='short', cores=1, memory='4gb', walltime='30:00', death_timeout=300)
client = Client(cluster)

In [16]:
cluster.scale(10)

In [9]:
client

0,1
Client  Scheduler: tcp://10.120.16.230:58325  Dashboard: http://10.120.16.230:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.00 GB


In [24]:
with joblib.parallel_backend('dask', client=client, scatter=[X, y]):
    cv_fit = search.fit(X, y)

distributed.utils - ERROR - '<' not supported between instances of 'NoneType' and 'tuple'
Traceback (most recent call last):
  File "/home/hy180/anaconda3/lib/python3.7/site-packages/distributed/utils.py", line 665, in log_errors
    yield
  File "/home/hy180/anaconda3/lib/python3.7/site-packages/distributed/dashboard/components/scheduler.py", line 1739, in graph_doc
    graph = TaskGraph(scheduler, sizing_mode="stretch_both")
  File "/home/hy180/anaconda3/lib/python3.7/site-packages/distributed/dashboard/components/scheduler.py", line 1121, in __init__
    self.layout = GraphLayout(scheduler)
  File "/home/hy180/anaconda3/lib/python3.7/site-packages/distributed/diagnostics/graph_layout.py", line 39, in __init__
    self.scheduler, dependencies=dependencies, priority=priority
  File "/home/hy180/anaconda3/lib/python3.7/site-packages/distributed/diagnostics/graph_layout.py", line 43, in update_graph
    stack = sorted(dependencies, key=lambda k: priority.get(k, 0), reverse=True)
TypeErr

In [25]:
cv_fit

RandomizedSearchCV(cache_cv=True,
                   cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
                   error_score='raise',
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              RobustScaler(copy=True,
                                                           quantile_range=(1,
                                                                           99),
                                                           with_centering=True,
                                                           with_scaling=True)),
                                             ('f_select',
                                              SelectPercentile(percentile=95,
                                                               score_func=<function <lambda> at 0x7f131dbe98c0>)),
                                             ('model',
                                         

In [30]:
df = pd.DataFrame(cv_fit.cv_results_)
df.sort_values(by='mean_test_score', ascending=False)

Unnamed: 0,params,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,param_f_select__percentile,param_model__regressor__max_depth,param_model__regressor__n_estimators,param_scaler__quantile_range
6,"{'f_select__percentile': 70.34484426282532, 'm...",9.11735,0.629877,0.014388,0.007063,-0.671047,-0.694255,-0.724973,-0.695065,-0.679129,-0.69288,0.018449,1,70.3448,54.5801,30,"(0.5, 99.5)"
28,"{'f_select__percentile': 31.78512412752439, 'm...",4.460455,0.320579,0.011117,0.002389,-0.701259,-0.672727,-0.714307,-0.713463,-0.6646,-0.693263,0.020769,2,31.7851,38.1283,30,"(0.1, 99.9)"
21,"{'f_select__percentile': 39.1473451896272, 'mo...",3.075471,0.27025,0.008354,0.001075,-0.688833,-0.699353,-0.695199,-0.710485,-0.679974,-0.694768,0.010216,3,39.1473,6.6179,30,"(0.1, 99.9)"
11,"{'f_select__percentile': 97.62307938101686, 'm...",12.954676,1.425851,0.009334,0.001268,-0.726511,-0.667622,-0.716069,-0.720228,-0.65796,-0.697677,0.028842,4,97.6231,96.1859,30,"(5, 95)"
38,"{'f_select__percentile': 94.91507175566345, 'm...",12.520334,0.601928,0.010135,0.001156,-0.703591,-0.709636,-0.706887,-0.718037,-0.668493,-0.701336,0.017092,5,94.9151,76.1217,30,"(0.1, 99.9)"
42,"{'f_select__percentile': 81.52353974823325, 'm...",10.561077,0.203605,0.008868,0.000447,-0.703447,-0.699537,-0.727888,-0.702306,-0.679947,-0.702623,0.01524,6,81.5235,83.3297,30,"(0.5, 99.5)"
8,"{'f_select__percentile': 52.40693353548963, 'm...",7.443724,0.185149,0.013055,0.006906,-0.701444,-0.688051,-0.727641,-0.709353,-0.7068,-0.706641,0.012815,7,52.4069,26.5321,30,"(0.1, 99.9)"
0,"{'f_select__percentile': 68.61757512469508, 'm...",9.125982,0.244057,0.009899,0.000755,-0.66526,-0.703808,-0.735885,-0.73481,-0.698365,-0.707594,0.026208,8,68.6176,18.215,30,"(0.5, 99.5)"
7,"{'f_select__percentile': 44.494033992680215, '...",6.229875,0.431822,0.009204,0.000469,-0.690023,-0.706469,-0.753261,-0.703154,-0.685339,-0.707636,0.024115,9,44.494,43.8047,30,"(0.5, 99.5)"
35,"{'f_select__percentile': 93.51078575433853, 'm...",12.51185,0.417619,0.009673,0.001449,-0.730258,-0.717122,-0.750088,-0.712614,-0.636001,-0.709236,0.038831,10,93.5108,39.7584,30,"(1, 99)"


In [37]:
cv_fit.best_estimator_

Pipeline(memory=None,
         steps=[('scaler',
                 RobustScaler(copy=True, quantile_range=(0.5, 99.5),
                              with_centering=True, with_scaling=True)),
                ('f_select',
                 SelectPercentile(percentile=70.34484426282532,
                                  score_func=<function <lambda> at 0x7f1317fb3b00>)),
                ('model',
                 TransformedTargetRegressor(check_inverse=True, func=None,
                                            inverse_func=<bound method FunctionTransformer.transfo...
                                                                            ccp_alpha=0.0,
                                                                            criterion='mse',
                                                                            max_depth=54.58008647530026,
                                                                            max_features='auto',
                                          

In [38]:
foo = cv_fit.best_estimator_.named_steps['f_select']

In [46]:
pd.Series(cv_fit.best_estimator_.named_steps['model'].regressor_.feature_importances_, index=X.columns[foo.get_support()]).sort_values()

rms__has_duplicate                                                 0.000000
rms__has_duplicate_min                                             0.000000
rms__large_standard_deviation__r_0.1                               0.000000
rms__large_standard_deviation__r_0.15000000000000002               0.000000
rms__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.6    0.000024
                                                                     ...   
rms__autocorrelation__lag_9                                        0.014570
rms__ar_coefficient__k_10__coeff_3                                 0.014654
rms__ar_coefficient__k_10__coeff_1                                 0.024394
rms__fft_aggregated__aggtype_"variance"                            0.041821
rms__number_peaks__n_1                                             0.057817
Length: 527, dtype: float64

In [36]:
len(cv_fit.best_estimator_.named_steps['model'].regressor_.feature_importances_)

527