In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
from os.path import join
import os
import json
import math

In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
DATA_DIR = "data"
TSF_WINDOW_DIR = join(DATA_DIR, "cis-pd", "training_data_tsf")
TSF_WINDOW_FILES = [ join(TSF_WINDOW_DIR, f) for f in os.listdir(TSF_WINDOW_DIR) if f.endswith(".tsf.csv") ]
LABELS_FILE = join(DATA_DIR, "cis-pd", "data_labels", "CIS-PD_Training_Data_IDs_Labels.csv")

In [4]:
labels_df = pd.read_csv(LABELS_FILE, index_col=0)
labels_df.head()

m_ids = [ os.path.basename(f[:-8]) for f in TSF_WINDOW_FILES ]
labels_df = labels_df.loc[m_ids,:]
labels_df = labels_df.sort_values(by="on_off", ascending=False)
labels_df.shape

(327, 4)

In [7]:
with open(join(DATA_DIR, "tsf_window_variables.json")) as f:
    top_vars = json.load(f)

In [143]:
X_list = []
for m_id in labels_df.index.values.tolist():
    f = join(TSF_WINDOW_DIR, f"{m_id}.tsf.csv")
    m_df = pd.read_csv(f, index_col=0)
    
    m_X_df = pd.DataFrame(data=[], index=[], columns=["stat", "variable", "value", "dim"])
    for dim, dim_df in m_df.groupby("id"):
        dim_df = dim_df.set_index("window_start", drop=True)
        #dim_df = dim_df[top_vars]
        dim_summary_df = dim_df.describe().reset_index()
        dim_summary_df = dim_summary_df.melt(id_vars=["index"]).rename(columns={"index": "stat"})
        dim_summary_df = dim_summary_df.loc[dim_summary_df["stat"].isin(["mean", "std"])]
        dim_summary_df["dim"] = dim
        m_X_df = m_X_df.append(dim_summary_df, ignore_index=True)
    X_list.append(m_X_df["value"].values)
len(X_list)

327

In [68]:
X = np.stack(X_list, axis=-1).T
X.shape

(327, 480)

In [69]:
y = []
for m_id in labels_df.index.values.tolist():
    y.append(labels_df.at[m_id, "on_off"])
y = np.array(y)
X = X[~np.isnan(y)]
y = y[~np.isnan(y)]

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [135]:
clf = RandomForestClassifier(n_estimators=1000, bootstrap=False, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [139]:
y_pred = clf.predict(X_test)

In [142]:
mean_squared_error(y_test, y_pred)

1.7254901960784315