In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import os

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
BASELINE_START = "baselinestart"
BASELINE_END = "baselineend"
EASY_START = "easystart"
EASY_END = "easyend"
HARD_START = "hardstart"
HARD_END = "hardend"

In [4]:
"""Gets the row blocks for easy and hard tasks
"""
def read_data(fnirs_path, marker_path):
    fnirs_df =  pd.read_csv(fnirs_path, sep='\t', skiprows=range(4), index_col=False)
    marker_df = pd.read_csv(marker_path, sep='\t', skiprows=range(4), index_col=False)
    
    merged_df = pd.merge(fnirs_df, marker_df, on="Matlab_now", how="left")
    
    return merged_df

In [5]:
def get_row_blocks(merged_df):
    easy_start_rows = merged_df.index[merged_df.Stimulus_Label == EASY_START].tolist()
    easy_end_rows = merged_df.index[merged_df.Stimulus_Label == EASY_END].tolist()
    hard_start_rows = merged_df.index[merged_df.Stimulus_Label == HARD_START].tolist()
    hard_end_rows = merged_df.index[merged_df.Stimulus_Label == HARD_END].tolist()
    
    easy_rows = list(zip(easy_start_rows, easy_end_rows))
    hard_rows = list(zip(hard_start_rows, hard_end_rows))
    
    return (easy_rows, hard_rows)

In [6]:
"""Return subset of df determined by the indices of the row blocks
"""
def get_subsets(merged_df, row_blocks):
    tables = []
    column_names = ["Matlab_now", "A-DC1", "A-DC2", "A-DC3", "A-DC4", "A-DC5",
                    "A-DC6", "A-DC7", "A-DC8", "B-DC1", "B-DC2", "B-DC3", 
                    "B-DC4", "B-DC5", "B-DC6", "B-DC7", "B-DC8"]
    column_indices = [merged_df.columns.get_loc(c) for c in column_names]
    for row_block in row_blocks:
        df = merged_df.iloc[row_block[0]:row_block[1], column_indices]
        start_time = df.iloc[0]["Matlab_now"]
        df["Matlab_now"] = df["Matlab_now"] - start_time

        tables.append(df)
    return tables

In [7]:
"""runs polyfit on the timeseries data
    :param tables: table of blocks of easy / hard tasks
    :param difficulty: 0 - easy, 1 - hard; labeling process
    
    :return: numpy array of
    feature_row: 
        AC-1 gradient, AC-1 intercept, AC-1 mean, AC-2 gradient ... DC-8 gradient DC-8 intercept DC-8 mean difficulty
    
    
    Dictionary of key: channel
                           value: (gradient, mean, difficulty)
"""
def extract_features(tables, difficulty):
    column_names = [
                    "A-DC1_mean", "A-DC1_slope", "A-DC1_intercept",
                    "A-DC2_mean", "A-DC2_slope", "A-DC2_intercept",
                    "A-DC3_mean", "A-DC3_slope", "A-DC3_intercept",
                    "A-DC4_mean", "A-DC4_slope", "A-DC4_intercept",
                    "A-DC5_mean", "A-DC5_slope", "A-DC5_intercept",
                    "A-DC6_mean", "A-DC6_slope", "A-DC6_intercept",
                    "A-DC7_mean", "A-DC7_slope", "A-DC7_intercept",
                    "A-DC8_mean", "A-DC8_slope", "A-DC8_intercept",
                    "B-DC1_mean", "B-DC1_slope", "B-DC1_intercept",
                    "B-DC2_mean", "B-DC2_slope", "B-DC2_intercept",
                    "B-DC3_mean", "B-DC3_slope", "B-DC3_intercept",
                    "B-DC4_mean", "B-DC4_slope", "B-DC4_intercept",
                    "B-DC5_mean", "B-DC5_slope", "B-DC5_intercept",
                    "B-DC6_mean", "B-DC6_slope", "B-DC6_intercept",
                    "B-DC7_mean", "B-DC7_slope", "B-DC7_intercept",
                    "B-DC8_mean", "B-DC8_slope", "B-DC8_intercept",
                    "difficulty"
                    ]
    df = pd.DataFrame(columns=column_names)
    data = []
    for table in tables:
        x = table["Matlab_now"].values
        cols = table.columns[1:]

        feature_row_dict = {}
        for col in cols:
            y = table[col].values
            slope, intercept = np.poly1d(np.polyfit(x, y, 1))
            avg = table[col].mean()
            mean_col = col + "_mean"
            slope_col = col + "_slope"
            intercept_col = col + "_intercept"
            feature_row_dict[mean_col] = avg
            feature_row_dict[slope_col] = slope
            feature_row_dict[intercept_col] = intercept

        feature_row_dict["difficulty"] = difficulty
        data.append(feature_row_dict)
        
    return pd.DataFrame(data, columns=column_names)

In [8]:
"""Extract features from given dataset
    :param data_path: Directory containing the files
    
    :return: gets all the easy and hard features from a given dataset
"""
def get_features_for_dataset(data_path):
    fnirs_path = os.path.join(os.getcwd(), data_path, "fNIRSdata.txt")
    marker_path = os.path.join(os.getcwd(), data_path, "markers.txt")
    merged_df = read_data(fnirs_path, marker_path)
    easy_rows, hard_rows = get_row_blocks(merged_df)
    
    easy_tables = get_subsets(merged_df, easy_rows)
    hard_tables = get_subsets(merged_df, hard_rows)
    easy_feature_rows = extract_features(easy_tables, 0)
    hard_feature_rows = extract_features(hard_tables, 1)
    
    features = easy_feature_rows.append(hard_feature_rows, ignore_index=True)

    return features

In [9]:
features_902 = get_features_for_dataset("/Users/sjjin/workspace/hci_lab/data/S902/2015-02-26_11-24-48-120")
features_903 = get_features_for_dataset("/Users/sjjin/workspace/hci_lab/data/S903/2015-02-27_13-20-42-120")
features_904 = get_features_for_dataset("/Users/sjjin/workspace/hci_lab/data/S904/2015-02-27_15-30-27-120")
features_905 = get_features_for_dataset("/Users/sjjin/workspace/hci_lab/data/S905/2015-03-02_13-14-35-120")
features_906 = get_features_for_dataset("/Users/sjjin/workspace/hci_lab/data/S906/2015-03-05_11-17-38-120")

train_set = features_902.append([features_903, features_904, features_905], ignore_index=True)
test_set = features_906

In [10]:
features_902.head()

Unnamed: 0,A-DC1_mean,A-DC1_slope,A-DC1_intercept,A-DC2_mean,A-DC2_slope,A-DC2_intercept,A-DC3_mean,A-DC3_slope,A-DC3_intercept,A-DC4_mean,...,B-DC6_mean,B-DC6_slope,B-DC6_intercept,B-DC7_mean,B-DC7_slope,B-DC7_intercept,B-DC8_mean,B-DC8_slope,B-DC8_intercept,difficulty
0,1851.172316,-1.984235,1880.858525,466.353955,-0.530168,474.285813,128.55,-0.115788,130.282311,39.243842,...,112.805367,-0.069798,113.849613,35.587627,-0.032143,36.068515,12.32404,-0.015506,12.556023,0
1,1839.03966,-0.385561,1844.792114,466.923229,0.035657,466.39124,129.163739,-0.017192,129.420241,39.449632,...,114.330312,-0.018095,114.600281,35.867167,-0.015114,36.092656,12.427904,-0.013832,12.634279,0
2,1845.822034,-1.474416,1867.886219,473.236158,-0.378029,478.893247,131.143785,-0.105707,132.725658,39.88709,...,114.895198,-0.034731,115.414938,35.787119,-0.008634,35.916323,12.31452,-0.001905,12.343033,0
3,1817.669492,-1.093188,1834.03344,463.981073,-0.347829,469.187723,128.513277,-0.131384,130.479972,39.039605,...,113.402825,-0.03489,113.925098,34.930508,-0.014941,35.154165,12.004746,-0.010183,12.15717,0
4,1811.059322,-1.796173,1837.950809,463.242655,-0.588048,472.046646,128.80565,-0.165197,131.2789,39.188362,...,115.247175,-0.107922,116.862938,35.475056,-0.038056,36.044808,12.224944,-0.015189,12.452343,0


In [11]:
train_set.shape

(88, 49)

In [12]:
feature_columns = [
                    "A-DC1_mean", "A-DC1_slope", "A-DC1_intercept",
                    "A-DC2_mean", "A-DC2_slope", "A-DC2_intercept",
                    "A-DC3_mean", "A-DC3_slope", "A-DC3_intercept",
                    "A-DC4_mean", "A-DC4_slope", "A-DC4_intercept",
                    "A-DC5_mean", "A-DC5_slope", "A-DC5_intercept",
                    "A-DC6_mean", "A-DC6_slope", "A-DC6_intercept",
                    "A-DC7_mean", "A-DC7_slope", "A-DC7_intercept",
                    "A-DC8_mean", "A-DC8_slope", "A-DC8_intercept",
                    "B-DC1_mean", "B-DC1_slope", "B-DC1_intercept",
                    "B-DC2_mean", "B-DC2_slope", "B-DC2_intercept",
                    "B-DC3_mean", "B-DC3_slope", "B-DC3_intercept",
                    "B-DC4_mean", "B-DC4_slope", "B-DC4_intercept",
                    "B-DC5_mean", "B-DC5_slope", "B-DC5_intercept",
                    "B-DC6_mean", "B-DC6_slope", "B-DC6_intercept",
                    "B-DC7_mean", "B-DC7_slope", "B-DC7_intercept",
                    "B-DC8_mean", "B-DC8_slope", "B-DC8_intercept"
                    ]
difficulty = ["difficulty"]

In [13]:
train_x = train_set[feature_columns]
train_y = train_set[difficulty]
test_x = test_set[feature_columns]
test_y = test_set[difficulty]

In [14]:
len(train_x.columns)

48

In [15]:
train_x

Unnamed: 0,A-DC1_mean,A-DC1_slope,A-DC1_intercept,A-DC2_mean,A-DC2_slope,A-DC2_intercept,A-DC3_mean,A-DC3_slope,A-DC3_intercept,A-DC4_mean,...,B-DC5_intercept,B-DC6_mean,B-DC6_slope,B-DC6_intercept,B-DC7_mean,B-DC7_slope,B-DC7_intercept,B-DC8_mean,B-DC8_slope,B-DC8_intercept
0,1851.172316,-1.984235,1880.858525,466.353955,-0.530168,474.285813,128.550000,-0.115788,130.282311,39.243842,...,376.346990,112.805367,-0.069798,113.849613,35.587627,-0.032143,36.068515,12.324040,-0.015506,12.556023
1,1839.039660,-0.385561,1844.792114,466.923229,0.035657,466.391240,129.163739,-0.017192,129.420241,39.449632,...,376.713039,114.330312,-0.018095,114.600281,35.867167,-0.015114,36.092656,12.427904,-0.013832,12.634279
2,1845.822034,-1.474416,1867.886219,473.236158,-0.378029,478.893247,131.143785,-0.105707,132.725658,39.887090,...,381.074721,114.895198,-0.034731,115.414938,35.787119,-0.008634,35.916323,12.314520,-0.001905,12.343033
3,1817.669492,-1.093188,1834.033440,463.981073,-0.347829,469.187723,128.513277,-0.131384,130.479972,39.039605,...,376.514876,113.402825,-0.034890,113.925098,34.930508,-0.014941,35.154165,12.004746,-0.010183,12.157170
4,1811.059322,-1.796173,1837.950809,463.242655,-0.588048,472.046646,128.805650,-0.165197,131.278900,39.188362,...,384.586052,115.247175,-0.107922,116.862938,35.475056,-0.038056,36.044808,12.224944,-0.015189,12.452343
5,1786.807910,-0.643746,1796.435970,455.793220,-0.077850,456.957567,126.631356,-0.012623,126.820147,38.572203,...,374.291242,113.092090,-0.001152,113.109320,34.712797,-0.002086,34.743998,11.853079,-0.004310,11.917545
6,1698.646893,-2.157426,1730.952860,435.157345,-0.563777,443.599519,121.421186,-0.138119,123.489417,37.066130,...,372.839858,111.128531,-0.066724,112.127671,33.991045,-0.028000,34.410320,11.713136,-0.018177,11.985319
7,1690.658192,-2.555758,1728.875645,433.544915,-0.616226,442.759629,121.238983,-0.206060,124.320300,37.023418,...,370.633200,111.442090,-0.001255,111.460853,33.989153,-0.008277,34.112921,11.720932,-0.008704,11.851089
8,1688.745763,-1.280417,1707.926444,432.470904,-0.484249,439.724968,120.851977,-0.127277,122.758585,36.860028,...,376.114491,111.153390,-0.092101,112.533069,33.866723,-0.021939,34.195375,11.616215,-0.004976,11.690762
9,1649.881356,-1.637229,1674.438990,421.594350,-0.380145,427.296337,117.938136,-0.091738,119.314155,35.951158,...,363.010161,109.049718,0.022043,108.719077,33.398475,0.014741,33.177368,11.442797,0.005280,11.363596


In [16]:
train_x.shape

(88, 48)

In [17]:
train_y.shape

(88, 1)

In [18]:
train_x.isnull().values.any()

False

In [19]:
logisticRegr = LogisticRegression()
logisticRegr.fit(train_x, train_y.values.ravel())



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [20]:
log_pred = logisticRegr.predict(test_x)
accuracy_score(test_y, log_pred)

0.47619047619047616

In [82]:
for i in range(10):
    clf=RandomForestClassifier(n_estimators=100)
    clf.fit(train_x, train_y.values.ravel())
    y_pred = clf.predict(test_x)
    print(accuracy_score(test_y, y_pred)) # ranges from 71 to 91%. Seems like there is a 'randomness' to the prediction

0.8571428571428571
0.9047619047619048
0.9047619047619048
0.7619047619047619
0.8095238095238095
0.9047619047619048
0.9523809523809523
0.8571428571428571
0.9047619047619048
0.9047619047619048


In [55]:
y_pred = clf.predict(test_x)

In [56]:
accuracy_score(test_y, y_pred) # ranges from 71 to 91%. Seems like there is a 'randomness' to the prediction

0.8095238095238095

In [24]:
train_set.columns

Index(['A-DC1_mean', 'A-DC1_slope', 'A-DC1_intercept', 'A-DC2_mean',
       'A-DC2_slope', 'A-DC2_intercept', 'A-DC3_mean', 'A-DC3_slope',
       'A-DC3_intercept', 'A-DC4_mean', 'A-DC4_slope', 'A-DC4_intercept',
       'A-DC5_mean', 'A-DC5_slope', 'A-DC5_intercept', 'A-DC6_mean',
       'A-DC6_slope', 'A-DC6_intercept', 'A-DC7_mean', 'A-DC7_slope',
       'A-DC7_intercept', 'A-DC8_mean', 'A-DC8_slope', 'A-DC8_intercept',
       'B-DC1_mean', 'B-DC1_slope', 'B-DC1_intercept', 'B-DC2_mean',
       'B-DC2_slope', 'B-DC2_intercept', 'B-DC3_mean', 'B-DC3_slope',
       'B-DC3_intercept', 'B-DC4_mean', 'B-DC4_slope', 'B-DC4_intercept',
       'B-DC5_mean', 'B-DC5_slope', 'B-DC5_intercept', 'B-DC6_mean',
       'B-DC6_slope', 'B-DC6_intercept', 'B-DC7_mean', 'B-DC7_slope',
       'B-DC7_intercept', 'B-DC8_mean', 'B-DC8_slope', 'B-DC8_intercept',
       'difficulty'],
      dtype='object')

In [25]:
feature_imp = pd.Series(clf.feature_importances_,index=feature_columns).sort_values(ascending=False)

In [26]:
feature_imp

B-DC7_slope        0.044995
B-DC6_slope        0.043932
A-DC7_slope        0.034317
B-DC5_slope        0.033123
A-DC2_slope        0.030545
A-DC3_mean         0.030499
B-DC8_slope        0.029352
B-DC6_mean         0.027178
A-DC8_slope        0.026821
B-DC7_intercept    0.025160
B-DC2_intercept    0.024286
B-DC1_slope        0.024000
B-DC5_mean         0.023983
A-DC6_slope        0.023332
B-DC6_intercept    0.022226
B-DC2_slope        0.021850
A-DC4_mean         0.021587
A-DC1_slope        0.021522
A-DC5_slope        0.020959
A-DC4_slope        0.020489
A-DC2_mean         0.020434
A-DC6_intercept    0.020420
A-DC5_intercept    0.019687
B-DC4_mean         0.019401
A-DC5_mean         0.019067
A-DC7_mean         0.018879
A-DC8_intercept    0.018256
A-DC6_mean         0.018245
A-DC3_slope        0.018201
A-DC8_mean         0.017560
B-DC4_slope        0.017197
B-DC1_mean         0.017191
B-DC3_mean         0.016737
A-DC7_intercept    0.016207
B-DC3_intercept    0.015869
B-DC4_intercept    0

In [90]:
feature_threshold = 0.025
selected_columns = feature_imp[(feature_imp > feature_threshold)].axes[0].tolist()

In [91]:
selected_columns

['B-DC7_slope',
 'B-DC6_slope',
 'A-DC7_slope',
 'B-DC5_slope',
 'A-DC2_slope',
 'A-DC3_mean',
 'B-DC8_slope',
 'B-DC6_mean',
 'A-DC8_slope',
 'B-DC7_intercept']

In [92]:
len(selected_columns)

10

In [94]:
for i in range(10):
    clf2=RandomForestClassifier(n_estimators=100)
    clf2.fit(train_x[selected_columns], train_y.values.ravel())
    y_pred2 = clf2.predict(test_x[selected_columns])
    print(accuracy_score(test_y, y_pred2)) # ranges from 71 to 91%. Seems like there is a 'randomness' to the prediction

0.6666666666666666
0.7142857142857143
0.7142857142857143
0.6666666666666666
0.7619047619047619
0.7142857142857143
0.6666666666666666
0.7142857142857143
0.7619047619047619
0.7142857142857143


**Random Forest**

More data

Hyper parameters

    !) More trees? Log of # of columns; 7~9
    2) Depth of each tree, keep it small (3~4)
    
feature importance: Cross entropy, gini

Might be overfitting

**LSTM**

put data into mfcc format: All input matrices must be the same shape

Might be overkill for the task, but we can scale down

usually, x10 more data = 5~10% improvement in performance

In [2]:
import torch.nn

ModuleNotFoundError: No module named 'torch'