In [28]:
"""
Author: Zaw
Predicting the gain of smooth pursuit eye movement 
"""

import numpy as np
import pandas as pd
import math 
import time
import os
import sys
import warnings
warnings.simplefilter(action='ignore')
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from hyperopt import hp
#start = time.time()
import pickle

### Reading dataset

In [22]:
#data path
path = os.getcwd()
dirname = os.path.dirname(path)
data_file = 'data/data.csv'
data_path = os.path.join(dirname,data_file)

In [23]:
# load dataset
data = pd.read_csv(data_path)

In [24]:
data.head()

Unnamed: 0,Label,Patient,A1_1Deg,A1_1Deg_err,B1_1Deg,B1_1Deg_err,C1_1Deg,C1_1Deg_err,1_Deg_ResSoS_1,1Deg_AdjR2_1,...,8_Deg_ResSoS_1,8Deg_AdjR2_1,A2_8Deg,A2_8Deg_err,B2_8Deg,B2_8Deg_err,C2_8Deg,C2_8Deg_err,8_Deg_ResSoS_2,8Deg_AdjR2_2
0,Not_Parkinson's,PD001,10.38819,0.006132,0.049329,6e-06,0.011989,0.000337,8928.709194,0.990598,...,6698.838281,0.951413,10.273124,0.044206,0.386347,0.000384,0.037356,0.002439,6809.4558,0.939095
1,Not_Parkinson's,PD002,10.429055,0.006929,0.049478,7e-06,0.002837,0.000379,11388.604704,0.988092,...,4769.901822,0.96736,11.4825,0.049154,0.395412,0.00038,-0.002486,0.002419,8383.48184,0.939586
2,Not_Parkinson's,PD003,10.097088,0.005091,0.049368,6e-06,0.005707,0.000287,6140.292312,0.993141,...,3488.747176,0.968709,10.084338,0.019838,0.394595,0.000174,-0.035466,0.001106,1349.96224,0.987259
3,Not_Parkinson's,PD004,10.482881,0.022179,0.049585,2.3e-05,0.016056,0.001212,117748.97971,0.879372,...,10972.045844,0.920137,11.493414,0.048867,0.401386,0.000377,-0.042463,0.002395,8242.136178,0.94037
4,Not_Parkinson's,PD005,10.181273,0.008833,0.049368,1e-05,0.001569,0.000494,18447.340517,0.979692,...,18268.960666,0.809586,10.370348,0.036121,0.395392,0.000308,-0.030769,0.00196,4485.300264,0.960035


### Add calculated gain

In [25]:
OneDegGain = np.array([
                    0.912073221615219,
                    0.913615300167855,
                    0.946453363089275,
                    0.930324124079886,
                    1.02416020436668,
                    0.860239837548296,
                    0.964705222165739,
                    0.926124390992111,
                    0.968894500894799,
                    0.971483673217362,
                    0.944374366569671,
                    0.933778991410942,
                    0.890761283462127,

                    1.01702677636733,
                    1.01132254481336,
                    0.860332024178068,
                    0.898244125409108,
                    0.79226520992959,
                    0.86763213956782,

                    1.04294995675946,
                    0.954852019394754,
                    0.83139435252038,
                    0.969027023632338,
                    0.819025317130086,
                    1.59441787272771,
                    0.827558754578517,
                    0.882328237761475,
                    0.888806663106777  
                ])

TowDegGain  = np.array([
                    0.88795482315922,
                    0.85125583179904,
                    0.90557113971074,
                    0.839308531914961,
                    0.944624517823793,
                    0.936063494064383,
                    0.919761689613053,
                    0.857703024614198,
                    0.973218178159532,
                    0.936570806828469,
                    0.955653090017782,
                    0.943385867113339,
                    1.0186446733178,

                    0.957719524164782,
                    0.847863270306327,
                    0.762131387734656,
                    0.71762368625312,
                    0.699023420441996,
                    0.860669705999712,
                    
                    0.882514198813699,
                    0.900439093391636,
                    0.852845556682743,
                    0.662387043701962,
                    1.02691189892667,
                    0.929980500477329,
                    0.795054974778894,
                    0.915409616170048,
                    0.936562782215738
])

FourDegGain = np.array([
                    0.842657428787201,
                    0.927453824629543,
                    0.86099798858682,
                    0.826284460023473,
                    0.907866303422781,
                    0.886319852935139,
                    0.879363613650586,
                    0.945246226705543,
                    0.865730996104868,
                    0.949044695789533,
                    0.927054434038832,
                    0.907460103985577,
                    0.768451296309936,

                    0.670432552659765,
                    0.82906287014668,
                    0.792533491957787,
                    0.564497325152498,
                    0.566934462769341,
                    0.745104906289544,

                    0.863969643770036,
                    0.810695262798471,
                    0.665919934919724,
                    0.531701684005337,
                    0.804825842988611,
                    0.849537457619557,
                    0.843883703581647,
                    0.696788329556185,
                    0.878210689819177
])

SixDegGain = np.array([
                    0.795081992082097,
                    0.810824945533769,
                    0.776092810922058,
                    0.667632972614243,
                    0.875563679555973,
                    0.832291613200056,
                    0.896167465701247,
                    0.838934215329476,
                    0.91951999621913,
                    0.931875303042642,
                    0.881653294348028,
                    0.866711106710116,
                    0.950478365248271,

                    0.876172814217401,
                    0.793820437144166,
                    0.674619968989113,
                    0.508275303789709,
                    0.454613572040806,
                    0.761053355757645,

                    0.708138512300574,
                    0.875686044405767,
                    0.62197374329538,
                    0.682965128345167,
                    0.745383783235028,
                    0.904919560234923,
                    0.97349927736433,
                    0.790221320229067,
                    0.96896278814672
])

EightDegGain = np.array([
                    0.738482610210994,
                    0.986636423035039,
                    0.745501747649165,
                    0.757990700258398,
                    0.909797906001487,
                    0.781053010159343,
                    0.85865978762173,
                    0.859516710190982,
                    0.885411917285177,
                    0.828367438419201,
                    0.911980104842521,
                    0.863014297113752,
                    0.708399551278433,

                    0.673980441334396,
                    0.795542507563886,
                    0.795542507563886,
                    0.570216915379213,
                    0.470689666436208,
                    0.816745486005565,

                    0.730673411270229,
                    0.84023883503095,
                    0.634293750883143,
                    0.529398709071705,
                    0.750469831451345,
                    0.748589580457503,
                    0.881237281923916,
                    0.750773552799239,
                    0.972603229161408
])

In [26]:
# Array to dataframe
gain = pd.DataFrame({'OneDegGain':OneDegGain,'TowDegGain':TowDegGain,'FourDegGain':FourDegGain,
                    'SixDegGain':SixDegGain,'EightDegGain':EightDegGain})

In [27]:
# Concat with main dataframe
gain_df = pd.concat([data, gain], axis=1)

In [29]:
# Standardization
sc = StandardScaler()
data_input = pd.DataFrame(sc.fit_transform(gain_df.iloc[:,2:]),index=gain_df.iloc[:,2:].index, columns= gain_df.iloc[:,2:].columns)

# Features and labels
x = data_input.iloc[:,2:]
y = gain_df.iloc[:,0]

## Convert the categorical variables to number
LabelEncoder_gender = LabelEncoder()
y = LabelEncoder_gender.fit_transform(y)

# convert array to series 
y = pd.Series(y,name='Label')
df = pd.concat([x,y],axis=1)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29 entries, 0 to 28
Data columns (total 84 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   B1_1Deg         29 non-null     float64
 1   B1_1Deg_err     29 non-null     float64
 2   C1_1Deg         29 non-null     float64
 3   C1_1Deg_err     29 non-null     float64
 4   1_Deg_ResSoS_1  29 non-null     float64
 5   1Deg_AdjR2_1    29 non-null     float64
 6   A2_1Deg         29 non-null     float64
 7   A2_1Deg_err     29 non-null     float64
 8   B2_1Deg         29 non-null     float64
 9   B2_1Deg_err     29 non-null     float64
 10  C2_1Deg         29 non-null     float64
 11  C2_1Deg_err     29 non-null     float64
 12  1_Deg_ResSoS_2  29 non-null     float64
 13  1Deg_AdjR2_2    29 non-null     float64
 14  A1_2Deg         29 non-null     float64
 15  A1_2Deg_err     29 non-null     float64
 16  B1_2Deg         29 non-null     float64
 17  B1_2Deg_err     29 non-null     float

In [32]:
#Check NaN
df.isnull().values.any()

True

In [33]:
# Filling nan with mean values 
for col in df:
    df[col].fillna(df[col].mean(),inplace=True)

In [34]:
#Check NaN again
df.isnull().values.any()

False

In [36]:
df.shape

(29, 84)

In [37]:
# Remove features with low variance (thrshold <= 0.05)
df_var = df.var()
df_feat = list(df_var[df_var <= 0.05].index)
df.drop(columns=df_feat, inplace=True)

In [38]:
df.shape

(29, 84)

In [41]:
# Remove features with correlation(threshold > 0.95)
corr_matrix = df.corr().abs()
mask = np.triu(np.ones_like(corr_matrix,dtype=bool))
tri_df = corr_matrix.mask(mask)
to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.95)]
data = df.drop(to_drop,axis=1)