Predictive models

In [107]:
import pandas as pd
# Load ABIDE data
url = 'https://github.com/jcbeer/predictiveViz/raw/master/data/abide_5252.csv'
df = pd.read_csv(url, index_col=False, header=None)
# first column is ID
# second column is Social Responsiveness Scale Score
# remaining columns are voxel level data
print(df.head(5))
df.shape

      0     1       2       3       4       5       6       7       8     \
0  50433.0   5.0 -0.0621 -0.0703  0.1083  0.1345 -0.0365  0.0839  0.0589   
1  50434.0  22.0  0.1438 -0.0196  0.0686  0.1574 -0.0495 -0.3282 -0.1005   
2  50435.0   6.0  0.1437 -0.0697  0.0613  0.1556  0.0228 -0.0978 -0.2464   
3  50436.0   9.0  0.2942  0.1377  0.1333  0.2336  0.1447  0.3166  0.1992   
4  50441.0  31.0  0.2639 -0.1567 -0.0040  0.3274  0.0209  0.0368 -0.1008   

     9      ...      5244    5245    5246    5247    5248    5249    5250  \
0  0.1294   ...    0.0027  0.0989  0.2442  0.2974  0.2802  0.3082  0.0073   
1 -0.2973   ...    0.2642  0.2452  0.2902  0.3234  0.0482  0.1741  0.3826   
2 -0.0511   ...    0.2201  0.1599 -0.0269 -0.0595 -0.0038  0.1384  0.1759   
3  0.1069   ...    0.4682  0.3324  0.0662  0.2098  0.2361  0.1049  0.0885   
4  0.0468   ...    0.2101 -0.1521 -0.0809 -0.1130  0.1173  0.1127  0.0463   

     5251    5252    5253  
0 -0.0596  0.1995  0.0153  
1 -0.0497 -0.1187 -0.024

(219, 5254)

In [108]:
total_n = df.shape[0]

In [109]:
# reorganize data
subid = df.iloc[:,0].astype(int)
y = df.iloc[:,1]
X = df.iloc[:,2:]

In [113]:
# randomly split into training and test
import math
# set proportion that is test set
test_proportion = 0.2
# set number that is test set
test_n = math.ceil(0.2*df.shape[0])
test_n
# randomly select test set
import numpy as np
# set random seed
np.random.seed(seed=1)
# divide indicies into training and test
test_ids = np.random.choice(total_n, test_n)
train_ids = np.delete(np.arange(df.shape[0] - 1), test_ids)
# divide data into training and test 
subid_train = subid.iloc[train_ids]
subid_test = subid.iloc[test_ids]
y_train = y.iloc[train_ids]
y_test = y.iloc[test_ids]
X_train = X.iloc[train_ids]
X_test = X.iloc[test_ids]

In [119]:
X.shape

(219, 5252)

In [146]:
# center the y data
# based on training set mean
y_train_mean = y_train.mean()
y_train_centered = y_train - y_train_mean
y_test_centered = y_test - y_train_mean
# standardize X data to have column mean zero and SD one
# based on training set column means and SD
from sklearn.preprocessing import StandardScaler
X_sc = StandardScaler()
X_sc.fit(X_train)
X_train_std = X_sc.transform(X_train)
X_test_std = X_sc.transform(X_test)

In [168]:
# Elastic net
from sklearn.linear_model import ElasticNet
regr = ElasticNet(random_state=0)
regr.fit(X_train, y_train_centered)
ElasticNet(alpha=1000, copy_X=True, fit_intercept=False, l1_ratio=0.5,
      max_iter=10000, normalize=False, positive=True, precompute=False,
      random_state=0, selection='cyclic', tol=0.001, warm_start=False)

ElasticNet(alpha=10000, copy_X=True, fit_intercept=False, l1_ratio=0.5,
      max_iter=10000, normalize=False, positive=True, precompute=False,
      random_state=0, selection='cyclic', tol=0.001, warm_start=False)

In [169]:
print(regr.coef_) 
print(regr.intercept_) 
print(regr.predict(X_test)) 

[-0. -0. -0. ...  0. -0.  0.]
-4.648244551748678
[ 10.95665347   9.54530341  11.77434359  -8.31273319  15.88230964
  11.82553492  19.82696129  11.42793642   6.59028228  20.01734938
  -1.32569202   4.0468558    1.10130011  -1.04360459   4.41893686
 -11.41031039  22.20908381 -11.2558007   17.21003679 -11.97606753
   8.85414668 -17.30164514  11.99850943   3.27995618   0.70192586
   1.79318069   1.79318069   7.77317595  13.87182768  -1.75872117
  -8.31273319  -9.75951265  -5.17075994 -11.52161578  -1.11353472
   1.70495439   6.65893909   3.41444849  10.09198474  -5.82302054
  16.95778226   9.60272145  -1.75872117   0.68088331]


In [141]:
# calculate prediction error
pred_y_test = regr.predict(X_test)
print(y_test)
print(pred_y_test)
np.subtract(pred_y_test, y_test + y_train_mean)

37     148.0
140      7.0
72      63.0
137      5.0
203    123.0
133    118.0
79      68.0
192     26.0
144     26.0
129     73.0
204     79.0
71      48.0
134     66.0
25       6.0
178     13.0
20       9.0
101     95.0
146     50.0
212     15.0
139     29.0
156     15.0
157     52.0
142     31.0
50      93.0
68      85.0
215     23.0
215     23.0
96      99.0
86      80.0
141     40.0
137      5.0
7       35.0
63       3.0
61       2.0
22       9.0
57       4.0
1       22.0
128     90.0
60      35.0
209     77.0
8        6.0
216     12.0
141     40.0
115     82.0
Name: 1, dtype: float64
[69.42558003 68.01422997 70.24327014 50.15619337 74.35123619 70.29446148
 78.29588784 69.89686297 65.05920883 78.48627593 57.14323453 62.51578235
 59.57022666 57.42532197 62.88786342 47.05861616 80.67801036 47.21312585
 75.67896334 46.49285903 67.32307323 41.16728141 70.46743599 61.74888273
 59.17085242 60.26210724 60.26210724 66.24210251 72.34075423 56.71020538
 50.15619337 48.70941391 53.29816661 46

37    -137.043347
140      2.545303
72     -51.225656
137    -13.312733
203   -107.117690
133   -106.174465
79     -48.173039
192    -14.572064
144    -19.409718
129    -52.982651
204    -80.325692
71     -43.953144
134    -64.898700
25      -7.043605
178     -8.581063
20     -20.410310
101    -72.790916
146    -61.255801
212      2.210037
139    -40.976068
156     -6.145853
157    -69.301645
142    -19.001491
50     -89.720044
68     -84.298074
215    -21.206819
215    -21.206819
96     -91.226824
86     -66.128172
141    -41.758721
137    -13.312733
7      -44.759513
63      -8.170760
61     -13.521616
22     -10.113535
57      -2.295046
1      -15.341061
128    -86.585552
60     -24.908015
209    -82.823021
8       10.957782
216     -2.397279
141    -41.758721
115    -81.319117
Name: 1, dtype: float64

In [None]:
# Random forest