Predictive models

In [1]:
import pandas as pd
# Load ABIDE data
url = 'https://github.com/jcbeer/predictiveViz/raw/master/data/abide_5252.csv'
df = pd.read_csv(url, index_col=False, header=None)
# first column is ID
# second column is Social Responsiveness Scale Score
# remaining columns are voxel level data
print(df.head(5))
df.shape

      0     1       2       3       4       5       6       7       8     \
0  50433.0   5.0 -0.0621 -0.0703  0.1083  0.1345 -0.0365  0.0839  0.0589   
1  50434.0  22.0  0.1438 -0.0196  0.0686  0.1574 -0.0495 -0.3282 -0.1005   
2  50435.0   6.0  0.1437 -0.0697  0.0613  0.1556  0.0228 -0.0978 -0.2464   
3  50436.0   9.0  0.2942  0.1377  0.1333  0.2336  0.1447  0.3166  0.1992   
4  50441.0  31.0  0.2639 -0.1567 -0.0040  0.3274  0.0209  0.0368 -0.1008   

     9      ...      5244    5245    5246    5247    5248    5249    5250  \
0  0.1294   ...    0.0027  0.0989  0.2442  0.2974  0.2802  0.3082  0.0073   
1 -0.2973   ...    0.2642  0.2452  0.2902  0.3234  0.0482  0.1741  0.3826   
2 -0.0511   ...    0.2201  0.1599 -0.0269 -0.0595 -0.0038  0.1384  0.1759   
3  0.1069   ...    0.4682  0.3324  0.0662  0.2098  0.2361  0.1049  0.0885   
4  0.0468   ...    0.2101 -0.1521 -0.0809 -0.1130  0.1173  0.1127  0.0463   

     5251    5252    5253  
0 -0.0596  0.1995  0.0153  
1 -0.0497 -0.1187 -0.024

(219, 5254)

In [3]:
# reorganize data
subid = df.iloc[:,0].astype(int)
y = df.iloc[:,1]
X = df.iloc[:,2:]

In [4]:
# randomly split into training and test
import math
# set proportion that is test set
test_proportion = 0.2
# set number that is test set
total_n = df.shape[0]
test_n = math.ceil(test_proportion*total_n)
test_n
# randomly select test set
import numpy as np
# set random seed
np.random.seed(seed=1)
# divide indicies into training and test
test_ids = np.random.choice(total_n, test_n)
train_ids = np.delete(np.arange(df.shape[0] - 1), test_ids)
# divide data into training and test 
subid_train = subid.iloc[train_ids]
subid_test = subid.iloc[test_ids]
y_train = y.iloc[train_ids]
y_test = y.iloc[test_ids]
X_train = X.iloc[train_ids]
X_test = X.iloc[test_ids]

In [5]:
X.shape

(219, 5252)

In [9]:
# center the y data
# based on training set mean
y_train_mean = y_train.mean()
y_train_centered = y_train - y_train_mean
y_test_centered = y_test - y_train_mean
# standardize X data to have column mean zero and SD one
# based on training set column means and SD
from sklearn.preprocessing import StandardScaler
X_sc = StandardScaler()
X_sc.fit(X_train)
X_train_std = X_sc.transform(X_train)
X_test_std = X_sc.transform(X_test)

In [10]:
y_train_centered.mean()

3.853791109207323e-15

In [38]:
# Elastic net
from sklearn.linear_model import ElasticNet
regr = ElasticNet(random_state=1)
regr.fit(X_train_std, y_train_centered)
ElasticNet(alpha=1, copy_X=True, fit_intercept=False, l1_ratio=0.25,
      max_iter=10000, normalize=True, positive=False, precompute=False,
      random_state=1, selection='random', tol=0.0001, warm_start=False)



ElasticNet(alpha=1, copy_X=True, fit_intercept=False, l1_ratio=0.25,
      max_iter=10000, normalize=True, positive=False, precompute=False,
      random_state=1, selection='random', tol=0.0001, warm_start=False)

In [36]:
print(regr.coef_) 
print(regr.intercept_) 
print(regr.predict(X_test)) 

[-0.         -0.         -0.32648086 ...  0.         -0.
  0.        ]
-1.539054097540146e-15
[ 6.67098822e+00  3.47545679e+00  6.73786818e+00 -2.32608616e+00
  1.82262783e+00  6.88173486e+00  3.58711639e+00  5.88815993e+00
  4.90229615e+00  6.34767186e+00 -2.66524217e+00 -3.93584162e+00
  4.24058155e+00  1.36031644e+00 -1.41363287e-03 -4.05306832e+00
  8.05412566e+00 -2.40636594e+00  4.08642025e+00 -5.33699395e+00
  1.98528649e+00 -6.06252898e+00  1.31314336e+00  2.28800117e+00
  2.77167188e+00 -3.54981580e+00 -3.54981580e+00  3.34884019e+00
  2.96823500e+00 -1.92721794e+00 -2.32608616e+00 -3.24842495e+00
 -3.21713929e+00 -7.06592908e+00 -2.12451305e+00 -1.87722741e+00
  2.57906809e+00  2.84771564e+00  9.25876078e-01  7.65459380e-01
  7.60057152e+00  2.28064872e+00 -1.92721794e+00  1.23836622e+00]


In [31]:
# calculate prediction error
pred_y_test = regr.predict(X_test)
print(y_test)
print(pred_y_test + y_train_mean)
np.subtract(pred_y_test, y_test_centered)

37     148.0
140      7.0
72      63.0
137      5.0
203    123.0
133    118.0
79      68.0
192     26.0
144     26.0
129     73.0
204     79.0
71      48.0
134     66.0
25       6.0
178     13.0
20       9.0
101     95.0
146     50.0
212     15.0
139     29.0
156     15.0
157     52.0
142     31.0
50      93.0
68      85.0
215     23.0
215     23.0
96      99.0
86      80.0
141     40.0
137      5.0
7       35.0
63       3.0
61       2.0
22       9.0
57       4.0
1       22.0
128     90.0
60      35.0
209     77.0
8        6.0
216     12.0
141     40.0
115     82.0
Name: 1, dtype: float64
[69.42558003 68.01422997 70.24327014 50.15619337 74.35123619 70.29446148
 78.29588784 69.89686297 65.05920883 78.48627593 57.14323453 62.51578235
 59.57022666 57.42532197 62.88786342 47.05861616 80.67801036 47.21312585
 75.67896334 46.49285903 67.32307323 41.16728141 70.46743599 61.74888273
 59.17085242 60.26210724 60.26210724 66.24210251 72.34075423 56.71020538
 50.15619337 48.70941391 53.29816661 46

37    -78.574420
140    61.014230
72      7.243270
137    45.156193
203   -48.648764
133   -47.705539
79     10.295888
192    43.896863
144    39.059209
129     5.486276
204   -21.856765
71     14.515782
134    -6.429773
25     51.425322
178    49.887863
20     38.058616
101   -14.321990
146    -2.786874
212    60.678963
139    17.492859
156    52.323073
157   -10.832719
142    39.467436
50    -31.251117
68    -25.829148
215    37.262107
215    37.262107
96    -32.757897
86     -7.659246
141    16.710205
137    45.156193
7      13.709414
63     50.298167
61     44.947311
22     48.355392
57     56.173881
1      43.127866
128   -28.116625
60     33.560911
209   -24.354094
8      69.426709
216    56.071648
141    16.710205
115   -22.850190
Name: 1, dtype: float64

In [None]:
# Random forest