In [4]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from multiprocessing import Pool
from sklearn.kernel_ridge import KernelRidge
from functools import partial
import argparse

In [5]:
# Load the whole dataset (measurement + prediction rows); per the original
# author's note the CSV is already sorted.
dfwh = pd.read_csv('./Training_Data_H2.csv')
dfwh = dfwh.drop(['H2_se', 'Disease'], axis=1)

# The training data occupy the first 1034 rows of the file; keep only those.
N_TRAINING_ROWS = 1034
dfwh = dfwh.iloc[:N_TRAINING_ROWS]
print(dfwh.shape)

# One-hot encode the categorical columns, then drop a fixed set of
# country-of-cohort indicator columns.
# NOTE(review): the reason these seven countries are excluded is not recorded
# in this notebook -- confirm with the author.
dfwh = pd.get_dummies(dfwh)
dfwh = dfwh.drop(['Country_of_cohort_Malaysia', 'Country_of_cohort_India', 'Country_of_cohort_Venezuela', 'Country_of_cohort_Germany', 'Country_of_cohort_Brazil', 'Country_of_cohort_France', 'Country_of_cohort_Poland'], axis=1)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(dfwh.shape)

(1034, 297)
(1034, 346)


In [6]:
# Prepare the regression inputs: column 0 of dfwh is used as the target and
# every remaining column as a feature.
feature_wh = dfwh.iloc[:, 1:].values
value = dfwh.iloc[:, 0].values

# Min-max normalisation of every feature column.
# NOTE(review): the scaler is fitted on all rows *before* reg_split() draws its
# train/test split, so held-out rows influence the scaling (mild data leakage).
# Fitting the scaler on the training fold only would be stricter, but would
# change the reported numbers, so it is left as-is here.
scaler = MinMaxScaler()
feature_scaled = scaler.fit_transform(feature_wh)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(feature_scaled.shape)

(1034, 345)


In [7]:
def reg_split(regressor, feature_scaled, value, seed, test_size=0.2):
    """Fit ``regressor`` on one random train/test split and score it.

    Parameters
    ----------
    regressor : object providing ``fit(X, y)`` and ``predict(X)``.
    feature_scaled : feature matrix handed to ``train_test_split``.
    value : target values aligned with ``feature_scaled``.
    seed : passed as ``random_state`` to ``train_test_split`` so the split
        is reproducible.
    test_size : passed through to ``train_test_split`` (default ``0.2``).

    Returns
    -------
    Entry ``[0, 1]`` of ``np.corrcoef(held-out targets, predictions)``, i.e.
    the correlation between the true and predicted held-out values.
    """
    partitions = train_test_split(
        feature_scaled, value, test_size=test_size, random_state=seed)
    train_x, test_x, train_y, test_y = partitions

    regressor.fit(train_x, train_y)
    predicted = regressor.predict(test_x)

    # Disabled helper kept for reference: writes the fitted model's
    # feature importances / coefficients to a per-seed file.
    # if hasattr(regressor, 'feature_importances_'):
    #     with open('./h2_{}_seed_{}'.format(
    #         str(regressor.__class__).split('.')[-1].split('\'')[0], seed), 'w') as f:
    #         f.write(str(regressor.feature_importances_))
    # if hasattr(regressor, 'coef_'):
    #     with open('./h2_{}_seed_{}'.format(
    #         str(regressor.__class__).split('.')[-1].split('\'')[0], seed), 'w') as f:
    #         f.write(str(regressor.coef_))

    correlation_matrix = np.corrcoef(test_y, predicted)
    return correlation_matrix[0, 1]

In [9]:
def split_wrapper(seed, regressor):
    """Build the regressor named ``regressor`` and score it on one split.

    Parameters
    ----------
    seed : forwarded to ``reg_split`` to seed the train/test split. The
        estimators themselves use a fixed ``random_state=0``, so only the
        split changes from seed to seed.
    regressor : one of ``'gbr'``, ``'rf'``, ``'adrf'``, ``'bq'``, ``'svr'``,
        ``'lasso'``, ``'krr'``, ``'hr'``, ``'ridge'``.

    Returns
    -------
    Whatever ``reg_split`` returns for the module-level ``feature_scaled`` and
    ``value`` arrays (a correlation coefficient).

    Raises
    ------
    KeyError
        If ``regressor`` is not one of the names listed above.
    """
    # Zero-argument factories: only the requested estimator is instantiated,
    # rather than building all nine on each of the repeated calls.
    factories = {
        # gradient boosting
        'gbr': lambda: GradientBoostingRegressor(
            n_estimators=200, learning_rate=0.1, random_state=0),
        # random forest
        'rf': lambda: RandomForestRegressor(max_depth=25, random_state=0),
        # AdaBoost-ed random forest
        # NOTE(review): n_jobs=-1 here runs inside the worker processes that
        # repeat_1000() starts, which may oversubscribe the CPU -- confirm.
        'adrf': lambda: AdaBoostRegressor(
            RandomForestRegressor(max_depth=25, random_state=0, n_jobs=-1),
            n_estimators=200, random_state=0),
        # boosted quantile regression; random_state=0 added so that this model
        # is seeded like the other tree ensembles and gives repeatable results
        'bq': lambda: GradientBoostingRegressor(
            loss='quantile', alpha=0.13,
            n_estimators=200, max_depth=5,
            learning_rate=.2, min_samples_leaf=9,
            min_samples_split=9, random_state=0),
        # support vector regression
        'svr': lambda: SVR(kernel='rbf', gamma=0.01, C=10),
        # kernel ridge regression
        'krr': lambda: KernelRidge(alpha=0.02, kernel='rbf', gamma=0.1),
        # lasso
        'lasso': lambda: linear_model.Lasso(alpha=0.00001),
        # Huber regression
        'hr': lambda: linear_model.HuberRegressor(epsilon=10),
        # ridge
        'ridge': lambda: linear_model.Ridge(alpha=0.1),
    }
    model = factories[regressor]()
    return reg_split(model, feature_scaled, value, seed)

In [10]:
def repeat_1000(regressor, n_repeats=1000):
    """Score ``regressor`` over many seeded train/test splits in parallel.

    Parameters
    ----------
    regressor : model name understood by ``split_wrapper`` (e.g. ``'gbr'``).
    n_repeats : number of splits to evaluate; seeds ``0 .. n_repeats - 1`` are
        used. Defaults to 1000, matching the original behaviour.

    Returns
    -------
    ``numpy.ndarray`` holding one ``split_wrapper`` result per seed, in seed
    order.
    """
    p = Pool()
    try:
        scores = p.map(partial(split_wrapper, regressor=regressor),
                       range(n_repeats))
    finally:
        # Always release the worker processes; without this every call leaves
        # a pool of idle workers behind. try/finally is used instead of
        # ``with Pool()`` because the latter is unavailable on Python 2.
        p.close()
        p.join()
    return np.array(scores)

In [11]:
# Evaluate one model over 1000 repeated runs and print every correlation
# coefficient; the mean value can then be computed from this array.
# Pick one name at a time from:
# {"gbr","rf","adrf","bq","svr","lasso","krr","hr","ridge"}
correlations = repeat_1000('gbr')
print(correlations)

[ 0.85917855  0.8764764   0.87379697  0.86301541  0.89673423  0.88277897
  0.88029269  0.88399208  0.88547249  0.85037128  0.88478705  0.89842267
  0.87860628  0.8790743   0.88703751  0.89874421  0.84887685  0.85572593
  0.87670131  0.87861276  0.89265409  0.88474647  0.86062378  0.8805523
  0.86450166  0.8701716   0.89098615  0.88909336  0.88492894  0.87265892
  0.87285418  0.87003371  0.84189427  0.84833412  0.87098593  0.8632692
  0.84719265  0.86138413  0.8613933   0.85620456  0.86331405  0.8653829
  0.86771975  0.87684961  0.89454406  0.89394174  0.89112233  0.91003234
  0.85912955  0.88340098  0.88116712  0.86875233  0.87740777  0.90542012
  0.88687994  0.88601204  0.86684338  0.89522334  0.87834242  0.87951559
  0.87927408  0.84024173  0.89083692  0.84256823  0.87278887  0.86094239
  0.87103613  0.86230991  0.85512578  0.89890575  0.84999799  0.90134613
  0.84190167  0.83812938  0.87269122  0.86579616  0.88059647  0.85220424
  0.84230806  0.89877812  0.85227488  0.87385465  0.87