In [1]:
import pandas as pd
import numpy as np
import sklearn
#import tensorflow
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.model_selection import KFold
#import tflearn
from sklearn.svm import SVR
from multiprocessing import Pool
from sklearn.kernel_ridge import KernelRidge
from functools import partial

In [2]:
# Load the training table (woH2b), discard the columns that are not used as
# model inputs, and one-hot encode every remaining categorical column.
df = pd.get_dummies(
    pd.read_csv('./Training_Data_Corr.csv')
      .drop(['corr_SD', 'Disease1', 'Disease2'], axis=1)
)
# Last expression of the cell: shows (n_rows, n_columns) after encoding.
df.shape

(5584, 497)

In [4]:
# Split the encoded frame into the regression target (first column) and the
# feature matrix (every remaining column).
# NOTE(review): assumes the target is still column 0 after pd.get_dummies --
# confirm against the CSV layout.
feature = df.iloc[:, 1:].values
value = df.iloc[:, 0].values

# Min-max normalisation of every feature column.
# NOTE(review): the scaler is fitted on the full data set before any
# train/test split is made, so held-out rows influence the scaling ranges.
scaler = MinMaxScaler()
feature_scaled = scaler.fit_transform(feature)
# print() with a single argument behaves the same on Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(feature_scaled.shape)

(5584, 496)


In [5]:
def reg_split(regressor, feature_scaled, value, seed, test_size=0.2):
    """Score ``regressor`` on a single random train/test split.

    Parameters
    ----------
    regressor : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training part of the split.
    feature_scaled : array-like
        Feature matrix handed to ``train_test_split``.
    value : array-like
        Target values aligned with ``feature_scaled``.
    seed : int
        ``random_state`` of the split, so each seed gives a reproducible
        partition.
    test_size : float, optional
        Forwarded to ``train_test_split`` (default 0.2).

    Returns
    -------
    float
        Correlation coefficient (``np.corrcoef``) between the held-out
        targets and the predictions made for them.
    """
    split = train_test_split(
        feature_scaled, value, test_size=test_size, random_state=seed)
    X_fit, X_holdout, y_fit, y_holdout = split

    regressor.fit(X_fit, y_fit)
    y_hat = regressor.predict(X_holdout)

    # Optional debugging dump (disabled): writes feature importances or
    # coefficients plus the real / predicted targets to per-seed files.
#     if hasattr(regressor, 'feature_importances_'):
#         with open('./corr_{}_seed_{}'.format(
#             str(regressor.__class__).split('.')[-1].split('\'')[0], seed), 'w') as f:
#             f.write(str(regressor.feature_importances_))
#     if hasattr(regressor, 'coef_'):
#         with open('./corr_{}_seed_{}'.format(
#             str(regressor.__class__).split('.')[-1].split('\'')[0], seed), 'w') as f:
#             f.write(str(regressor.coef_))
#     with open('./corr_{}_seed_{}_real'.format(
#             str(regressor.__class__).split('.')[-1].split('\'')[0], seed), 'w') as f:
#             f.write(str(list(Ytest)))
#     with open('./corr_{}_seed_{}_pred'.format(
#             str(regressor.__class__).split('.')[-1].split('\'')[0], seed), 'w') as f:
#             f.write(str(list(Y_pred)))

    # Off-diagonal entry of the 2x2 correlation matrix.
    corr_matrix = np.corrcoef(y_holdout, y_hat)
    return corr_matrix[0, 1]

In [6]:
def split_wrapper(seed, regressor):
    """Build the regressor registered under ``regressor`` and score one split.

    ``seed`` comes first so the function can be mapped over a range of seeds
    with ``functools.partial(split_wrapper, regressor=...)``.

    Parameters
    ----------
    seed : int
        Random state of the train/test split performed by ``reg_split``.
    regressor : str
        One of ``'gbr'``, ``'rf'``, ``'adrf'``, ``'bq'``, ``'svr'``,
        ``'krr'``, ``'lasso'``, ``'hr'``, ``'ridge'``.

    Returns
    -------
    float
        Whatever ``reg_split`` returns for the module-level
        ``feature_scaled`` / ``value`` arrays and this seed.

    Raises
    ------
    KeyError
        If ``regressor`` is not a registered key.
    """
    # Factories rather than instances: only the requested estimator is
    # constructed, instead of all nine on every call.
    factories = {
        # gradient boosting
        'gbr': lambda: GradientBoostingRegressor(
            n_estimators=200, learning_rate=0.1, random_state=0),
        # random forest
        'rf': lambda: RandomForestRegressor(max_depth=25, random_state=0),
        # AdaBoost-ed random forest
        'adrf': lambda: AdaBoostRegressor(
            RandomForestRegressor(max_depth=25, random_state=0, n_jobs=-1),
            n_estimators=200, random_state=0),
        # boosted quantile regression
        'bq': lambda: GradientBoostingRegressor(
            loss='quantile', alpha=0.13,
            n_estimators=200, max_depth=5,
            learning_rate=.2, min_samples_leaf=9,
            min_samples_split=9, random_state=0),
        # support vector regression
        'svr': lambda: SVR(kernel='rbf', gamma=0.13, C=10),
        # kernel ridge regression
        'krr': lambda: KernelRidge(alpha=0.5, kernel='rbf', gamma=0.1),
        # lasso -- previously constructed but missing from the lookup table,
        # so requesting 'lasso' failed with a KeyError
        'lasso': lambda: linear_model.Lasso(alpha=0.00001),
        # Huber regression
        'hr': lambda: linear_model.HuberRegressor(epsilon=10),
        # ridge
        'ridge': lambda: linear_model.Ridge(alpha=0.1),
    }
    if regressor not in factories:
        # Same exception type as the plain dict lookup, with a usable message.
        raise KeyError('unknown regressor {!r}; choose one of {}'.format(
            regressor, sorted(factories)))
    return reg_split(factories[regressor](), feature_scaled, value, seed)

In [7]:
def repeat_1000(regressor, n_repeats=1000):
    """Score ``regressor`` on ``n_repeats`` random splits in parallel.

    Seeds ``0 .. n_repeats - 1`` are mapped over a ``multiprocessing.Pool``;
    each worker calls ``split_wrapper(seed, regressor=regressor)``.

    Parameters
    ----------
    regressor : str
        Regressor key understood by ``split_wrapper``.
    n_repeats : int, optional
        Number of seeds / splits to evaluate (default 1000).

    Returns
    -------
    numpy.ndarray
        One score per seed, in seed order.
    """
    pool = Pool()
    try:
        scores = pool.map(partial(split_wrapper, regressor=regressor),
                          range(n_repeats))
    finally:
        # Always release the worker processes; without this every call leaves
        # a full pool of idle workers behind. try/finally is used instead of
        # `with Pool()` because the latter is unavailable on Python 2.
        pool.terminate()
        pool.join()
    return np.array(scores)

In [8]:
# Pick one regressor name at a time from
# {"gbr","rf","adrf","bq","svr","lasso","krr","hr","ridge"}.
# The printed array holds the correlation coefficients of the 1000 repeated
# runs, from which the mean value can be computed.
corr_coefs = repeat_1000('gbr')
print(corr_coefs)

[ 0.88020002  0.86657003  0.85990833  0.87814604  0.88501171  0.8863638
  0.85838448  0.84399948  0.8632723   0.89265895  0.88615922  0.86357819
  0.88890976  0.84555033  0.86556103  0.90258385  0.86165723  0.88791671
  0.86395912  0.86115074  0.8641894   0.89345659  0.86040079  0.86950166
  0.88305749  0.87774187  0.91202547  0.87986616  0.84495147  0.89411804
  0.88819761  0.87943409  0.88732982  0.84989517  0.86822123  0.88271033
  0.85323641  0.89695257  0.83022067  0.84774173  0.84862432  0.88437987
  0.89234936  0.86069045  0.8804212   0.87496727  0.87871225  0.88807752
  0.88407792  0.85389324  0.89201828  0.8789199   0.88313466  0.87059078
  0.88202284  0.87730122  0.89043752  0.88723369  0.87769148  0.89582398
  0.8665267   0.88156284  0.84497097  0.89653309  0.90529052  0.89405755
  0.8888627   0.83674596  0.9059838   0.88401048  0.90557887  0.90174616
  0.88492941  0.87286611  0.89607441  0.87220034  0.86157397  0.87772677
  0.86696351  0.87810048  0.86115117  0.87241446  0.