In [1]:
import pandas as pd
import statistics
from sklearn import preprocessing
from sklearn.utils import shuffle
from sklearn.dummy import DummyRegressor
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score

In [2]:
####### get feature table for ml
###### half-life values in log phase for leaderless transcript

hl_c4_raw = pd.read_csv("../../feature/FeatureTables/featureTable_combinedSelected_halfLifeValues/HalfLifeVals_logPhase_combinedSelected_leaderless_byLeaderlessLogPhase_jointlyByLogPhaseHypoxia.csv", header = 0)

hl_c4 = hl_c4_raw.iloc[:, 2:]
hl_c4 = hl_c4.dropna()
X_c4 = hl_c4.iloc[:, :-1]
y_c4 = hl_c4.HalfLife_vals_logPhase

print(X_c4.shape)
print(X_c4.shape[1])
print(y_c4.shape)
# X_c4.head()
# y_c4.value_counts()

(866, 244)
244
(866,)


In [3]:
####### K-Folds split

def kf(X, y, n_fold):
    train_kf_idex = []
    test_kf_idex = []
    kf = KFold(n_splits = n_fold, shuffle = True)

    for train_index, test_index in kf.split(X, y):
        train_kf_idex.append(train_index)
        test_kf_idex.append(test_index)
    return train_kf_idex, test_kf_idex

In [4]:
####### k fold cross-validation

def cv_training(X, y, n_fold):
    
    train_idex, test_idex = kf(X, y, n_fold)
    
    r2 = {}
    for regr in ['MeanRegression', 'LinearRegression', 'LassoRegression', 'SupportVector', 'DecisionTree',
                'RandomForest']:
        r2[regr] = []
    
    for i in range(n_fold):
        X_train = X.iloc[train_idex[i]]
        y_train = y.iloc[train_idex[i]]
        X_test = X.iloc[test_idex[i]]
        y_test = y.iloc[test_idex[i]]
        
        std_scaler = preprocessing.StandardScaler()
        X_train_std = std_scaler.fit_transform(X_train)
        X_test_std = std_scaler.transform(X_test)
        
        regr_mean = DummyRegressor(strategy = 'mean')
        regr_mean.fit(X_train, y_train)
        y_pred_strat = regr_mean.predict(X_test)
        r2_mean = r2_score(y_test, y_pred_strat, multioutput = 'uniform_average')
        r2['MeanRegression'].append(r2_mean)
        
        regr_liner = LinearRegression()
        regr_liner.fit(X_train_std, y_train)
        y_pred_liner = regr_liner.predict(X_test_std)
        r2_liner = r2_score(y_test, y_pred_liner, multioutput = 'uniform_average')
        r2['LinearRegression'].append(r2_liner)
        
        regr_lasso = linear_model.Lasso(alpha=0.1)
        regr_lasso.fit(X_train_std, y_train)
        y_pred_lasso = regr_lasso.predict(X_test_std)
        r2_lasso = r2_score(y_test, y_pred_liner, multioutput = 'uniform_average')
        r2['LassoRegression'].append(r2_lasso)

        regr_svm = svm.SVR(kernel = 'rbf')
        regr_svm.fit(X_train_std, y_train)
        y_pred_svm = regr_svm.predict(X_test_std)
        r2_svm = r2_score(y_test, y_pred_svm, multioutput = 'uniform_average')
        r2['SupportVector'].append(r2_svm)
        
        regr_dt = DecisionTreeRegressor()
        regr_dt.fit(X_train, y_train)
        y_pred_dt = regr_dt.predict(X_test)
        r2_dt = r2_score(y_test, y_pred_dt, multioutput = 'uniform_average')
        r2['DecisionTree'].append(r2_dt)
        
        regr_rf = RandomForestRegressor(n_jobs = -1)
        regr_rf.fit(X_train, y_train)
        y_pred_rf = regr_rf.predict(X_test)
        r2_rf = r2_score(y_test, y_pred_rf, multioutput = 'uniform_average')
        r2['RandomForest'].append(r2_rf)
        
    return r2

In [5]:
####### repeated runs

def repeat_cv_training(X, y, n_fold, n_times):
    
    r2_repeat = {}
    for regr in ['MeanRegression', 'LinearRegression', 'LassoRegression', 'SupportVector', 'DecisionTree',
                'RandomForest']:
        r2_repeat[regr] = []
        
    for i in range(n_times):
        print('training: ' + str(i))
        cv_r2_i = cv_training(X, y, n_fold)
        for regr, r2 in cv_r2_i.items():
            r2_repeat[regr].append(r2)
            
    return r2_repeat

In [6]:
####### test with single run

cv_r2 = cv_training(X_c4, y_c4, 5)
for regr, r2 in cv_r2.items():
    print("%s\t%.4f\t%.4f" % (regr, statistics.mean(r2), statistics.stdev(r2)))

MeanRegression	-0.0161	0.0218
LinearRegression	-2.1522	3.8846
LassoRegression	-2.1522	3.8846
SupportVector	0.0225	0.0483
DecisionTree	-0.9821	0.5195
RandomForest	0.0117	0.0747


In [7]:
####### running 5 fold cross-validation, repeated 10 times

cv_r2_repeat = repeat_cv_training(X_c4, y_c4, 5, 10)
# print(cv_r2_repeat)

training: 0
training: 1
training: 2
training: 3
training: 4
training: 5
training: 6
training: 7
training: 8
training: 9


In [8]:
####### get averaged R2 for each regressor

for regr, r2 in cv_r2_repeat.items():
    r2_flat = [r2_ij for r2_i in r2 for r2_ij in r2_i]
    print(regr, statistics.mean(r2_flat), min(r2_flat), max(r2_flat), statistics.stdev(r2_flat))

MeanRegression -0.006004589019691484 -0.04323276734400605 -2.686184587874507e-06 0.009603439106821892
LinearRegression -1.0215788714708348e+21 -5.1078943573541745e+22 -0.13053388118763887 7.223653475339278e+21
LassoRegression -1.0215788714708348e+21 -5.1078943573541745e+22 -0.13053388118763887 7.223653475339278e+21
SupportVector 0.04517753667011942 -0.042628838555059945 0.11749340372502826 0.03991898994398041
DecisionTree -0.9355039926557028 -1.9291627773947781 -0.13598075232997942 0.3955596473547914
RandomForest 0.053645491670201106 -0.09611116280184984 0.14260280006072257 0.05286778511303574
