In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from IPython.display import clear_output

import time

In [4]:
pheno = pd.read_csv(r"feno.txt")
geno = pd.read_csv(r"geno.txt")
pheno_names = ["Cadmium_Chloride", 'Congo_red', 'Cycloheximide', 'Diamide',  'Ethanol', 'Hydroquinone', 'Lithium_Chloride',
              'Maltose', 'Neomycin', 'Tunicamycin', "Galactose", "YNB:ph3"]
pheno_12 = pheno[pheno_names]

In [5]:
import pickle
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 300, stop = 800, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(75, 150, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

  # Create the random grid
random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}
#params = []
results = []
t0 = time.time()
i = 0
for name, y in pheno_12.iteritems():
  i += 1
  clear_output(wait=True)
  print('Fitting fenotype: {} ({} of {}).'.format(name, i, 2))
  geno_c = geno.copy()
  missing_phenos = y[ y.isnull() ].index.values
  geno_c = geno_c.drop(missing_phenos, axis = 0)
  y = y.drop(missing_phenos, axis = 0)
  
  X_train, X_test, y_train, y_test = train_test_split(geno_c, y, test_size=0.15)
  X_train = X_train.drop(columns = ["Unnamed: 0"]).values
  X_test = X_test.drop(columns = ["Unnamed: 0"]).values

  y_train_std = (y_train - np.mean(y_train)) / np.std(y_train)
  y_test_std = (y_test - np.mean(y_train)) / np.std(y_train)

  rf = RandomForestRegressor()
  # Random search of parameters, using 3 fold cross validation, 
  # search across 100 different combinations, and use all available cores
  rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 15, \
                                cv = 3, verbose=10, random_state=42, n_jobs = -1)
  # Fit the random search model
  rf_random.fit(X_train, y_train_std)

  #params.append((pheno_names[i], rf_random.best_params_))
  best_random = rf_random.best_estimator_
  rf_predictions = best_random.predict(X_test)
  r2 = r2_score(y_test_std, rf_predictions)
  mse = mean_squared_error(y_test_std, rf_predictions)

  results.append((name, rf_random.best_params_, r2, mse))
  pickle.dump( results, open( './results.p', 'wb' ) )
t1 = time.time()

Fitting fenotype: YNB:ph3 (12 of 2).
Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done  35 out of  45 | elapsed: 19.4min remaining:  5.6min
[Parallel(n_jobs=-1)]: Done  40 out of  45 | elapsed: 21.2min remaining:  2.6min
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 29.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed: 29.9min finished


In [43]:
pheno_12.shape

(1008, 12)

In [6]:
results

[('Cadmium_Chloride',
  {'n_estimators': 633,
   'min_samples_split': 4,
   'min_samples_leaf': 4,
   'max_features': 'auto',
   'max_depth': 120,
   'bootstrap': False},
  0.6932139419444644,
  0.3046069038401083),
 ('Congo_red',
  {'n_estimators': 411,
   'min_samples_split': 2,
   'min_samples_leaf': 4,
   'max_features': 'sqrt',
   'max_depth': 75,
   'bootstrap': False},
  0.375258670608392,
  0.6363651780568691),
 ('Cycloheximide',
  {'n_estimators': 633,
   'min_samples_split': 2,
   'min_samples_leaf': 3,
   'max_features': 'sqrt',
   'max_depth': 142,
   'bootstrap': False},
  0.47368493690916935,
  0.5440503186252927),
 ('Diamide',
  {'n_estimators': 411,
   'min_samples_split': 6,
   'min_samples_leaf': 2,
   'max_features': 'sqrt',
   'max_depth': 97,
   'bootstrap': False},
  0.3160251487289988,
  0.6943901733474739),
 ('Ethanol',
  {'n_estimators': 300,
   'min_samples_split': 4,
   'min_samples_leaf': 1,
   'max_features': 'sqrt',
   'max_depth': 75,
   'bootstrap': True

In [8]:
pd.DataFrame.from_dict(results)

Unnamed: 0,0,1,2,3
0,Cadmium_Chloride,"{'n_estimators': 633, 'min_samples_split': 4, ...",0.693214,0.304607
1,Congo_red,"{'n_estimators': 411, 'min_samples_split': 2, ...",0.375259,0.636365
2,Cycloheximide,"{'n_estimators': 633, 'min_samples_split': 2, ...",0.473685,0.54405
3,Diamide,"{'n_estimators': 411, 'min_samples_split': 6, ...",0.316025,0.69439
4,Ethanol,"{'n_estimators': 300, 'min_samples_split': 4, ...",0.524683,0.352485
5,Hydroquinone,"{'n_estimators': 522, 'min_samples_split': 2, ...",0.159427,0.683375
6,Lithium_Chloride,"{'n_estimators': 522, 'min_samples_split': 2, ...",0.520215,0.473867
7,Maltose,"{'n_estimators': 355, 'min_samples_split': 2, ...",0.701258,0.3717
8,Neomycin,"{'n_estimators': 633, 'min_samples_split': 2, ...",0.55609,0.423145
9,Tunicamycin,"{'n_estimators': 633, 'min_samples_split': 2, ...",0.40641,0.603238


In [13]:
file = open('results.p',"rb")
r = pickle.load(file)


In [15]:
df = pd.DataFrame(results, columns =['Name', 'Params', 'r2', 'mse'])


In [16]:
df

Unnamed: 0,Name,Params,r2,mse
0,Cadmium_Chloride,"{'n_estimators': 633, 'min_samples_split': 4, ...",0.693214,0.304607
1,Congo_red,"{'n_estimators': 411, 'min_samples_split': 2, ...",0.375259,0.636365
2,Cycloheximide,"{'n_estimators': 633, 'min_samples_split': 2, ...",0.473685,0.54405
3,Diamide,"{'n_estimators': 411, 'min_samples_split': 6, ...",0.316025,0.69439
4,Ethanol,"{'n_estimators': 300, 'min_samples_split': 4, ...",0.524683,0.352485
5,Hydroquinone,"{'n_estimators': 522, 'min_samples_split': 2, ...",0.159427,0.683375
6,Lithium_Chloride,"{'n_estimators': 522, 'min_samples_split': 2, ...",0.520215,0.473867
7,Maltose,"{'n_estimators': 355, 'min_samples_split': 2, ...",0.701258,0.3717
8,Neomycin,"{'n_estimators': 633, 'min_samples_split': 2, ...",0.55609,0.423145
9,Tunicamycin,"{'n_estimators': 633, 'min_samples_split': 2, ...",0.40641,0.603238


In [17]:
df.to_csv('results.csv')