In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

from IPython.display import clear_output

import time

In [2]:
# Phenotype columns that are modelled further down in the notebook.
pheno_names = ["Galactose", "YNB:ph3"]

# Read the phenotype and genotype tables from the working directory.
pheno = pd.read_csv("feno.txt")
geno = pd.read_csv("geno.txt")

# Keep only the selected phenotype columns.
pheno_12 = pheno.loc[:, pheno_names]

In [0]:
# Randomized hyper-parameter search for a random-forest regressor, run once per
# phenotype column of `pheno_12`. For every phenotype the rows with a missing
# value are dropped, the data is split 85/15 into train/test, the target is
# standardised with the TRAINING mean/std, and the best model found by the
# search is scored on the held-out test set.
#
# Produces:
#   results : list of (phenotype name, best params dict, test R^2, test MSE)
#   t0, t1  : wall-clock timestamps taken before and after the whole loop

# Single seed shared by the split, the forest and the search so that a
# Restart & Run All reproduces the same numbers.
SEED = 42

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=300, stop=800, num=10)]
# Number of features to consider at every split.
# 1.0 means "all features"; it replaces 'auto', which had that meaning for
# regressors and was removed in scikit-learn 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(x) for x in np.linspace(75, 150, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# The "Unnamed: 0" column is not a predictor; drop it once, outside the loop.
geno_features = geno.drop(columns=["Unnamed: 0"])
n_phenos = pheno_12.shape[1]

results = []
t0 = time.time()
# `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
for i, (name, y) in enumerate(pheno_12.items(), start=1):
  clear_output(wait=True)
  print('Fitting fenotype: {} ({} of {}).'.format(name, i, n_phenos))

  # Drop the samples whose phenotype is missing from both X and y.
  # NOTE(review): this relies on `geno` and `pheno_12` sharing the same row index.
  missing_phenos = y[y.isnull()].index.values
  geno_c = geno_features.drop(missing_phenos, axis=0)
  y = y.drop(missing_phenos, axis=0)

  X_train, X_test, y_train, y_test = train_test_split(
      geno_c, y, test_size=0.15, random_state=SEED)
  X_train = X_train.values
  X_test = X_test.values

  # Standardise the target using training statistics only, so no information
  # from the test set leaks into the scaling.
  y_mean = np.mean(y_train)
  y_std = np.std(y_train)
  y_train_std = (y_train - y_mean) / y_std
  y_test_std = (y_test - y_mean) / y_std

  rf = RandomForestRegressor(random_state=SEED)
  # Random search of parameters, using 3 fold cross validation,
  # search across 15 sampled combinations, and use all available cores
  rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                 n_iter=15, cv=3, verbose=10,
                                 random_state=SEED, n_jobs=-1)
  # Fit the random search model
  rf_random.fit(X_train, y_train_std)

  # Evaluate the refitted best estimator on the held-out test set.
  best_random = rf_random.best_estimator_
  rf_predictions = best_random.predict(X_test)
  r2 = r2_score(y_test_std, rf_predictions)
  mse = mean_squared_error(y_test_std, rf_predictions)

  # Label the result with the column name yielded by the loop; indexing
  # `pheno_names` with the 1-based counter picked the wrong name and ran past
  # the end of the list on the last phenotype.
  results.append((name, rf_random.best_params_, r2, mse))

t1 = time.time()

Fitting fenotype: Cadmium_Chloride (1 of 2).
Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.7s


In [43]:
# Sanity check: display the (rows, columns) shape of the selected phenotype table.
# NOTE(review): the recorded output shows 12 columns, but `pheno_names` selects
# only two -- the saved output looks stale; re-run on a fresh kernel to confirm.
pheno_12.shape

(1008, 12)