In [5]:
import pandas as pd
import numpy as np

df = pd.read_csv("clean_data/final_product.csv")

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import math

X = df.drop('OilPeakRate', axis=1)
y = df['OilPeakRate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'max_depth': [10, 20, 30, 40],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

dtree = DecisionTreeRegressor()

grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=3, n_jobs=20, verbose=2)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

best_grid = grid_search.best_estimator_

y_pred = best_grid.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print('Root Mean Squared Error:', math.sqrt(mse))


Fitting 3 folds for each of 36 candidates, totalling 108 fits
{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Root Mean Squared Error: 115.18827159857402


In [7]:
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, 40],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestRegressor()

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=20, verbose=2)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

best_grid = grid_search.best_estimator_

y_pred = best_grid.predict(X_test)

mse = mean_squared_error(y_test, y_pred)

print('Root Mean Squared Error:', math.sqrt(mse))

[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=2; total time=   0.3s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2; total time=   0.5s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5; total time=   0.4s
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=5; total time=   0.4s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5; total time=   0.5s
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=10; total time=   0.4s
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=10; total time=   0.4s
[CV] END max_depth=40, min_samples_leaf=1, min_samples_split=10; total time=   0.4s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=10; total time=   0.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5; total time=   0.3s

Fitting 3 folds for each of 144 candidates, totalling 432 fits
{'max_depth': 40, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 400}
Root Mean Squared Error: 99.1705978009789
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time= 1.6min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time= 2.1min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time= 1.1min
[CV] END max_depth=40, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time= 1.7min
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  51.8s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.1min
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time= 1.9min
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 1.0min

[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  27.0s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time= 1.5min
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 1.0min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time= 1.1min
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time= 2.6min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  28.1s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=400; total time= 1.5min
[CV] END max_depth=20, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time= 2.0min
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=  33.4s
[CV] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total t

[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.0min
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time= 2.1min
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time= 1.6min
[CV] END max_depth=40, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  34.6s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.7min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time= 1.4min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time= 2.0min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.1min
[CV] END max_depth=40, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time= 2.4min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total ti

[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  26.8s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  57.8s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.9min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 1.5min
[CV] END max_depth=40, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  29.5s
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=100; total time=  29.9s
[CV] END max_depth=40, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  27.7s
[CV] END max_depth=40, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=  55.7s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  58.0s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total t

[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.5min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.8min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time= 2.0min
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time= 2.3min
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  20.3s
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time= 1.4min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=10, n_estimators=300; total time= 1.9min
[CV] END max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time= 2.1min
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=10, n_estimators=400; total time= 1.8min
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total t

[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time= 1.4min
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time= 2.5min
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time= 1.5min
[CV] END max_depth=40, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time= 1.0min
[CV] END max_depth=40, min_samples_leaf=4, min_samples_split=10, n_estimators=400; total time= 1.7min
[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=  57.0s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  37.3s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=5, n_estimators=400; total time= 2.2min
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=100; total time=  33.0s
[CV] END max_depth=30, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total t