In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split

In [4]:
# Load the startup dataset from the working directory and preview its first five rows.
df = pd.read_csv("50_Startups.csv")
df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,165349.2,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [25]:
# Separate the regression target ("Profit") from the remaining predictor columns.
y = df["Profit"]
X = df.drop(columns=["Profit"])

In [31]:
# Seed NumPy's global RNG before building the forest, which is created without
# a random_state of its own.
np.random.seed(42)
model = RandomForestRegressor(n_jobs=-1)
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
# Display the training shapes as a quick sanity check of the split.
X_train.shape, y_train.shape

((40, 3), (40,))

In [32]:
# Train the baseline forest on the training split.
model.fit(X=X_train, y=y_train)

In [33]:
# Score the fitted baseline forest on the held-out split.
model.score(X=X_test, y=y_test)

0.9103164738430438

In [36]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def rmsle(y_test, y_preds):
    """Return the root mean squared log error between targets and predictions.

    Computed as the square root of scikit-learn's ``mean_squared_log_error``
    for the given true values (``y_test``) and predicted values (``y_preds``).
    """
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

def show_scores(model):
    """Evaluate a fitted model on the notebook's training and held-out splits.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``
    and returns a dict with MAE, RMSLE and ``model.score`` (labelled R^2) for
    each split.
    """
    # Predict once per split; both MAE and RMSLE reuse these predictions.
    fitted = model.predict(X_train)
    held_out = model.predict(X_test)

    report = {}
    report["Training MAE"] = mean_absolute_error(y_train, fitted)
    report["Valid MAE"] = mean_absolute_error(y_test, held_out)
    report["Training RMSLE"] = rmsle(y_train, fitted)
    report["Valid RMSLE"] = rmsle(y_test, held_out)
    report["Training R^2"] = model.score(X_train, y_train)
    report["Valid R^2"] = model.score(X_test, y_test)
    return report

In [37]:
# Refit the baseline forest and report its metrics; fit() hands back the
# estimator itself, so the result can be passed straight to show_scores.
show_scores(model.fit(X_train, y_train))

{'Training MAE': 2949.7971725000034,
 'Valid MAE': 6245.53884999997,
 'Training RMSLE': 0.11952233238027139,
 'Valid RMSLE': 0.14245221197083813,
 'Training R^2': 0.9905817650561102,
 'Valid R^2': 0.9078468501723597}

In [42]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameter values the randomized search samples from.
rf_grid = {"n_estimators" : np.arange(10, 100, 10),
           "max_depth" : [None, 3, 5],
           "min_samples_split" : np.arange(2, 20, 2),
           "min_samples_leaf" : np.arange(1, 20, 2),
           "max_features" : [0.5, 1.0, "sqrt"]
}
# random_state is pinned on both the estimator and the search so that the sampled
# candidates and the fitted forests do not depend on the global NumPy RNG state,
# i.e. on which cells happened to run (or re-run) before this one.
rs_model = RandomizedSearchCV(RandomForestRegressor(random_state = 42),
                              param_distributions = rf_grid,
                              n_iter = 10,
                              cv = 5,
                              verbose = True,
                              random_state = 42)
# Evaluate 10 sampled candidates with 5-fold cross-validation on the training split.
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [44]:
# Display the hyperparameter combination the randomized search selected as best.
rs_model.best_params_

{'n_estimators': 70,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_features': 1.0,
 'max_depth': 3}

In [45]:
# Report train/held-out metrics for the fitted randomized search object.
show_scores(model=rs_model)

{'Training MAE': 4585.796937329812,
 'Valid MAE': 7199.22213190381,
 'Training RMSLE': 0.17094709537257655,
 'Valid RMSLE': 0.15816581235411523,
 'Training R^2': 0.9769762327389722,
 'Valid R^2': 0.8837527774032745}

In [46]:
# Build the forest from whatever hyperparameters the randomized search selected,
# rather than hand-copying them from a previous run's printed output (which goes
# stale as soon as the search is re-run).
ideal_model = RandomForestRegressor(**rs_model.best_params_)
ideal_model.fit(X_train, y_train)

In [47]:
# Report train/held-out metrics for the forest built with the selected hyperparameters.
show_scores(model=ideal_model)

{'Training MAE': 4710.150974440747,
 'Valid MAE': 7020.500442479551,
 'Training RMSLE': 0.15968127857556402,
 'Valid RMSLE': 0.16688191112161832,
 'Training R^2': 0.9781642939007622,
 'Valid R^2': 0.8771569085463107}

In [1]:
# Predict with the tuned forest on every row of df (all columns except "Profit").
# NOTE(review): the recorded NameError shows this cell was executed with execution
# count 1, i.e. in a fresh kernel before the cell that defines ideal_model had run;
# it needs the earlier cells to have been executed first.
test_preds = ideal_model.predict(df.drop(columns=["Profit"]))

NameError: name 'ideal_model' is not defined