In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('data_prepared.csv')

# 60/20/20 split: hold out 20% of the rows for test, then take 25% of the
# remaining 80% (i.e. 20% of the whole) as the validation set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give every partition a fresh 0..n-1 index; the shuffled original index is discarded.
df_train, df_val, df_test, df_full_train = (
    part.reset_index(drop=True)
    for part in (df_train, df_val, df_test, df_full_train)
)

# The target is modelled as log(1 + salary_vnd). `pop` returns the column and
# removes it from the frame, so the feature frames no longer carry the target.
y_train = np.log1p(df_train.pop('salary_vnd'))
y_val = np.log1p(df_val.pop('salary_vnd'))
y_test = np.log1p(df_test.pop('salary_vnd'))
y_full_train = np.log1p(df_full_train.pop('salary_vnd'))

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_percentage_error

In [3]:
# One dict per row, the input format DictVectorizer expects.
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

# Learn the feature mapping from the training rows only, then reuse that same
# mapping for validation so both matrices share one column layout.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [4]:
# Baseline: a 10-tree forest with otherwise default settings, scored on the
# validation split. RMSE is measured on the log1p-transformed target.
rf = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_val)
baseline_rmse = root_mean_squared_error(y_val, y_pred)
round(baseline_rmse, 3)

0.321

In [10]:
# Sweep max_depth from 5 to 50 in steps of 5 with a fixed 45-tree forest and
# collect (max_depth, validation RMSE) pairs for later comparison.
scores_with_max_depth = []

for d in np.arange(5, 51, 5):
    rf = RandomForestRegressor(n_estimators=45, max_depth=d, random_state=42, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    score = round(root_mean_squared_error(y_val, y_pred),3)
    # Record the depth used in this iteration. The previous code appended `n`,
    # which this cell never defines: on a fresh kernel it raises NameError, and
    # with a leftover `n` in the kernel it stores that stale value instead.
    scores_with_max_depth.append((d, score))
    print(f'With max_depth: {d}, rmse: {score}')

With max_depth: 5, rmse: 0.306
With max_depth: 10, rmse: 0.286
With max_depth: 15, rmse: 0.286
With max_depth: 20, rmse: 0.286
With max_depth: 25, rmse: 0.286
With max_depth: 30, rmse: 0.286
With max_depth: 35, rmse: 0.287
With max_depth: 40, rmse: 0.287
With max_depth: 45, rmse: 0.287
With max_depth: 50, rmse: 0.288
