In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('data_prepared.csv')

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
 
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_full_train = df_full_train.reset_index(drop=True)

y_train = np.log1p(df_train['salary_vnd'])
y_val = np.log1p(df_val['salary_vnd'])
y_test = np.log1p(df_test['salary_vnd'])
y_full_train = np.log1p(df_full_train['salary_vnd'])

del df_train['salary_vnd']
del df_val['salary_vnd']
del df_test['salary_vnd']
del df_full_train['salary_vnd']

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_percentage_error

In [3]:
train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [4]:
scores_with_depth = []

for depth in np.arange(1, 21, 1):
    dtr = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_val)
    score = round(root_mean_squared_error(y_val, y_pred),3)
    scores_with_depth.append((depth, score))
    print(f'With max_depth: {depth}, rmse: {score}')

With max_depth: 1, rmse: 0.372
With max_depth: 2, rmse: 0.34
With max_depth: 3, rmse: 0.324
With max_depth: 4, rmse: 0.312
With max_depth: 5, rmse: 0.306
With max_depth: 6, rmse: 0.3
With max_depth: 7, rmse: 0.294
With max_depth: 8, rmse: 0.291
With max_depth: 9, rmse: 0.289
With max_depth: 10, rmse: 0.287
With max_depth: 11, rmse: 0.286
With max_depth: 12, rmse: 0.286
With max_depth: 13, rmse: 0.286
With max_depth: 14, rmse: 0.287
With max_depth: 15, rmse: 0.287
With max_depth: 16, rmse: 0.287
With max_depth: 17, rmse: 0.288
With max_depth: 18, rmse: 0.288
With max_depth: 19, rmse: 0.288
With max_depth: 20, rmse: 0.288


In [5]:
scores_with_depth_leaf = []

# Base on above result, i will choose list max_depth from 10 to 14 and find what max_leaf value is the best in this value list [5, 10, 15, 20, 25, 30]
for depth in np.arange(10, 15, 1):
    for leaf in [5, 10, 15, 20, 25, 30]:
        dtr = DecisionTreeRegressor(max_depth=depth, max_leaf_nodes=leaf, random_state=42)
        dtr.fit(X_train, y_train)
        y_pred = dtr.predict(X_val)
        score = round(root_mean_squared_error(y_val, y_pred),3)
        scores_with_depth_leaf.append((depth, leaf, score))
        print(f'With max_depth: {depth} and leaf: {leaf}, rmse: {score}')

With max_depth: 10 and leaf: 5, rmse: 0.312
With max_depth: 10 and leaf: 10, rmse: 0.289
With max_depth: 10 and leaf: 15, rmse: 0.286
With max_depth: 10 and leaf: 20, rmse: 0.286
With max_depth: 10 and leaf: 25, rmse: 0.286
With max_depth: 10 and leaf: 30, rmse: 0.286
With max_depth: 11 and leaf: 5, rmse: 0.312
With max_depth: 11 and leaf: 10, rmse: 0.289
With max_depth: 11 and leaf: 15, rmse: 0.286
With max_depth: 11 and leaf: 20, rmse: 0.286
With max_depth: 11 and leaf: 25, rmse: 0.286
With max_depth: 11 and leaf: 30, rmse: 0.286
With max_depth: 12 and leaf: 5, rmse: 0.312
With max_depth: 12 and leaf: 10, rmse: 0.289
With max_depth: 12 and leaf: 15, rmse: 0.286
With max_depth: 12 and leaf: 20, rmse: 0.286
With max_depth: 12 and leaf: 25, rmse: 0.286
With max_depth: 12 and leaf: 30, rmse: 0.286
With max_depth: 13 and leaf: 5, rmse: 0.312
With max_depth: 13 and leaf: 10, rmse: 0.289
With max_depth: 13 and leaf: 15, rmse: 0.286
With max_depth: 13 and leaf: 20, rmse: 0.286
With max_depth

In [6]:
columnnames = ['depth_num','leaf_num', 'rmse']
df_scores = pd.DataFrame(scores_with_depth_leaf, columns=columnnames)

In [7]:
df_scores.sort_values('rmse', ascending=True)

Unnamed: 0,depth_num,leaf_num,rmse
3,10,20,0.286
2,10,15,0.286
5,10,30,0.286
4,10,25,0.286
11,11,30,0.286
10,11,25,0.286
9,11,20,0.286
8,11,15,0.286
15,12,20,0.286
14,12,15,0.286


In [8]:
# As a result, max_depth = 10 and max_leaf_node = 15 are the best pair of values so far.
# With these values, I will use these 2 values and try tune further using min_samples_split and min_samples_leaf
scores_with_mss = []

for mss in np.arange(200, 2001, 200):
    dtr = DecisionTreeRegressor(max_depth=10, max_leaf_nodes=15, min_samples_split=mss, random_state=42)
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_val)
    score = round(root_mean_squared_error(y_val, y_pred),3)
    scores_with_mss.append((mss, score))
    print(f'With min_samples_split: {mss}, rmse: {score}')

With min_samples_split: 200, rmse: 0.286
With min_samples_split: 400, rmse: 0.286
With min_samples_split: 600, rmse: 0.286
With min_samples_split: 800, rmse: 0.286
With min_samples_split: 1000, rmse: 0.286
With min_samples_split: 1200, rmse: 0.286
With min_samples_split: 1400, rmse: 0.286
With min_samples_split: 1600, rmse: 0.286
With min_samples_split: 1800, rmse: 0.286
With min_samples_split: 2000, rmse: 0.286


In [10]:
scores_with_msl = []

for msl in np.arange(1000, 10001, 1000):
    dtr = DecisionTreeRegressor(max_depth=10, max_leaf_nodes=15, min_samples_leaf=msl, random_state=42)
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_val)
    score = round(root_mean_squared_error(y_val, y_pred),3)
    scores_with_msl.append((msl, score))
    print(f'With min_samples_leaf: {msl}, rmse: {score}')

With min_samples_leaf: 1000, rmse: 0.286
With min_samples_leaf: 2000, rmse: 0.286
With min_samples_leaf: 3000, rmse: 0.286
With min_samples_leaf: 4000, rmse: 0.286
With min_samples_leaf: 5000, rmse: 0.29
With min_samples_leaf: 6000, rmse: 0.294
With min_samples_leaf: 7000, rmse: 0.294
With min_samples_leaf: 8000, rmse: 0.304
With min_samples_leaf: 9000, rmse: 0.304
With min_samples_leaf: 10000, rmse: 0.304


In [11]:
for msl in np.arange(4000, 5001, 100):
    dtr = DecisionTreeRegressor(max_depth=10, max_leaf_nodes=15, min_samples_leaf=msl, random_state=42)
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_val)
    score = round(root_mean_squared_error(y_val, y_pred),3)
    scores_with_msl.append((msl, score))
    print(f'With min_samples_leaf: {msl}, rmse: {score}')

With min_samples_leaf: 4000, rmse: 0.286
With min_samples_leaf: 4100, rmse: 0.286
With min_samples_leaf: 4200, rmse: 0.286
With min_samples_leaf: 4300, rmse: 0.29
With min_samples_leaf: 4400, rmse: 0.29
With min_samples_leaf: 4500, rmse: 0.29
With min_samples_leaf: 4600, rmse: 0.29
With min_samples_leaf: 4700, rmse: 0.29
With min_samples_leaf: 4800, rmse: 0.29
With min_samples_leaf: 4900, rmse: 0.29
With min_samples_leaf: 5000, rmse: 0.29


In [12]:
# I will choose max_depth = 10, max_leaf_node = 15, min_samples_leaf = 4200 for final train
full_train_dicts = df_full_train.to_dict(orient='records')
X_full_train = dv.transform(full_train_dicts)
test_dicts = df_test.to_dict(orient='records')
X_test = dv.transform(test_dicts)

In [22]:
dtr = DecisionTreeRegressor(max_depth=10, max_leaf_nodes=15, min_samples_leaf=4200, random_state=42)
dtr.fit(X_full_train, y_full_train)
y_pred = dtr.predict(X_test)
rmse = round(root_mean_squared_error(y_test, y_pred),3)
r2 = round(r2_score(y_test, y_pred),3)
mape = round(mean_absolute_percentage_error(y_test, y_pred),3) * 100

In [23]:
print(f'rmse: {rmse}')
print(f'r2: {r2}')
print(f'mape: {mape}%')

rmse: 0.288
r2: 0.495
mape: 1.3%
