In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('data_prepared.csv')

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole) as validation. Both splits use the same fixed seed.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Drop the shuffled original row labels so every frame is indexed 0..n-1.
df_train, df_val, df_test, df_full_train = (
    part.reset_index(drop=True)
    for part in (df_train, df_val, df_test, df_full_train)
)

# Pull the target out of each feature frame (pop removes the column in place)
# and model it on the log1p scale.
y_train = np.log1p(df_train.pop('salary_vnd'))
y_val = np.log1p(df_val.pop('salary_vnd'))
y_test = np.log1p(df_test.pop('salary_vnd'))
y_full_train = np.log1p(df_full_train.pop('salary_vnd'))

In [2]:
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_percentage_error

In [3]:
# Turn every row into a {column: value} record. The vectorizer is fitted on the
# training records only and then reused unchanged for the validation records.
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

dv = DictVectorizer(sparse=False).fit(train_dicts)

X_train = dv.transform(train_dicts)
X_val = dv.transform(val_dicts)

In [4]:
# Output column names of the fitted DictVectorizer, converted to a plain list;
# passed as `feature_names` when the DMatrix objects are built.
features = dv.get_feature_names_out().tolist()

In [5]:
# Wrap the encoded matrices and their log1p targets for xgboost, keeping the
# vectorizer's column names attached to the features.
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=X_val, label=y_val, feature_names=features)

In [6]:
# Per-experiment results, keyed by strings such as 'eta=0.3' or 'depth=10'.
scores = dict()

# Datasets evaluated during training, each paired with the name used for it in
# the evaluation log.
watchlist = list(zip((dtrain, dval), ('train', 'val')))

In [7]:
def parse_xgb_output(output):
    """Parse a captured xgboost evaluation log into a DataFrame.

    Each evaluation record is expected to be one tab-separated line of the form
    ``[<iteration>]<TAB><name>:<value><TAB><name>:<value>``, where the first
    value is the training score and the second the validation score.

    Blank lines and lines that do not start with ``[`` (for example warnings
    mixed into the captured text) are skipped, so an empty capture yields an
    empty frame instead of an unpacking error.

    Parameters
    ----------
    output : object
        Any object exposing the captured text as a ``stdout`` string
        attribute (such as the variable bound by ``%%capture``).

    Returns
    -------
    pandas.DataFrame
        Columns ``num_iter``, ``train_rmse`` and ``val_rmse``, one row per
        parsed record, in log order.

    Raises
    ------
    ValueError
        If a record line does not consist of exactly three tab-separated
        fields, or a field cannot be converted to a number.
    """
    results = []

    for raw_line in output.stdout.splitlines():
        line = raw_line.strip()

        # Only lines such as "[5]\ttrain-rmse:...\tval-rmse:..." are records.
        if not line.startswith('['):
            continue

        it_line, train_line, val_line = line.split('\t')

        it = int(it_line.strip('[]'))
        train = float(train_line.split(':')[1])
        val = float(val_line.split(':')[1])

        results.append((it, train, val))

    columns = ['num_iter', 'train_rmse', 'val_rmse']
    df_results = pd.DataFrame(results, columns=columns)
    return df_results

In [12]:
%%capture output
for e in [0.05, 0.1, 0.3, 0.5, 0.8, 1]:
    xgb_params = {
        'eta': e, 
        'max_depth': 10,
        'min_child_weight': 1,
        
        'objective': 'reg:squarederror',
        'nthread': 8,
        'eval_metric': 'rmse',
        
        'seed': 42,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=201, verbose_eval=5, evals=watchlist)

    key = 'eta=%s' % (xgb_params['eta'])
    scores[key] = parse_xgb_output(output)

In [13]:
# Show each learning rate's results, best validation RMSE first.
for e in (0.05, 0.1, 0.3, 0.5, 0.8, 1):
    cols = f'eta={e}'
    print(f'{cols}=====================')
    print(scores[cols].sort_values(by='val_rmse'))
    # Two blank lines between tables.
    print('\n')

    num_iter  train_rmse  val_rmse
10         5     0.28459   0.28598
8          9     0.28460   0.28600
11         9     0.28347   0.28630
15         0     0.28656   0.28663
13         5     0.28387   0.28673
14         9     0.28262   0.28736
16         5     0.28376   0.28755
7          5     0.28728   0.28793
17         9     0.28237   0.28861
12         0     0.29216   0.29232
5          9     0.30294   0.30335
9          0     0.31989   0.32027
4          5     0.32389   0.32438
2          9     0.33326   0.33377
6          0     0.34887   0.34943
1          5     0.35464   0.35524
3          0     0.38412   0.38487
0          0     0.39372   0.39452


    num_iter  train_rmse  val_rmse
10         5     0.28459   0.28598
8          9     0.28460   0.28600
11         9     0.28347   0.28630
15         0     0.28656   0.28663
13         5     0.28387   0.28673
14         9     0.28262   0.28736
16         5     0.28376   0.28755
7          5     0.28728   0.28793
17         9     0

In [14]:
# Repeat the learning-rate sweep with RMSE, MAPE and MAE reported together for
# both the training and validation sets; the log is printed, not stored.
for e in (0.05, 0.1, 0.3, 0.5, 0.8, 1):
    xgb_params = dict(
        eta=e,
        max_depth=10,
        min_child_weight=1,
        objective='reg:squarederror',
        nthread=8,
        eval_metric=['rmse', 'mape', 'mae'],
        seed=42,
        verbosity=1,
    )
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=200,
        evals=watchlist,
        verbose_eval=5,
    )

[0]	train-rmse:0.39372	train-mape:0.01635	train-mae:0.31617	val-rmse:0.39452	val-mape:0.01639	val-mae:0.31690
[5]	train-rmse:0.35464	train-mape:0.01496	train-mae:0.28907	val-rmse:0.35524	val-mape:0.01499	val-mae:0.28961
[10]	train-rmse:0.32898	train-mape:0.01407	train-mae:0.27167	val-rmse:0.32949	val-mape:0.01409	val-mae:0.27204
[15]	train-rmse:0.31255	train-mape:0.01351	train-mae:0.26079	val-rmse:0.31298	val-mape:0.01353	val-mae:0.26106
[20]	train-rmse:0.30216	train-mape:0.01317	train-mae:0.25404	val-rmse:0.30258	val-mape:0.01318	val-mae:0.25430
[25]	train-rmse:0.29567	train-mape:0.01296	train-mae:0.24984	val-rmse:0.29611	val-mape:0.01297	val-mae:0.25015
[30]	train-rmse:0.29162	train-mape:0.01282	train-mae:0.24720	val-rmse:0.29211	val-mape:0.01284	val-mae:0.24757
[35]	train-rmse:0.28902	train-mape:0.01274	train-mae:0.24547	val-rmse:0.28961	val-mape:0.01276	val-mae:0.24593
[40]	train-rmse:0.28737	train-mape:0.01268	train-mae:0.24434	val-rmse:0.28807	val-mape:0.01271	val-mae:0.24492
[45

In [15]:
%%capture output
for depth in [5, 10, 15, 20, 25, 30]:
    xgb_params = {
        'eta': 0.5, 
        'max_depth': depth,
        'min_child_weight': 1,
        
        'objective': 'reg:squarederror',
        'nthread': 8,
        'eval_metric': 'rmse',
        
        'seed': 42,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=201, verbose_eval=5, evals=watchlist)

    key = 'depth=%s' % (xgb_params['max_depth'])
    scores[key] = parse_xgb_output(output)

In [16]:
# Show each tree depth's results, best validation RMSE first.
for depth in (5, 10, 15, 20, 25, 30):
    cols = f'depth={depth}'
    print(f'{cols}=====================')
    print(scores[cols].sort_values(by='val_rmse'))
    # Two blank lines between tables.
    print('\n')

    num_iter  train_rmse  val_rmse
16        80     0.28412   0.28588
49        40     0.28404   0.28588
48        35     0.28435   0.28589
17        85     0.28398   0.28589
19        95     0.28368   0.28590
..       ...         ...       ...
2         10     0.32898   0.32949
82         0     0.34887   0.34943
1          5     0.35464   0.35524
41         0     0.38412   0.38487
0          0     0.39372   0.39452

[246 rows x 3 columns]


    num_iter  train_rmse  val_rmse
16        80     0.28412   0.28588
49        40     0.28404   0.28588
48        35     0.28435   0.28589
17        85     0.28398   0.28589
19        95     0.28368   0.28590
..       ...         ...       ...
2         10     0.32898   0.32949
82         0     0.34887   0.34943
1          5     0.35464   0.35524
41         0     0.38412   0.38487
0          0     0.39372   0.39452

[246 rows x 3 columns]


    num_iter  train_rmse  val_rmse
16        80     0.28412   0.28588
49        40     0.28404   0.28588
48 

In [17]:
# Train final model: encode the combined train+validation rows and the held-out
# test rows with the vectorizer fitted earlier (transform only, no refit).
full_train_dicts = df_full_train.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

X_full_train = dv.transform(full_train_dicts)
X_test = dv.transform(test_dicts)

In [18]:
# xgboost containers for the final fit and for the held-out evaluation, labelled
# with the log1p targets.
dfulltrain = xgb.DMatrix(data=X_full_train, label=y_full_train, feature_names=features)
dtest = xgb.DMatrix(data=X_test, label=y_test, feature_names=features)

In [28]:
# Hyperparameters for the final model, fitted on the combined train+validation
# data. No `evals` are passed, so nothing is monitored during this fit.
xgb_params = dict(
    eta=0.3,
    max_depth=10,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
model = xgb.train(
    params=xgb_params,
    dtrain=dfulltrain,
    num_boost_round=81,
    verbose_eval=5,
)

In [29]:
# Score the held-out test set; the model was fitted on log1p targets, so these
# predictions are on the log1p scale as well.
y_pred = model.predict(data=dtest)

In [30]:
# NOTE(review): y_test and y_pred are both on the log1p scale, so every metric
# below describes log-salary error. In particular the MAPE is a percentage of
# the log value, not of the salary in VND; confirm this is the intended figure.
rmse = round(root_mean_squared_error(y_test, y_pred), 3)
r2 = round(r2_score(y_test, y_pred), 3)
# Convert to percent *before* rounding: multiplying an already rounded float by
# 100 can reintroduce binary noise such as 1.4000000000000001.
mape = round(mean_absolute_percentage_error(y_test, y_pred) * 100, 1)

In [31]:
# Report the held-out test metrics, one per line.
print(
    f'rmse: {rmse}',
    f'r2: {r2}',
    f'mape: {mape}%',
    sep='\n',
)

rmse: 0.289
r2: 0.491
mape: 1.3%
