In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('data_prepared.csv')

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# (i.e. 20% of the whole) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Give every partition a fresh 0..n-1 index.
df_train, df_val, df_test, df_full_train = (
    part.reset_index(drop=True)
    for part in (df_train, df_val, df_test, df_full_train)
)

# Pull the target out of each feature frame (pop removes the column in place)
# and model it on the log1p scale.
y_train, y_val, y_test, y_full_train = (
    np.log1p(part.pop('salary_vnd'))
    for part in (df_train, df_val, df_test, df_full_train)
)

In [7]:
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_percentage_error

In [4]:
# One dict per row, as DictVectorizer expects.
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

# Fit on the training records only, then encode both splits with the same
# fitted vectorizer.
dv = DictVectorizer(sparse=False).fit(train_dicts)
X_train, X_val = (dv.transform(records) for records in (train_dicts, val_dicts))

In [5]:
# Feature names of the fitted vectorizer as a plain Python list; passed to
# xgb.DMatrix as `feature_names` below.
features = dv.get_feature_names_out().tolist()

In [8]:
# Wrap each split (feature matrix + log-scale target) in an XGBoost DMatrix,
# labelling the columns with the vectorizer's feature names.
dtrain, dval = (
    xgb.DMatrix(matrix, label=target, feature_names=features)
    for matrix, target in ((X_train, y_train), (X_val, y_val))
)

In [9]:
# (DMatrix, name) pairs evaluated during training; the names become the
# prefixes in the evaluation log.
watchlist = list(zip((dtrain, dval), ('train', 'val')))

# Per-configuration evaluation tables, keyed by a label such as 'eta=0.1'.
scores = dict()

In [10]:
def parse_xgb_output(output):
    """Parse a captured XGBoost evaluation log into a DataFrame.

    Each non-blank line of ``output.stdout`` is expected to hold three
    tab-separated fields, e.g. ``[10]<TAB>train-rmse:0.28<TAB>val-rmse:0.29``:
    the bracketed iteration number followed by two ``name:value`` metrics.

    Parameters
    ----------
    output : object
        Any object exposing the captured text as a ``stdout`` string
        attribute (such as the object bound by ``%%capture output``).

    Returns
    -------
    pandas.DataFrame
        Columns ``num_iter``, ``train_rmse`` and ``val_rmse``, one row per
        parsed line. Empty (but with those columns) when there is nothing
        to parse.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three tab-separated
        fields, or a field cannot be converted to a number.
    IndexError
        If a metric field contains no ``:`` separator.
    """
    columns = ['num_iter', 'train_rmse', 'val_rmse']
    results = []

    for line in output.stdout.splitlines():
        line = line.strip()
        # Empty captures and blank separator lines carry no metrics; skipping
        # them avoids a spurious unpacking error on otherwise valid logs.
        if not line:
            continue

        it_line, train_line, val_line = line.split('\t')

        it = int(it_line.strip('[]'))
        train = float(train_line.split(':')[1])
        val = float(val_line.split(':')[1])

        results.append((it, train, val))

    return pd.DataFrame(results, columns=columns)

In [20]:
%%capture output
for e in [0.05, 0.1, 0.3, 0.5, 0.8, 1]:
    xgb_params = {
        'eta': e, 
        'max_depth': 10,
        'min_child_weight': 1,
        
        'objective': 'reg:squarederror',
        'nthread': 8,
        'eval_metric': 'rmse',
        
        'seed': 42,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=200, verbose_eval=5, evals=watchlist)

    key = 'eta=%s' % (xgb_params['eta'])
    scores[key] = parse_xgb_output(output)

In [25]:
# Show each eta's evaluation table, best (lowest) validation RMSE first.
for e in [0.05, 0.1, 0.3, 0.5, 0.8, 1]:
    cols = f'eta={e}'
    print(f'{cols}=====================')
    print(scores[cols].sort_values(by='val_rmse'))
    # Two blank lines between tables.
    print('\n')

    num_iter  train_rmse  val_rmse
2         10     0.28437   0.28596
3         15     0.28350   0.28605
4         20     0.28281   0.28621
5         25     0.28208   0.28632
6         30     0.28140   0.28645
7         35     0.28066   0.28666
8         40     0.28003   0.28677
9         45     0.27937   0.28694
10        50     0.27882   0.28707
11        55     0.27818   0.28721
12        60     0.27749   0.28736
13        65     0.27696   0.28747
14        70     0.27631   0.28764
15        75     0.27571   0.28781
1          5     0.28728   0.28793
16        80     0.27505   0.28798
17        85     0.27430   0.28815
18        90     0.27361   0.28830
19        95     0.27310   0.28841
20       100     0.27257   0.28854
21       105     0.27204   0.28868
22       110     0.27163   0.28880
23       115     0.27098   0.28896
24       120     0.27036   0.28911
25       125     0.26980   0.28925
26       130     0.26932   0.28937
27       135     0.26877   0.28953
28       140     0.2