In [56]:
import pandas as pd
import numpy as np

from ydata_profiling import ProfileReport

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the match records from "csgo.csv" (path is relative to the working
# directory). The bare `data` expression renders the frame as the cell output.
data = pd.read_csv("csgo.csv")
data

# Optional exploratory profiling, left disabled. Uncomment to print the column
# summary and write an HTML profile of the frame to report.html.
# print(data.info())
# profile = ProfileReport(data, title="CS:GO Report", explorative=True)
# profile.to_file("report.html")

Unnamed: 0,map,day,month,year,date,wait_time_s,match_time_s,team_a_rounds,team_b_rounds,ping,kills,assists,deaths,mvps,hs_percent,points,result
0,Mirage,3,8,2018,03/08/2018,327,2906,16,13,215,17,2,21,2,5,45,Win
1,Mirage,2,8,2018,02/08/2018,336,2592,16,11,199,13,4,24,2,0,40,Lost
2,Mirage,31,7,2018,31/07/2018,414,2731,16,14,85,15,3,18,3,26,37,Win
3,Mirage,31,7,2018,31/07/2018,317,2379,11,16,93,12,2,15,2,16,30,Lost
4,Mirage,30,7,2018,30/07/2018,340,3467,15,15,94,33,5,20,5,30,83,Tie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128,Dust II,23,7,2015,23/07/2015,2,1573,16,4,46,12,2,20,0,16,26,Lost
1129,Dust II,23,7,2015,23/07/2015,29,2126,16,8,41,19,6,21,2,31,51,Lost
1130,Dust II,23,7,2015,23/07/2015,10,2555,11,16,17,9,3,20,2,55,30,Lost
1131,Dust II,23,7,2015,23/07/2015,9,2293,8,16,20,11,4,20,1,27,31,Lost


In [57]:
# 1) Drop team_a_rounds, team_b_rounds columns.
# NOTE: this cell rebinds `data`, so it cannot be re-run on its own; re-run the
# loading cell first, otherwise the drop raises KeyError on the missing columns.
data = data.drop(columns=['team_a_rounds', 'team_b_rounds'])

# 2) Convert the dd/mm/YYYY date strings into Unix timestamps (whole seconds).
# Floor-dividing the offset from the epoch by a one-second Timedelta yields the
# same integers whatever resolution (s/ms/us/ns) pandas chooses for the parsed
# datetimes. Casting to int64 and dividing by 10 ** 9 is only correct when the
# values are stored with nanosecond resolution.
parsed_dates = pd.to_datetime(data['date'], format='%d/%m/%Y')
data['date'] = (parsed_dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
data

Unnamed: 0,map,day,month,year,date,wait_time_s,match_time_s,ping,kills,assists,deaths,mvps,hs_percent,points,result
0,Mirage,3,8,2018,1533254400,327,2906,215,17,2,21,2,5,45,Win
1,Mirage,2,8,2018,1533168000,336,2592,199,13,4,24,2,0,40,Lost
2,Mirage,31,7,2018,1532995200,414,2731,85,15,3,18,3,26,37,Win
3,Mirage,31,7,2018,1532995200,317,2379,93,12,2,15,2,16,30,Lost
4,Mirage,30,7,2018,1532908800,340,3467,94,33,5,20,5,30,83,Tie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1128,Dust II,23,7,2015,1437609600,2,1573,46,12,2,20,0,16,26,Lost
1129,Dust II,23,7,2015,1437609600,29,2126,41,19,6,21,2,31,51,Lost
1130,Dust II,23,7,2015,1437609600,10,2555,17,9,3,20,2,55,30,Lost
1131,Dust II,23,7,2015,1437609600,9,2293,20,11,4,20,1,27,31,Lost


In [58]:
# 3) Split the frame into the regression target and the feature matrix
target = 'points'
y = data[target]
x = data.drop(columns=target)

# 4) Hold out 20% of the rows as a test set; the fixed seed keeps the split
#    reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: (rows, columns) of the training and test feature matrices
print(x_train.shape, x_test.shape)

(906, 14) (227, 14)


In [59]:
# 5) Data preprocessing: each group of columns gets its own transformer.
pre_processor = ColumnTransformer(transformers=[
    # Numeric columns are standardised.
    ('num_features', StandardScaler(), ['date', 'wait_time_s', 'match_time_s', 'ping', 'kills', 'assists', 'deaths', 'mvps', 'hs_percent']),
    # Nominal columns are one-hot encoded; categories unseen during fit are
    # encoded as all zeros instead of raising.
    ('nom_features', OneHotEncoder(handle_unknown='ignore'), ['map', 'result']),
    # Calendar parts are ordinal-encoded. Without handle_unknown, a day/month/year
    # value that is absent from the data the encoder was fitted on (for example
    # one cross-validation fold) makes transform() raise; such values are mapped
    # to -1 instead.
    ('ord_features', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ['day', 'month', 'year'])
])

# 6) Initialize model: preprocessing followed by a random-forest regressor.
# The final step is named 'classifier' although it holds a regressor; the name
# is kept because the grid-search parameter keys ("classifier__...") are
# prefixed with it.
model = Pipeline(steps=[
    ('pre_processor', pre_processor),
    ('classifier', RandomForestRegressor(random_state=100))
])

In [60]:
# List the distinct maps in each split (training first, then test) so the
# category sets of the two splits can be compared.
for split in (x_train, x_test):
    print(split['map'].unique())

['Cache' 'Dust II' 'Inferno' 'Mirage' 'Overpass' 'Cobblestone' 'Austria'
 'Canals' 'Italy' 'Nuke']
['Inferno' 'Dust II' 'Cache' 'Mirage' 'Nuke' 'Overpass' 'Cobblestone']


In [61]:
# 7) Train model: grid search over the random-forest hyper-parameters of the
#    pipeline, scored by R2 with 6-fold cross-validation.
param_grid = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__criterion": ["squared_error", "absolute_error", "friedman_mse"],
    "classifier__max_depth": [None, 2, 5],
}
model_gr = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="r2",
    cv=6,
    verbose=2,
    n_jobs=-1,
)
model_gr.fit(x_train, y_train)

best_score = model_gr.best_score_
best_params = model_gr.best_params_
print("Best score: {}".format(best_score))
print("Best params: {}".format(best_params))

# 8) Test model: predict the held-out rows with the fitted search object and
#    print every prediction next to its true value.
y_predict = model_gr.predict(x_test)
for predicted, actual in zip(y_predict, y_test):
    print('Predicted value: {}. Actual value: {}'.format(predicted, actual))

# Summary metrics on the test set
mae = mean_absolute_error(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
print("MAE: {}".format(mae))
print("MSE: {}".format(mse))  # used during training
print("R2: {}".format(r2))  # the most common regression metric; not used during training

Fitting 6 folds for each of 27 candidates, totalling 162 fits
Best score: 0.9363901858910163
Best params: {'classifier__criterion': 'absolute_error', 'classifier__max_depth': None, 'classifier__n_estimators': 200}
Predicted value: 41.365. Actual value: 35
Predicted value: 37.3425. Actual value: 32
Predicted value: 41.85. Actual value: 40
Predicted value: 50.82. Actual value: 52
Predicted value: 39.18. Actual value: 37
Predicted value: 38.595. Actual value: 37
Predicted value: 38.385. Actual value: 32
Predicted value: 52.43. Actual value: 48
Predicted value: 43.915. Actual value: 47
Predicted value: 50.0575. Actual value: 49
Predicted value: 37.21. Actual value: 32
Predicted value: 50.35. Actual value: 42
Predicted value: 49.525. Actual value: 48
Predicted value: 53.905. Actual value: 48
Predicted value: 33.4. Actual value: 33
Predicted value: 32.825. Actual value: 32
Predicted value: 43.225. Actual value: 41
Predicted value: 27.875. Actual value: 28
Predicted value: 61.145. Actual valu