In [110]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score as auc, accuracy_score
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

import numpy as np
from pathlib import Path
from func import rm_high_corr_feat

# Model 1 - Агрегированные данные по командам

In [111]:
path = '../prepare_data/data/df_res.csv'
df = pd.read_csv(Path(path).resolve())
df = df[df['gameweek_compSeason_label'] > 2014]

In [114]:
train = df[df['gameweek_compSeason_label'] !=2023]
val = df[df['gameweek_compSeason_label'] ==2023]

y_train = train['team_1_hue']
x_train = train.drop('team_1_hue', axis=1)

y_val = val['team_1_hue']
x_val = val.drop('team_1_hue', axis=1)

In [115]:
cat = ['gameweek_gameweek',	'gameweek_compSeason_label', 'teams_team_1_name', 'teams_team_2_name', 'ground_name']

In [116]:
model_1 = CatBoostClassifier()


In [117]:
model_1.fit(x_train, y_train, cat_features=cat)


Learning rate set to 0.083411
0:	learn: 1.0801254	total: 23.1ms	remaining: 23.1s
1:	learn: 1.0627727	total: 56.8ms	remaining: 28.3s
2:	learn: 1.0485995	total: 91.8ms	remaining: 30.5s
3:	learn: 1.0356208	total: 126ms	remaining: 31.3s
4:	learn: 1.0242739	total: 159ms	remaining: 31.6s
5:	learn: 1.0154627	total: 194ms	remaining: 32.1s
6:	learn: 1.0061482	total: 228ms	remaining: 32.3s
7:	learn: 0.9983197	total: 261ms	remaining: 32.3s
8:	learn: 0.9906308	total: 294ms	remaining: 32.4s
9:	learn: 0.9845367	total: 328ms	remaining: 32.5s
10:	learn: 0.9789691	total: 362ms	remaining: 32.5s
11:	learn: 0.9741246	total: 396ms	remaining: 32.6s
12:	learn: 0.9691479	total: 430ms	remaining: 32.6s
13:	learn: 0.9648148	total: 464ms	remaining: 32.7s
14:	learn: 0.9606262	total: 499ms	remaining: 32.7s
15:	learn: 0.9567367	total: 532ms	remaining: 32.7s
16:	learn: 0.9526303	total: 565ms	remaining: 32.7s
17:	learn: 0.9493429	total: 599ms	remaining: 32.7s
18:	learn: 0.9463315	total: 633ms	remaining: 32.7s
19:	lear

<catboost.core.CatBoostClassifier at 0x7fbe49b37880>

In [118]:
pred = model_1.predict(x_val)
pred_proba = model_1.predict_proba(x_val)
y_val = y_val.to_numpy().reshape(-1, 1)

In [119]:
pred = np.concatenate((y_val, pred, pred_proba), axis=1)
df_pred = pd.DataFrame(pred, columns=['true', 'pred', '0_prob', '1_prob', '2_prob'])

In [120]:
df_pred['error'] = np.where(df_pred['true']==df_pred['pred'], 0, 1)

df_pred.head()

Unnamed: 0,true,pred,0_prob,1_prob,2_prob,error
0,0.0,0.0,0.677209,0.112024,0.210767,0
1,2.0,1.0,0.243165,0.635044,0.12179,1
2,2.0,1.0,0.220262,0.608821,0.170917,1
3,1.0,1.0,0.165765,0.722379,0.111856,0
4,1.0,1.0,0.043815,0.789161,0.167023,0


In [121]:
df_pred['error'].sum()

72

In [122]:
pred

array([[0.        , 0.        , 0.67720929, 0.11202365, 0.21076706],
       [2.        , 1.        , 0.24316539, 0.63504429, 0.12179032],
       [2.        , 1.        , 0.22026171, 0.60882147, 0.17091682],
       [1.        , 1.        , 0.16576498, 0.72237902, 0.11185601],
       [1.        , 1.        , 0.04381514, 0.78916137, 0.16702349],
       [1.        , 1.        , 0.10632408, 0.82437361, 0.06930231],
       [1.        , 0.        , 0.6331998 , 0.25274491, 0.11405529],
       [1.        , 1.        , 0.04578579, 0.75459734, 0.19961688],
       [1.        , 1.        , 0.07861106, 0.82785099, 0.09353795],
       [1.        , 0.        , 0.83832136, 0.09348636, 0.06819228],
       [1.        , 2.        , 0.27823784, 0.29386943, 0.42789274],
       [1.        , 1.        , 0.07769214, 0.50603818, 0.41626968],
       [0.        , 1.        , 0.19743388, 0.4668532 , 0.33571293],
       [1.        , 1.        , 0.14095417, 0.75472561, 0.10432022],
       [0.        , 0.        , 0.

In [123]:
accuracy_score(y_val, pred[:,1])

0.5609756097560976

# Модели на датасетах с отбором признаков 

In [None]:
thrs = [i/100 for i in range(100, 10, -5)] # Список значений корреляций для удаления признаков

In [None]:
acc_score_val, acc_score_train = [], []
for thr in tqdm(thrs):

    
    df_temp = rm_high_corr_feat(df, thr, 6)


    train = df[df['gameweek_compSeason_label'] !=2023]
    val = df[df['gameweek_compSeason_label'] ==2023]

    y_train = train['team_1_hue']
    x_train = train.drop('team_1_hue', axis=1)

    y_val = val['team_1_hue']
    x_val = val.drop('team_1_hue', axis=1)

    model_temp = CatBoostClassifier()
    model_temp.fit(x_train, y_train, cat_features=cat, verbose=False)

    pred_temp = model_temp.predict(x_val)
    pred_temp_train = model_temp.predict(x_train)

    score_temp = accuracy_score(pred_temp, y_val)
    score_temp_train = accuracy_score(pred_temp_train, y_train)

    #print(f'thr: {thr}\naccuracy: {np.round(score_temp, 3)}\nпризнаков: {df_temp.shape[1]}\n')

    acc_score_val.append(score_temp)
    acc_score_train.append(score_temp_train)


100%|██████████| 18/18 [12:38<00:00, 42.14s/it]


Возьмем датасет с отобранными признаки, где корреляция не выше 0.65 между признаками

In [None]:
df_temp = rm_high_corr_feat(df, 0.3, 6)

train = df[df['gameweek_compSeason_label'] !=2023]
val = df[df['gameweek_compSeason_label'] ==2023]

y_train = train['team_1_hue']
x_train = train.drop('team_1_hue', axis=1)

y_val = val['team_1_hue']
x_val = val.drop('team_1_hue', axis=1)

In [None]:
df_temp

Unnamed: 0,gameweek_gameweek,gameweek_compSeason_label,teams_team_1_name,teams_team_2_name,ground_name,team_1_hue,_accurate_back_zone_pass_team_1,_accurate_keeper_sweeper_team_1,_attempted_tackle_foul_team_1,_accurate_back_zone_pass_team_2,_accurate_keeper_sweeper_team_2,_attempted_tackle_foul_team_2
0,21,2023,Newcastle United,Manchester City,St. James' Park,0,5397.000000,19.625000,223.875000,9053.555556,23.222222,211.111111
1,21,2023,Manchester United,Tottenham Hotspur,Old Trafford,2,7671.666667,11.111111,245.666667,7906.777778,22.888889,237.111111
2,21,2023,Everton,Aston Villa,Goodison Park,2,6304.777778,14.333333,230.000000,5937.666667,19.000000,229.000000
3,21,2023,Chelsea,Fulham,Stamford Bridge,1,8834.333333,15.555556,215.666667,7333.000000,15.333333,231.333333
4,20,2023,Wolverhampton Wanderers,Everton,Molineux Stadium,1,7372.800000,10.800000,224.400000,6304.777778,14.333333,230.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
2971,1,2015,Manchester United,Tottenham Hotspur,Old Trafford,1,7671.666667,11.111111,245.666667,7906.777778,22.888889,237.111111
2972,1,2015,Leicester City,Sunderland,King Power Stadium,1,6395.666667,21.444444,229.444444,4591.333333,33.000000,245.333333
2973,1,2015,Everton,Watford,Goodison Park,2,6304.777778,14.333333,230.000000,5134.166667,22.000000,248.833333
2974,1,2015,Chelsea,Swansea City,Stamford Bridge,2,8834.333333,15.555556,215.666667,6993.500000,16.250000,208.500000


In [None]:
df_temp.to_csv('../prepare_data/data/df_temp_65.csv', index=False)

In [None]:
train_pool = Pool(x_train, label=y_train, cat_features=cat)
val_pool = Pool(x_val, label=y_val, cat_features=cat)

model = CatBoostClassifier(loss_function='MultiClass')

param_grid = {
    'depth': [4, 6, 8],
    'l2_leaf_reg': [3, 5, 7, 9],
    'learning_rate': [0.002, 0.01],
}

grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           cv=None,                                                      
                           )

grid_search.fit(x_train, y_train, cat_features=cat, eval_set=(x_val, y_val), verbose=1000)

print("Лучшие параметры:", grid_search.best_params_)
print("Лучшая оценка точности:", grid_search.best_score_)

0:	learn: 1.0981355	test: 1.0981216	best: 1.0981216 (0)	total: 7.77ms	remaining: 7.76s
999:	learn: 0.9513227	test: 0.9557163	best: 0.9557163 (999)	total: 8.61s	remaining: 0us

bestTest = 0.9557163021
bestIteration = 999

0:	learn: 1.0980912	test: 1.0980651	best: 1.0980651 (0)	total: 8.2ms	remaining: 8.19s
999:	learn: 0.9461221	test: 0.9519400	best: 0.9519386 (998)	total: 8.54s	remaining: 0us

bestTest = 0.9519386112
bestIteration = 998

Shrink model to first 999 iterations.
0:	learn: 1.0980997	test: 1.0981167	best: 1.0981167 (0)	total: 8.99ms	remaining: 8.98s
999:	learn: 0.9541937	test: 0.9548509	best: 0.9548509 (999)	total: 11.1s	remaining: 0us

bestTest = 0.9548509022
bestIteration = 999

0:	learn: 1.0981361	test: 1.0980757	best: 1.0980757 (0)	total: 8.83ms	remaining: 8.82s
999:	learn: 0.9586430	test: 0.9782417	best: 0.9769560 (892)	total: 10.9s	remaining: 0us

bestTest = 0.9769559795
bestIteration = 892

Shrink model to first 893 iterations.
0:	learn: 1.0981094	test: 1.0981636	best:

KeyboardInterrupt: 

In [None]:
model = CatBoostClassifier(learning_rate=0.01)

model.fit(x_train, y_train, cat_features=cat, eval_set=(x_val, y_val), plot=True,  verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7fbe70432110>

In [None]:
accuracy_score(y_val, model.predict(x_val))


0.5487804878048781

## Data pickl

In [None]:
name_cols = 'pickle/name_cols.pickle'

col_pkl = list(df_temp.columns)
col_pkl.remove('team_1_hue')

with open(name_cols, 'wb') as file:
    pickle.dump(col_pkl, file)


In [None]:
model = CatBoostClassifier(iterations=2000, learning_rate=0.0005, loss_function='MultiClass', depth=4, l2_leaf_reg=6)
model.fit(x_train, y_train, cat_features=cat, verbose=False)

with open('pickle/catboost.pickle', 'wb') as file:
    pickle.dump(model, file)