In [2]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.metrics import accuracy_score
import requests
from urllib.parse import urlencode
import numpy as np

from sklearn.linear_model import LogisticRegression
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')


In [3]:
# используем api yandex disk
base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?' 
public_key = 'https://disk.yandex.ru/d/jj6FG3ZFHdCWAQ' 
 
# получаем url 
final_url = base_url + urlencode(dict(public_key=public_key)) 
response = requests.get(final_url) 
download_url = response.json()['href'] 

In [4]:
matches = pd.read_csv(download_url)

In [5]:
import mlflow

mlflow.set_tracking_uri(uri="http://5.104.75.226:5000")
mlflow.set_experiment('EPL')

mlflow.autolog()

2024/01/17 20:22:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [6]:
matches.head()

Unnamed: 0,fbref_match_id,season,match_week,match_date,match_time,home_team_name,away_team_name,score,match_result,home_team_score,...,away_team_fouls,away_team_corners,away_team_crosses,away_team_touches,away_team_interceptions,away_team_aerials_won,away_team_offsides,away_team_goal_kicks,away_team_throw_ins,away_team_long_balls
0,d192bd78,2014-2015,1,2014-08-16,12:45,Manchester United,Swansea City,1:2,A,1,...,14,,28,,17,,1.0,,,
1,0a235dd1,2014-2015,1,2014-08-16,15:00,Stoke City,Aston Villa,0:1,A,0,...,14,,18,,17,,1.0,,,
2,36e3a824,2014-2015,1,2014-08-16,15:00,Leicester City,Everton,2:2,D,2,...,16,,16,,18,,,,,
3,49405949,2014-2015,1,2014-08-16,15:00,Queens Park Rangers,Hull City,0:1,A,0,...,7,,33,,10,,0.0,,,
4,5d6474b7,2014-2015,1,2014-08-16,15:00,West Ham United,Tottenham Hotspur,0:1,A,0,...,10,,28,,19,,,,,


In [7]:
matches.dtypes

fbref_match_id            object
season                    object
match_week                 int64
match_date                object
match_time                object
                          ...   
away_team_aerials_won    float64
away_team_offsides       float64
away_team_goal_kicks     float64
away_team_throw_ins      float64
away_team_long_balls     float64
Length: 65, dtype: object

In [8]:
matches['target'] = np.where(matches['match_result'] == 'H', 1, np.where(matches['match_result'] == 'D', 2, 0))

In [9]:
matches['match_date'] = pd.to_datetime(matches['match_date'])
matches['year'] = matches['match_date'].apply(lambda time: time.year)
matches['month'] = matches['match_date'].apply(lambda time: time.month)
matches['hour'] = matches['match_time'].str.replace(':.+', '', regex=True).astype('int')

In [10]:
features = ['home_team_name', 'away_team_name', 'year', 'month', 'hour', 'venue', 'home_team_manager', 'away_team_manager', 'season']
df = matches[features]
df['month'] = df['month'].astype('str')

target = matches['target']

In [11]:
y_train = matches[matches['year'] != 2023]['target']
y_val = matches[matches['year'] == 2023]['target']

In [12]:
df_oh = pd.get_dummies(df, drop_first=True, dtype=int)

x_train = df_oh[df_oh['year'] < 2023]
x_val = df_oh[df_oh['year'] == 2023]

df_oh.drop('year', axis=1)
df_oh.head()

Unnamed: 0,year,hour,home_team_name_Aston Villa,home_team_name_Bournemouth,home_team_name_Brentford,home_team_name_Brighton & Hove Albion,home_team_name_Burnley,home_team_name_Cardiff City,home_team_name_Chelsea,home_team_name_Crystal Palace,...,away_team_manager_Xisco,season_2015-2016,season_2016-2017,season_2017-2018,season_2018-2019,season_2019-2020,season_2020-2021,season_2021-2022,season_2022-2023,season_2023-2024
0,2014,12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2014,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2014,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2014,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Создание объекта PredefinedSplit
val_fold = [-1] * len(x_train) + [0] * len(x_val)
ps = PredefinedSplit(test_fold=val_fold)

model = LogisticRegression(random_state=1)
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

In [14]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=ps)

In [15]:
grid_search.fit(df_oh, target)

2024/01/17 20:22:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6e3aa675ba4d448a868cea4ed4d30155', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/01/17 20:22:32 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


In [16]:
grid_search.best_score_

0.5327380952380952

In [17]:
grid_search.best_params_

{'C': 0.1}

In [18]:
grid_search.scorer_

make_scorer(accuracy_score)

In [19]:
model = LogisticRegression(C=0.1, random_state=1)
model.fit(x_train, y_train)
predict = model.predict(x_val)
accuracy_score(y_val, predict)

2024/01/17 20:22:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '841dc4cda11a473bbaa83114db5619e0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


0.5327380952380952

In [20]:
import pandas as pd
from catboost import CatBoostClassifier

In [21]:
x_val = df[df['year']==2023][features]
x_train = df[df['year']!=2023][features]

In [22]:
cat = [col for col in df.columns if col not in ['year', 'hour']]

In [23]:
model_cat = CatBoostClassifier(learning_rate=0.001)
model_cat.fit(x_train, y_train, cat_features=cat, verbose=False)
pred = model_cat.predict(x_val)
accuracy_score(y_val, pred)

0.5446428571428571

In [24]:
param_grid = {
    'learning_rate': [0.0005, 0.001, 0.005, 0.02],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [3, 5, 7, 9]
}
grid_search = GridSearchCV(estimator=model_cat, param_grid=param_grid, scoring='accuracy', cv=ps)

In [25]:
grid_search.fit(df, target, cat_features=cat, verbose=False, eval_set=(x_val, y_val), plot=True)

2024/01/17 20:22:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '31af82734ab3466b961be3934b1682b8', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

2024/01/17 20:25:37 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.


In [26]:
grid_search.best_params_

{'depth': 4, 'l2_leaf_reg': 9, 'learning_rate': 0.02}

In [27]:
grid_search.best_score_

0.5476190476190477

# Агрегированные данные

In [28]:
path = 'prepare_data/data/df_res.csv'
df = pd.read_csv(Path(path).resolve())
df = df[df['gameweek_compSeason_label'] > 2014]

In [29]:
n = 300
train = df.iloc[n:, :]
val = df.iloc[:n, :]

y_train = train['team_1_hue']
x_train = train.drop('team_1_hue', axis=1)

y_val = val['team_1_hue']
x_val = val.drop('team_1_hue', axis=1)
x_train.shape[0]

2676

In [30]:
cat = ['gameweek_gameweek',	'gameweek_compSeason_label', 'teams_team_1_name', 'teams_team_2_name', 'ground_name']

In [31]:
model_1 = CatBoostClassifier()
model_1.fit(x_train, y_train, cat_features=cat, verbose=False)

<catboost.core.CatBoostClassifier at 0x7f1c9a0b15a0>

In [32]:
pred = model_1.predict(x_val)
accuracy_score(y_val, pred)

0.5533333333333333

In [34]:
val_fold = [-1] * len(x_train) + [0] * len(x_val)
ps = PredefinedSplit(test_fold=val_fold)

In [35]:
param_grid = {
    'learning_rate': [0.001, 0.005, 0.02, 0.05],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [3, 5, 7, 9]
}
grid_search = GridSearchCV(estimator=model_1, param_grid=param_grid, scoring='accuracy', cv=ps)

In [36]:
grid_search.fit(df.drop('team_1_hue', axis=1), df['team_1_hue'], cat_features=cat, verbose=False, eval_set=(x_val, y_val), plot=True)

2024/01/17 20:26:14 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'dba816f4bfea45c6a9189cd10e2c2ff6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



In [40]:
grid_search.best_params_

{'depth': 8, 'l2_leaf_reg': 3, 'learning_rate': 0.001}

In [39]:
grid_search.best_score_

0.46

In [43]:
model_1 = CatBoostClassifier(**grid_search.best_params_)
model_1.fit(x_train, y_train, cat_features=cat, verbose=False)
pred = model_1.predict(x_val)
accuracy_score(y_val, pred)

0.5533333333333333