# Load Data

In [None]:
import pandas as pd
from sklearn import preprocessing 
# Load the raw match tables and remove columns that are not used downstream.
# ``errors='ignore'`` keeps this cell re-runnable: a later cell writes the
# cleaned frames back over 'hockey.csv' / 'football.csv', after which the
# dropped columns no longer exist and a strict drop would raise KeyError.
hockey = pd.read_csv('hockey.csv').drop(columns=['team1'], errors='ignore')

# Football rows with any missing value are discarded before the column drop.
football = (
    pd.read_csv('football.csv')
    .dropna()
    .drop(columns=['winner', 'team1'], errors='ignore')
)

In [None]:
# Write the cleaned frames back without the index column.
# NOTE: these are the same file names the frames were loaded from, so the
# original input files are overwritten in place.
hockey.to_csv('hockey.csv', index=False)
football.to_csv("football.csv", index=False)

In [None]:
# Store both football score columns as integers.
football['score1'] = football['score1'].astype(int)
football['score2'] = football['score2'].astype(int)

In [None]:
# Show the first hockey row as a quick look at the available columns.
hockey.head(1)

In [None]:
# Show the first football row as a quick look at the available columns.
football.head(1)

**Генерация фичей**

In [None]:
def get_season(month):
    """Map a month number to a season code.

    Returns 0 for December-February, 1 for March-May, 2 for June-August
    and 3 for every other value (September-November, but also anything
    that is not one of the months listed above).
    """
    season_months = ((12, 1, 2), (3, 4, 5), (6, 7, 8))
    for code, months in enumerate(season_months):
        if month in months:
            return code
    # Fallback bucket: everything that did not match an explicit season.
    return 3

def time_of_day(hour):
    """Bucket an hour into a part-of-day code.

    Returns 0 for [0, 6), 1 for [6, 12), 2 for [12, 18) and 3 for any
    other value (18 and later, negatives, NaN).
    """
    # Everything outside [0, 18) shares the last bucket; written as a
    # negated range check so that NaN also ends up here.
    if not (0 <= hour < 18):
        return 3
    if hour < 6:
        return 0
    if hour < 12:
        return 1
    return 2

def make_datetime_features(df, classes_path='data/classes.npy'):
    """Add calendar, time-of-day and score features and label-encode ``team2``.

    The new columns are written onto ``df`` itself; the returned frame is
    that data with the ``team1`` column removed when it is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``time``, ``score1``, ``score2``
        and ``team2``.
    classes_path : str, optional
        File that receives the fitted ``LabelEncoder.classes_`` array. The
        encoder is fitted anew on every call, so calling this function for
        several data sets with the default path leaves only the classes of
        the last call on disk; pass a distinct path per data set to keep
        them all.

    Returns
    -------
    pandas.DataFrame
        The frame with the added feature columns and an integer-encoded
        ``team2`` column.
    """
    # Imported locally because the surrounding file does not import them.
    import os

    import numpy as np

    label_encoder = preprocessing.LabelEncoder()
    # date features
    # NOTE(review): dates are parsed with pandas' default settings; if the
    # source uses day-first strings, ambiguous dates may be misread - confirm.
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    df['day_of_month'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    df['day_of_year'] = df['date'].dt.dayofyear
    df['week_of_year'] = df['date'].dt.isocalendar().week
    df['season'] = df['month'].apply(get_season)
    df['is_month_start_end'] = (
        df['date'].dt.is_month_start | df['date'].dt.is_month_end
    ).astype(int)

    # time features
    df['time'] = pd.to_datetime(df['time']).dt.time
    df['hour'] = df['time'].apply(lambda x: x.hour)
    df['minute'] = df['time'].apply(lambda x: x.minute)
    df['time_of_day'] = df['hour'].apply(time_of_day)

    # 9 <= hour < 18
    df['is_work_time'] = df['hour'].between(9, 18, inclusive='left').astype(int)
    df['decimal_time'] = df['hour'] + df['minute'] / 60.0
    df['is_afternoon'] = (df['hour'] >= 12).astype(int)

    # Days left until Saturday (day_of_week == 5); 0 on Saturday, 6 on Sunday.
    df['days_until_weekend'] = (5 - df['day_of_week']) % 7
    df['days_until_end_of_month'] = df['date'].dt.days_in_month - df['day_of_month']

    df['scorediff'] = df['score1'] - df['score2']
    df['team2'] = label_encoder.fit_transform(df['team2'])

    # ``team1`` may already have been removed when the data was loaded, so a
    # missing column must not raise here.
    df = df.drop(columns=['team1'], errors='ignore')

    # np.save does not create missing parent directories.
    target_dir = os.path.dirname(classes_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    np.save(classes_path, label_encoder.classes_)
    return df

def make_team_features(df):
    """Attach per-opponent and per-season score aggregates to ``df``.

    Reads the ``team2``, ``scorediff`` and ``season`` columns, adds five
    new columns to ``df`` in place and returns the same frame. All
    aggregates except the running sum are computed over every row of the
    group and broadcast back to each row.
    """
    def share_of_wins(diffs):
        # Fraction of rows in the group with a positive score difference.
        return (diffs > 0).sum() / len(diffs)

    # Compute everything first, then attach the columns in one place.
    mean_by_team = df.groupby('team2')['scorediff'].transform('mean')
    games_by_team = df.groupby('team2')['team2'].transform('count')
    running_diff = df.groupby('team2')['scorediff'].cumsum()
    wins_by_team = df.groupby('team2')['scorediff'].transform(share_of_wins)
    mean_by_season = df.groupby('season')['scorediff'].transform('mean')

    df['avg_scorediff_by_team'] = mean_by_team
    df['games_played_with_team'] = games_by_team
    # Running total in row order within each opponent group.
    df['cumulative_scorediff_with_team'] = running_diff
    df['win_ratio_with_team'] = wins_by_team
    df['avg_scorediff_by_season'] = mean_by_season
    return df


ХОККЕЙ

In [None]:
# Hockey: date/time features first, then the per-opponent aggregates on top.
hockey = make_team_features(make_datetime_features(hockey))
hockey.head(1)

In [None]:
# Football: date/time features first, then the per-opponent aggregates on top.
football = make_team_features(make_datetime_features(football))
football.head(1)

In [None]:
# Persist the feature tables (without the index column) under data/;
# retrain_model reads these two files back.
hockey.to_csv("data/hockey.csv", index=False)
football.to_csv("data/football.csv", index=False)

# Обучение модели

In [None]:
from catboost import CatBoostRegressor,CatBoostClassifier
def retrain_model(sport_name):
    """Retrain and save the regressor and classifier for one sport.

    Reads ``data/hockey.csv`` when ``sport_name == 'hockey'`` and
    ``data/football.csv`` for any other value, shuffles the rows, fits a
    ``CatBoostRegressor`` and a ``CatBoostClassifier`` on the same features
    and target, and writes both models to
    ``models/<sport_name>/<sport_name>_regressor`` and ``..._classifier``.

    Parameters
    ----------
    sport_name : str
        Selects the input file and names the output directory and files.

    Returns
    -------
    bool
        Always ``True`` once both models have been saved.
    """
    import os

    data_path = r'data/hockey.csv' if sport_name == 'hockey' else r'data/football.csv'
    df = pd.read_csv(data_path)
    # Shuffle the rows (no fixed seed, so the order differs between runs).
    df = df.sample(frac=1)

    # Feature order is kept exactly as before: the saved models depend on it.
    feature_cols = [
        'team2', 'year', 'day_of_week', 'is_weekend', 'day_of_month', 'month',
        'quarter', 'day_of_year', 'week_of_year', 'season',
        'is_month_start_end', 'hour', 'minute', 'time_of_day', 'is_work_time',
        'decimal_time', 'is_afternoon', 'days_until_weekend',
        'days_until_end_of_month', 'avg_scorediff_by_team',
        'games_played_with_team', 'cumulative_scorediff_with_team',
        'win_ratio_with_team', 'avg_scorediff_by_season',
    ]
    X, y = df[feature_cols], df['scorediff']

    # Make sure the output directory exists before saving into it.
    model_dir = f'models/{sport_name}'
    os.makedirs(model_dir, exist_ok=True)

    reg_model = CatBoostRegressor(eval_metric='MAE')
    reg_model.fit(X, y, verbose=False)
    reg_model.save_model(f'{model_dir}/{sport_name}_regressor')

    # NOTE(review): the classifier is fitted on the raw score difference, so
    # every distinct difference is its own class, whereas the evaluation
    # cells collapse it to loss/draw/win (0/1/2) - confirm which target the
    # consumers of this model expect.
    classif_model = CatBoostClassifier(eval_metric='TotalF1')
    classif_model.fit(X, y, verbose=False)
    classif_model.save_model(f'{model_dir}/{sport_name}_classifier')

    return True

In [None]:
# Retrain and save both models for each sport from the data/*.csv tables.
retrain_model('hockey')
retrain_model('football')

# Проверить точность (допустим на футболе)

Regression

In [None]:
from sklearn.model_selection import train_test_split
# Same column list as in retrain_model: all features plus the 'scorediff' target.
cols = ['team2', 'year', 'day_of_week', 'is_weekend', 'day_of_month', 'month', 'quarter', 'day_of_year', 'week_of_year', 
        'season', 'is_month_start_end', 'hour', 'minute', 'time_of_day', 'is_work_time', 'decimal_time', 'is_afternoon',
          'days_until_weekend', 'days_until_end_of_month', 'scorediff', 'avg_scorediff_by_team', 'games_played_with_team', 
          'cumulative_scorediff_with_team', 'win_ratio_with_team', 'avg_scorediff_by_season']
# NOTE: this rebinds `football` to the column subset; every other column is
# gone from the notebook variable after this cell.
football = football[cols]
X, y = football.drop('scorediff', axis=1), football['scorediff']
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=56)

In [None]:
from catboost import CatBoostRegressor

# Fit a score-difference regressor and track MAE on the held-out rows.
model = CatBoostRegressor(eval_metric='MAE')
model.fit(X_train, y_train, eval_set=(X_test, y_test))

Classification

In [None]:
# Collapse the score difference into an outcome class:
# 2 = positive difference, 1 = zero, 0 = everything else.
ynew = [2 if diff > 0 else 1 if diff == 0 else 0 for diff in y]

In [None]:
# Re-split with the class labels as target; test_size and random_state are
# the same as in the regression split. This overwrites the regression
# X_train / X_test / y_train / y_test variables.
X_train, X_test, y_train, y_test = train_test_split(X, ynew, test_size=0.2, random_state=56)

In [None]:
from catboost import CatBoostClassifier

# Fit an outcome classifier and track accuracy on the held-out rows.
# NOTE: `model` is rebound here, replacing the regressor fitted earlier.
model = CatBoostClassifier(eval_metric='Accuracy')
model.fit(X_train, y_train, eval_set=(X_test, y_test))

In [None]:
from catboost import CatBoostRegressor, CatBoostClassifier
def predict_match(sport_name, row):
    """Load the saved regressor and classifier for ``sport_name``.

    ``'hockey'`` is looked up under ``models/hockey/``; any other value is
    looked up under ``models/football/``, still using ``sport_name`` as the
    file-name prefix. ``row`` is accepted but not used, and nothing is
    returned: as written, the function only loads the two models.
    """
    path = (
        f'models/hockey/{sport_name}'
        if sport_name == 'hockey'
        else f'models/football/{sport_name}'
    )

    regressor = CatBoostRegressor()
    regressor.load_model(path + "_regressor", format='cbm')

    classifier = CatBoostClassifier()
    classifier.load_model(path + "_classifier", format='cbm')

# Smoke call: checks that both hockey model files load. The second argument
# is ignored by predict_match as written, and the call returns None.
predict_match('hockey', 'display')

In [None]:
from newutils import MatchPredictor
# Predictor object from the project's newutils module, used by the cells below.
a = MatchPredictor()

In [None]:
# Predict from a free-text match description.
# NOTE(review): the string looks like "<date> <time> <opponent>" with a
# deliberately garbled team name - presumably a check of how an unknown
# opponent is handled; confirm against newutils.MatchPredictor.
a.predict_match('hockey', '27.11.2024 18:20 Бончевскиеdasda Тигры')

In [None]:
# Generate the hockey plots with save=True (behavior is defined in newutils).
a.save_all_plots('hockey', save=True)

In [None]:
# Generate the football plots with save=True (behavior is defined in newutils).
a.save_all_plots('football', save=True)

In [None]:
import pandas as pd

# Build the per-opponent aggregate features for a single team as a one-row frame.
team_name = 'Бончевские Тигры'

train = pd.read_csv('data/hockey.csv')
# The frame must be built from a column -> values mapping; a set literal
# such as {'team2', name} is rejected by the DataFrame constructor.
df = pd.DataFrame({'team2': [team_name]})

# NOTE(review): the cell that writes data/hockey.csv stores `team2` after
# label-encoding, so comparing it with the display name probably never
# matches and the default branch is always taken - confirm, and map the name
# through the saved encoder classes if so.
is_team = train.team2 == team_name
if is_team.any():
    train = train[is_team]
    diffs = train['scorediff']

    # Scalars are assigned so they fill the single row; assigning the
    # group-indexed Series would align on index labels and yield NaN.
    df['avg_scorediff_by_team'] = diffs.mean()
    df['games_played_with_team'] = len(train)
    df['cumulative_scorediff_with_team'] = diffs.cumsum().mean()
    df['win_ratio_with_team'] = (diffs > 0).sum() / len(train)
    # NOTE(review): the per-season means are averaged into one number because
    # no season is known for this row - confirm the intended value.
    df['avg_scorediff_by_season'] = train.groupby('season')['scorediff'].mean().mean()
else:
    # Defaults for an opponent with no recorded games.
    df['avg_scorediff_by_team'] = 0
    df['games_played_with_team'] = 0
    df['cumulative_scorediff_with_team'] = 0
    df['win_ratio_with_team'] = -1
    df['avg_scorediff_by_season'] = 0

In [None]:
# Mean score difference per opponent, then the entry selected by `[0]`.
# NOTE(review): on an integer `team2` index `[0]` is a label lookup (team
# code 0), not "first row" - confirm which one is intended.
train.groupby('team2')['scorediff'].mean()[0]