In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import f1_score, r2_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Move-level data is stored in three CSV files; read each one into its own frame.
MOVE_BATCH_PATHS = (
    "../data/first_240.csv",
    "../data/from_240_to_360.csv",
    "../data/from_360_480.csv",
)
df_first240, df_from_240_to_360, df_from_360_to_480 = (
    pd.read_csv(batch_path) for batch_path in MOVE_BATCH_PATHS
)

In [3]:
# Stack the three move frames (in file order) into a single move-level table.
move_batches = [df_first240, df_from_240_to_360, df_from_360_to_480]
df_moves = pd.concat(move_batches)
# Number of leading rows of the games table that the analysis keeps.
GAME_COUNT = 480

In [4]:
# Load the game-level table and keep only its first GAME_COUNT rows.
df_games = pd.read_csv("../data/clear_data.csv").head(GAME_COUNT)
# Displayed as a sanity check: how many rows share each game_id.
game_id_counts = df_games['game_id'].value_counts()
game_id_counts

Для начала попробуем предсказывать рейтинг только белого игрока. Для чёрного всё должно быть аналогично.

In [5]:
def percent_best_move(game_id, ost=0):
    """Share of one side's moves that equal one of the three recorded best-line moves.

    Reads the module-level ``df_moves`` frame. Rows are selected by ``game_id``
    and by the parity of ``move_number`` (``move_number % 2 == ost``).

    Args:
        game_id: Identifier of the game to look up in ``df_moves``.
        ost: 0 to look at White's moves, otherwise 1.

    Returns:
        Fraction in [0, 1] of the selected rows whose ``move`` equals
        ``best_line_1_move``, ``best_line_2_move`` or ``best_line_3_move``.
        NaN when no rows are selected (e.g. the game is missing from
        ``df_moves``), because the ratio is undefined in that case.
    """
    side_moves = df_moves[(df_moves['game_id'] == game_id) & (df_moves['move_number'] % 2 == ost)]
    # Without this guard a game with no rows for the side raised ZeroDivisionError.
    if side_moves.empty:
        return float('nan')
    played = side_moves['move']
    matches_engine = (
        (played == side_moves['best_line_1_move'])
        | (played == side_moves['best_line_2_move'])
        | (played == side_moves['best_line_3_move'])
    )
    return int(matches_engine.sum()) / len(side_moves)

In [6]:
from rating_to_category import rating_to_number_easy

# Each entry: (new column, source column, per-value transform).
white_column_specs = (
    ('white_percent_best_move', 'game_id', percent_best_move),
    ('white_rating_num', 'white_elo', rating_to_number_easy),
)
for target_column, source_column, transform in white_column_specs:
    df_games[target_column] = df_games[source_column].apply(transform)
df_games

In [7]:
# Distribution of the target: how many games fall into each white_rating_num value.
df_games['white_rating_num'].value_counts()

In [8]:
# Baseline feature table: three categorical game attributes (one-hot encoded)
# plus the share of White's moves that matched one of the recorded best lines.
baseline_columns = ['Events', 'results', 'termination', 'white_percent_best_move']
categorical_columns = ['termination', 'Events', 'results']
df_train = pd.get_dummies(df_games[baseline_columns], columns=categorical_columns)
df_train

В качестве признаков возьмём долю ходов, совпавших с одной из трёх лучших линий движка, а также тип партии, её результат и способ завершения (в виде one-hot признаков).

In [9]:
# 80/20 split with a fixed seed; the target is the white_rating_num column.
white_split = train_test_split(
    df_train,
    df_games['white_rating_num'],
    train_size=0.8,
    random_state=42,
)
x_train_white, x_test_white, y_train_white, y_test_white = white_split
# Show the four shapes to confirm the split sizes.
tuple(part.shape for part in white_split)

In [10]:
from catboost import CatBoostClassifier


def get_fit_predict_res(x_train, x_test, y_train, y_test, random_state=42):
    """Fit three baseline classifiers and tabulate their train/test metrics.

    The models are LogisticRegression, RandomForestClassifier and
    CatBoostClassifier. Each is fitted on ``(x_train, y_train)`` and then used
    to predict both ``x_test`` and ``x_train``.

    Args:
        x_train: Training feature table.
        x_test: Test feature table.
        y_train: Training target values.
        y_test: Test target values.
        random_state: Seed passed to every model so repeated runs give the same
            metrics (the random forest was previously unseeded). Pass ``None``
            to leave the models unseeded.

    Returns:
        pandas.DataFrame with one row per model and the columns ``model``,
        ``f1_score test``, ``r2_score test``, ``balanced_accuracy test``,
        ``f1_score train``, ``r2_score train``, ``balanced_accuracy train``.
    """
    models = {
        "LogisticRegression": LogisticRegression(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
        # verbose=False keeps CatBoost's per-iteration log out of the cell output.
        "Catboost": CatBoostClassifier(random_seed=random_state, verbose=False),
    }
    # NOTE(review): r2_score treats the class numbers as numeric values, so it is
    # only meaningful if the rating categories are ordinal - confirm.
    metrics = {
        "f1_score": lambda y_true, y_pred: f1_score(y_true=y_true, y_pred=y_pred, average='micro'),
        "r2_score": r2_score,
        "balanced_accuracy": balanced_accuracy_score,
    }
    results = []
    for name, model in models.items():
        model.fit(x_train, y_train)
        # Test split first, then train, to keep the original column order.
        predictions = {
            "test": (y_test, model.predict(x_test)),
            "train": (y_train, model.predict(x_train)),
        }
        row = {"model": name}
        for split_name, (y_true, y_pred) in predictions.items():
            for metric_name, metric in metrics.items():
                row[f"{metric_name} {split_name}"] = metric(y_true=y_true, y_pred=y_pred)
        results.append(row)
    return pd.DataFrame(results)


# Baseline run: fit and score all models on the current white-rating split.
get_fit_predict_res(x_train_white, x_test_white, y_train_white, y_test_white)

Результаты не впечатляют. Попробуем убрать все партии в пулю (bullet): предположительно, в них игроки почти не думают над ходами.

In [11]:
# Keep every game whose Events value is not exactly 'Rated Bullet game'.
is_bullet = df_games['Events'] == 'Rated Bullet game'
df_games_without_bullet = df_games[~is_bullet]
# Displayed to verify which event types remain.
df_games_without_bullet['Events'].value_counts()

In [12]:
# Same baseline features as before, rebuilt from the bullet-free games.
bullet_free_columns = ['Events', 'results', 'termination', 'white_percent_best_move']
df_train = pd.get_dummies(
    df_games_without_bullet[bullet_free_columns],
    columns=['termination', 'Events', 'results'],
)
white_split = train_test_split(
    df_train,
    df_games_without_bullet['white_rating_num'],
    train_size=0.8,
    random_state=42,
)
x_train_white, x_test_white, y_train_white, y_test_white = white_split
# Show the four shapes to confirm the split sizes.
tuple(part.shape for part in white_split)

In [13]:
# Re-run the same models on the split built from the bullet-free games.
get_fit_predict_res(x_train_white, x_test_white, y_train_white, y_test_white)

Возможно, дело в том, что признак всего один. Давайте добавим в наш датафрейм ещё признаков и посмотрим, что из этого выйдет.

In [14]:
def min_max_delta_centipawns(game_id, ost=0, want=0):
    """Summarise one side's ``centipawns`` values for a game, scaled by 1/1000.

    Reads the module-level ``df_moves`` frame, takes the rows of ``game_id``
    sorted by ``move_number`` and walks through them in that order. A row
    belongs to the requested side when its position in the sorted sequence
    (0, 1, 2, ...) has parity ``ost``; rows with a NaN ``centipawns`` value
    are skipped.

    For each remaining row the delta is ``centipawns`` minus the previous
    row's ``centipawns`` (0 before the first row). A delta is dropped when the
    previous row's value is NaN.

    Args:
        game_id: Identifier of the game to look up in ``df_moves``.
        ost: 0 to look at White's moves, 1 for the other side.
        want: 0 for the minimum delta, 1 for the maximum delta, any other
            value for the median of the side's ``centipawns`` values.

    Returns:
        The selected statistic divided by 1000, or NaN when there is nothing
        to aggregate (for example, the game has no rows in ``df_moves``).
    """
    evaluations = df_moves[df_moves['game_id'] == game_id].sort_values(by='move_number')['centipawns']
    deltas = []
    side_evaluations = []
    previous = 0
    for ply_index, centipawns in enumerate(evaluations):
        if ply_index % 2 == ost and not np.isnan(centipawns):
            side_evaluations.append(centipawns)
            delta = centipawns - previous
            # A NaN previous evaluation makes the delta undefined; skip it.
            if not np.isnan(delta):
                deltas.append(delta)
        previous = centipawns

    # Aggregate the collected values directly: fixed +/-1000 starting bounds
    # would silently clamp the min/max, and dividing a running sum by the
    # count of all plies is not a median.
    if want == 0:
        selected = min(deltas) if deltas else np.nan
    elif want == 1:
        selected = max(deltas) if deltas else np.nan
    else:
        selected = np.median(side_evaluations) if side_evaluations else np.nan
    return selected / 1000


def min_centipawns_white(game_id):
    """Return the ``want=0`` (minimum) statistic of ``min_max_delta_centipawns`` for White (``ost=0``)."""
    selector = {'ost': 0, 'want': 0}
    return min_max_delta_centipawns(game_id, **selector)


def max_centipawns_white(game_id):
    """Return the ``want=1`` (maximum) statistic of ``min_max_delta_centipawns`` for White (``ost=0``)."""
    selector = {'ost': 0, 'want': 1}
    return min_max_delta_centipawns(game_id, **selector)


def median_centipawns_white(game_id):
    """Return the ``want=2`` statistic of ``min_max_delta_centipawns`` (the one it labels as the median) for White (``ost=0``)."""
    selector = {'ost': 0, 'want': 2}
    return min_max_delta_centipawns(game_id, **selector)


def min_centipawns_black(game_id):
    """Return the ``want=0`` (minimum) statistic of ``min_max_delta_centipawns`` for the ``ost=1`` side (Black)."""
    selector = {'ost': 1, 'want': 0}
    return min_max_delta_centipawns(game_id, **selector)


def max_centipawns_black(game_id):
    """Return the ``want=1`` (maximum) statistic of ``min_max_delta_centipawns`` for the ``ost=1`` side (Black)."""
    selector = {'ost': 1, 'want': 1}
    return min_max_delta_centipawns(game_id, **selector)


def median_centipawns_black(game_id):
    """Return the ``want=2`` statistic of ``min_max_delta_centipawns`` (the one it labels as the median) for the ``ost=1`` side (Black)."""
    selector = {'ost': 1, 'want': 2}
    return min_max_delta_centipawns(game_id, **selector)


# New column name -> per-game function that computes it from the game_id.
white_centipawn_features = {
    'min_delta_centipawns_white': min_centipawns_white,
    'max_delta_centipawns_white': max_centipawns_white,
    'median_centipawns_white': median_centipawns_white,
}
for feature_name, feature_func in white_centipawn_features.items():
    df_games[feature_name] = df_games['game_id'].apply(feature_func)
# Display the games ordered by White's rating to eyeball the new columns.
df_games.sort_values(by='white_elo')

In [15]:
# One scatter plot per feature, each against White's rating, in a separate figure.
white_features_to_plot = (
    'max_delta_centipawns_white',
    'median_centipawns_white',
    'min_delta_centipawns_white',
    'white_percent_best_move',
)
for feature_name in white_features_to_plot:
    plt.scatter(df_games['white_elo'], df_games[feature_name])
    plt.show()

Попробуем обучиться на этих признаках

In [16]:
# Display the games table with the feature columns added so far.
df_games

In [17]:
# Feature table made only of the four numeric per-game features for White.
numeric_white_features = [
    'white_percent_best_move',
    'min_delta_centipawns_white',
    'max_delta_centipawns_white',
    'median_centipawns_white',
]
df_train = df_games[numeric_white_features]
white_split = train_test_split(
    df_train,
    df_games['white_rating_num'],
    train_size=0.8,
    random_state=42,
)
x_train_white, x_test_white, y_train_white, y_test_white = white_split
# Show the four shapes to confirm the split sizes.
tuple(part.shape for part in white_split)

In [18]:
# Score the same models on the split built from the four numeric features.
get_fit_predict_res(x_train_white, x_test_white, y_train_white, y_test_white)