In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import f1_score, r2_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../data/lichess_db_standard_rated_2013-01.csv")
df.dtypes

Events               object
results              object
white_elo             int64
black_elo             int64
white_rating_diff     int64
black_rating_diff     int64
ecos                 object
openings             object
time_control         object
termination          object
all_moves            object
dtype: object

Шахматные дебюты (они же ECO), кроются в самих ходах. В таком случае модели про них знать не обязательно (иначе зависимости в признаках). Это также касается опенингов

In [3]:
df['termination'].value_counts()

termination
Normal          83273
Time forfeit    37841
Name: count, dtype: int64

Конец игры может быть двух видов: закончилось время или игра  

In [4]:
df['results'].value_counts()

results
1-0        62009
0-1        55139
1/2-1/2     3966
Name: count, dtype: int64

Результаты бывают 3ех видов: победа белых, победа черных и ничья

In [5]:
df['Events'].value_counts()

Events
Rated Blitz game                                                      45353
Rated Classical game                                                  41679
Rated Bullet game                                                     32687
Rated Correspondence game                                               180
Rated Blitz tournament https://lichess.org/tournament/eaxo4pkh           18
                                                                      ...  
Rated Classical tournament https://lichess.org/tournament/dshcjgey        2
Rated Blitz tournament https://lichess.org/tournament/dbu4m3u3            2
Rated Blitz tournament https://lichess.org/tournament/uitsh1pw            2
Rated Blitz tournament https://lichess.org/tournament/lpotikyv            1
Rated Bullet tournament https://lichess.org/tournament/k4se9ydi           1
Name: count, Length: 151, dtype: int64

На самом деле поле time_control тоже зависит от типа события. Пока упростим задачу и будем просто угадывать для Blitz, Classical и Bullet партий рейтинги (турниры пока опустим)

In [6]:
df.head()

Unnamed: 0,Events,results,white_elo,black_elo,white_rating_diff,black_rating_diff,ecos,openings,time_control,termination,all_moves
0,Rated Classical game,1-0,1639,1403,5,-8,C00,French Defense: Normal Variation,600+8,Normal,"['e2e4', 'e7e6', 'd2d4', 'b7b6', 'a2a3', 'c8b7..."
1,Rated Classical game,1-0,1654,1919,19,-22,D04,"Queen's Pawn Game: Colle System, Anti-Colle",480+2,Normal,"['d2d4', 'd7d5', 'g1f3', 'g8f6', 'e2e3', 'c8f5..."
2,Rated Classical game,1-0,1643,1747,13,-94,C50,Four Knights Game: Italian Variation,420+17,Normal,"['e2e4', 'e7e5', 'g1f3', 'b8c6', 'f1c4', 'g8f6..."
3,Rated Bullet game,0-1,1824,1973,-6,8,B12,Caro-Kann Defense: Goldman Variation,60+1,Normal,"['e2e4', 'c7c6', 'b1c3', 'd7d5', 'd1f3', 'd5e4..."
4,Rated Bullet game,0-1,1765,1815,-9,9,C00,French Defense: La Bourdonnais Variation,60+1,Normal,"['e2e4', 'e7e6', 'f2f4', 'd7d5', 'e4e5', 'c7c5..."


Итого хотим, зная последовательность ходов, тип игры, исходные рейтинги игроков угадывать рейтинги. Подготовим нашу дату.

У каждого хода есть точка старта и точка прихода. Давайте напишем функцию, которая по позиции на доске вернет уникальное число

Также напишем функцию, которая по точке старта и финиша (2 числа) вернет уникальное число конкретного хода

In [7]:
def get_cell_index(pos: str) -> int:  # return value in [0, 63]
    return 8 * (ord(pos[0]) - ord('a')) + (ord(pos[1]) - ord('1'))


def get_move_index(move: str) -> int:
    return 64 * get_cell_index(move[0] + move[1]) + get_cell_index(move[2] + move[3])


def change_all_moves(all_moves):
    res = []
    splt = all_moves.split("'")
    for i in range(1, len(splt), 2):
        res.append(get_move_index(splt[i]))
    return res

In [8]:
df['moves_indexes'] = df['all_moves'].apply(change_all_moves)

In [9]:
df.head()

Unnamed: 0,Events,results,white_elo,black_elo,white_rating_diff,black_rating_diff,ecos,openings,time_control,termination,all_moves,moves_indexes
0,Rated Classical game,1-0,1639,1403,5,-8,C00,French Defense: Normal Variation,600+8,Normal,"['e2e4', 'e7e6', 'd2d4', 'b7b6', 'a2a3', 'c8b7...","[2147, 2469, 1627, 909, 66, 1486, 530, 3581, 1..."
1,Rated Classical game,1-0,1654,1919,19,-22,D04,"Queen's Pawn Game: Colle System, Anti-Colle",480+2,Normal,"['d2d4', 'd7d5', 'g1f3', 'g8f6', 'e2e3', 'c8f5...","[1627, 1948, 3114, 3565, 2146, 1516, 2747, 286..."
2,Rated Classical game,1-0,1643,1747,13,-94,C50,Four Knights Game: Italian Variation,420+17,Normal,"['e2e4', 'e7e5', 'g1f3', 'b8c6', 'f1c4', 'g8f6...","[2147, 2468, 3114, 981, 2579, 3565, 530, 3028,..."
3,Rated Bullet game,0-1,1824,1973,-6,8,B12,Caro-Kann Defense: Goldman Variation,60+1,Normal,"['e2e4', 'c7c6', 'b1c3', 'd7d5', 'd1f3', 'd5e4...","[2147, 1429, 530, 1948, 1578, 1827, 1187, 990,..."
4,Rated Bullet game,0-1,1765,1815,-9,9,C00,French Defense: La Bourdonnais Variation,60+1,Normal,"['e2e4', 'e7e6', 'f2f4', 'd7d5', 'e4e5', 'c7c5...","[2147, 2469, 2667, 1948, 2276, 1428, 3114, 199..."


Оставим только интересующие нас поля, пофильтруем их

In [10]:
df_clear = df[['Events', 'results', 'white_elo', 'black_elo', 'white_rating_diff', 'black_rating_diff', 'termination',
               'moves_indexes']]

In [11]:
df_clear.head()

Unnamed: 0,Events,results,white_elo,black_elo,white_rating_diff,black_rating_diff,termination,moves_indexes
0,Rated Classical game,1-0,1639,1403,5,-8,Normal,"[2147, 2469, 1627, 909, 66, 1486, 530, 3581, 1..."
1,Rated Classical game,1-0,1654,1919,19,-22,Normal,"[1627, 1948, 3114, 3565, 2146, 1516, 2747, 286..."
2,Rated Classical game,1-0,1643,1747,13,-94,Normal,"[2147, 2468, 3114, 981, 2579, 3565, 530, 3028,..."
3,Rated Bullet game,0-1,1824,1973,-6,8,Normal,"[2147, 1429, 530, 1948, 1578, 1827, 1187, 990,..."
4,Rated Bullet game,0-1,1765,1815,-9,9,Normal,"[2147, 2469, 2667, 1948, 2276, 1428, 3114, 199..."


In [12]:
df_clear = df_clear.loc[((df_clear['Events'] == 'Rated Classical game') | (
        df_clear['Events'] == 'Rated Bullet game') | (df_clear['Events'] == 'Rated Blitz game')) & (
                                abs(df_clear['white_rating_diff']) <= 10) & (
                                abs(df_clear['black_rating_diff']) <= 10)]

In [13]:
from rating_to_category import rating_to_number

df_test = df_clear[['Events', 'results', 'termination', 'moves_indexes']]
df_predict_white = df_clear['white_elo'].apply(rating_to_number)
df_predict_black = df_clear['black_elo'].apply(rating_to_number)

In [14]:
df_test.shape, df_predict_white.shape, df_predict_black.shape

((49531, 4), (49531,), (49531,))

In [15]:
encoded = pd.get_dummies(df_test, columns=['termination', 'Events', 'results'])

In [16]:
encoded.head()

Unnamed: 0,moves_indexes,termination_Normal,termination_Time forfeit,Events_Rated Blitz game,Events_Rated Bullet game,Events_Rated Classical game,results_0-1,results_1-0,results_1/2-1/2
0,"[2147, 2469, 1627, 909, 66, 1486, 530, 3581, 1...",True,False,False,False,True,False,True,False
3,"[2147, 1429, 530, 1948, 1578, 1827, 1187, 990,...",True,False,False,True,False,True,False,False
4,"[2147, 2469, 2667, 1948, 2276, 1428, 3114, 199...",True,False,False,True,False,True,False,False
7,"[2147, 2468, 1107, 3028, 3114, 1949, 1626, 389...",True,False,False,False,True,False,True,False
14,"[2147, 1428, 2667, 1948, 2268, 2012, 530, 1823...",True,False,True,False,False,False,False,True


Ходы мы не можем оставить как массив чисел. Давайте условимся, что в партии не может быть больше 300 ходов (в нашем датасете это число 360). Если ходов будет больше - просто обросим их. Напишем функцию, которая по ходам вернет новый датафрейм, который мы соеденим с нашим исходным

In [17]:
def get_new_df(moves):
    MAX_MOVE_COUNT = 300
    res = [[] for _ in range(MAX_MOVE_COUNT)]
    for my_moves in moves:
        for i in range(min(MAX_MOVE_COUNT, len(my_moves))):
            res[i].append(my_moves[i])
        for i in range(min(MAX_MOVE_COUNT, len(my_moves)), MAX_MOVE_COUNT):
            res[i].append(0)
    my_dict = {'move ' + str(i): res[i] for i in range(MAX_MOVE_COUNT)}
    return pd.DataFrame(my_dict)

In [18]:
res = get_new_df(encoded['moves_indexes'])
res.shape, encoded.shape

((49531, 300), (49531, 9))

Теперь соединим наши датафреймы и уберем столбец moves_indexes (т.к. все ходы уже записаны в отдельных признаках)

In [19]:
total_df = pd.concat([encoded.reset_index(drop=True), res.reset_index(drop=True)], axis=1)
del total_df['moves_indexes']
total_df.head()

Unnamed: 0,termination_Normal,termination_Time forfeit,Events_Rated Blitz game,Events_Rated Bullet game,Events_Rated Classical game,results_0-1,results_1-0,results_1/2-1/2,move 0,move 1,...,move 290,move 291,move 292,move 293,move 294,move 295,move 296,move 297,move 298,move 299
0,True,False,False,False,True,False,True,False,2147,2469,...,0,0,0,0,0,0,0,0,0,0
1,True,False,False,True,False,True,False,False,2147,1429,...,0,0,0,0,0,0,0,0,0,0
2,True,False,False,True,False,True,False,False,2147,2469,...,0,0,0,0,0,0,0,0,0,0
3,True,False,False,False,True,False,True,False,2147,2468,...,0,0,0,0,0,0,0,0,0,0
4,True,False,True,False,False,False,False,True,2147,1428,...,0,0,0,0,0,0,0,0,0,0


In [20]:
x_train_white, x_test_white, y_train_white, y_test_white = train_test_split(total_df, df_predict_white, train_size=0.8,
                                                                            random_state=42)
x_train_white.shape, x_test_white.shape, y_train_white.shape, y_test_white.shape

((39624, 308), (9907, 308), (39624,), (9907,))

In [21]:
y_train_white.unique(), y_test_white.unique()

(array([1, 2, 3, 5, 0, 4, 6, 7]), array([5, 2, 3, 1, 4, 0, 6]))

Обучим модель для предсказания только рейтинга белого. Рейтинг черного не должен сильно отличаться

In [22]:
models = {
    "LogisticRegression": LogisticRegression(),
    "RandomForestClassifier": RandomForestClassifier()
}
results = []
for name, model in models.items():
    model.fit(x_train_white, y_train_white)
    y_pred_test = model.predict(x_test_white)
    y_pred_train = model.predict(x_train_white)
    results.append({
        "model": name,
        "f1_score test": f1_score(y_true=y_test_white, y_pred=y_pred_test, average='micro'),
        "r2_score test": r2_score(y_true=y_test_white, y_pred=y_pred_test),
        "balanced_accuracy test": balanced_accuracy_score(y_true=y_test_white, y_pred=y_pred_test),
        "f1_score train": f1_score(y_true=y_train_white, y_pred=y_pred_train, average='micro'),
        "r2_score train": r2_score(y_true=y_train_white, y_pred=y_pred_train),
        "balanced_accuracy train": balanced_accuracy_score(y_true=y_train_white, y_pred=y_pred_train),
    })
total_res = pd.DataFrame(results)
total_res

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,model,f1_score test,r2_score test,balanced_accuracy test,f1_score train,r2_score train,balanced_accuracy train
0,LogisticRegression,0.342081,-0.102952,0.147782,0.358343,-0.094928,0.13521
1,RandomForestClassifier,0.412133,0.181512,0.199499,0.999117,0.998421,0.99901
