# Imports

In [1]:
import ast
import math
import random

import pandas as pd
import sklearn

# Initial Loading of Data

In [2]:
# Load the evaluated games, discard rows whose Winner is 0.5 (presumably draws — TODO confirm),
# then discard rows with any missing value and renumber the index from 0.
# NOTE(review): the absolute Windows path ties this cell to a single machine; consider a
# configurable data directory.
raw_df = pd.read_csv(r'C:\Users\joesp\Documents\Projects\lichess\data\mass_evaled_games_rh.csv')
raw_df = raw_df[raw_df['Winner'] != .5]
raw_df = raw_df.dropna().reset_index(drop=True)

### Verify that Clock Times and Evals Match

In [3]:
# 'Clocks' and 'Evals' come out of the CSV as strings holding Python list literals.
# ast.literal_eval only parses literals (lists, numbers, strings, ...); the previous eval()
# would have executed any expression embedded in the data file.
raw_df['Clocks'] = raw_df['Clocks'].apply(ast.literal_eval)
raw_df['Evals'] = raw_df['Evals'].apply(ast.literal_eval)

# Keep only the games that have exactly one clock reading per evaluation.
raw_df = raw_df[raw_df['Clocks'].str.len() == raw_df['Evals'].str.len()].reset_index(drop=True)

### Split Into Test and Train Data Set

In [4]:
from sklearn.model_selection import train_test_split

def create_test_train(df, test_size=.2, random_state=None):
    """Randomly split ``df`` into a train frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split row-wise.
    test_size : float, default 0.2
        Passed to ``train_test_split``; the share of rows placed in the test frame.
    random_state : int or None, default None
        Passed to ``train_test_split``. Supply an int to get the same split on
        every run; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both frames are re-indexed from 0.
    """
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    # Return re-indexed copies instead of mutating the split results in place.
    return train.reset_index(drop=True), test.reset_index(drop=True)

In [5]:
# Split whole games into train/test frames before the per-game expansion further down.
train_df, test_df = create_test_train(raw_df)
# NOTE(review): `datasets` is not referenced again in the visible cells, and it keeps pointing
# at these pre-expansion frames once train_df/test_df are rebound later.
datasets = [train_df, test_df]

### Expand Data on Clock and Eval

In [6]:
# Upper bound on the number of (clock, eval) samples drawn from each game.
num_rows_per_game = 5

def expand_games_clock_evals(df, num_rows_per_game):
    """Expand every game into randomly sampled (white clock, black clock, eval) rows.

    For each game, ``min(num_rows_per_game, number of black clock readings)`` ply
    indexes are drawn with ``random.choices`` (i.e. with replacement, so the same
    ply can be drawn more than once). Each draw yields one output row made of the
    game's other columns plus the clocks and evaluation at that ply.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per game, indexed 0..n-1 (rows are read with ``df.at[i, ...]``).
        Must contain 'Clocks' and 'Evals' columns holding sequences; clock entries
        at even positions are treated as White's and odd positions as Black's.
    num_rows_per_game : int
        Maximum number of rows to emit per game.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` except 'Clocks' and 'Evals', followed by
        'White_Clock', 'Black_Clock', 'Eval' and 'Is_Mate'. 'Is_Mate' is True
        where the sampled eval string contained '#'; that character is removed
        from 'Eval'.
    """
    columns = df.columns.append(pd.Index(['White_Clock', 'Black_Clock', 'Eval'])).drop(pd.Index(['Clocks', 'Evals']))
    ret_df = []

    for i in range(len(df)):
        clocks = df.at[i, 'Clocks']
        evals = df.at[i, 'Evals']

        white_clocks = clocks[0::2]
        black_clocks = clocks[1::2]

        num_rows = min(num_rows_per_game, len(black_clocks))
        if num_rows <= 0:
            continue

        # Drop the list columns by NAME so the values line up with `columns` even when
        # 'Clocks'/'Evals' are not the last two columns of the frame.
        base_row = df.iloc[i].drop(['Clocks', 'Evals']).tolist()
        last_index = len(evals) - 1

        for index in random.choices(range(len(evals)), k=num_rows):
            if index == last_index:
                # Final ply: use the last recorded reading of each side.
                white_clock, black_clock = white_clocks[-1], black_clocks[-1]
            else:
                # ceil(index / 2) for White, floor(index / 2) for Black.
                white_clock = white_clocks[(index + 1) // 2]
                black_clock = black_clocks[index // 2]
            ret_df.append(base_row + [white_clock, black_clock, evals[index]])

    ret_df = pd.DataFrame(ret_df, columns=list(columns))

    # '#' is matched literally; it marks a mate evaluation in the eval strings.
    ret_df['Is_Mate'] = False
    ret_df.loc[ret_df['Eval'].str.contains('#', regex=False), 'Is_Mate'] = True
    ret_df['Eval'] = ret_df['Eval'].str.replace('#', '', regex=False)

    return ret_df

def shuffle(df, random_state=None):
    """Return the rows of ``df`` in random order with a fresh 0..n-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shuffle; it is not modified.
    random_state : int or None, default None
        Passed to ``DataFrame.sample``. Supply an int for a repeatable order;
        ``None`` keeps the previous unseeded behaviour.
    """
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Expand each split into per-ply samples, then randomise the row order (train first, then test).
train_df = shuffle(expand_games_clock_evals(train_df, num_rows_per_game))
test_df = shuffle(expand_games_clock_evals(test_df, num_rows_per_game))

### One-Hot Encoding of Perf

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training split only, so the indicator columns are defined by
# the 'Perf' values seen in training.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(train_df[['Perf']])

new_columns = [f'Perf_{category}' for category in encoder.categories_[0]]

for frame in (train_df, test_df):
    # Create the indicator columns, fill them from the encoder, then retire the raw column.
    for column in new_columns:
        frame[column] = 0
    frame[new_columns] = encoder.transform(frame[['Perf']]).toarray()
    frame.drop(columns=['Perf'], inplace=True)

# Label as int and evaluation as float for modelling.
dtype_overrides = {"Winner": int, "Eval": float}
train_df = train_df.astype(dtype_overrides)
test_df = test_df.astype(dtype_overrides)

### Final Cleaning

In [8]:
# Remove the username columns from both splits.
for frame in (train_df, test_df):
    frame.drop(columns=['White_Username', 'Black_Username'], inplace=True)

In [9]:
def create_labels_data(df, label_column):
    """Split ``df`` into its feature columns and its label column.

    Returns a ``(data, labels)`` pair: ``data`` holds every column whose name
    differs from ``label_column`` (original order kept) and ``labels`` is the
    ``label_column`` Series.
    """
    feature_columns = [name for name in df.columns if name != label_column]
    return df[feature_columns], df[label_column]

In [10]:
# Separate the 'Winner' label from the feature columns in each split.
X_train, y_train = create_labels_data(train_df, 'Winner')
X_test, y_test = create_labels_data(test_df, 'Winner')

### Rearrange Columns for Understandability

In [11]:
# Fixed feature order used for both splits.
# NOTE(review): the Perf_* names are hard-coded, while the actual one-hot columns are derived
# from the 'Perf' values present in the training split; selecting a name that was never
# created raises a KeyError.
column_order = ['Perf_Blitz', 'Perf_Bullet', 'Perf_Classical', 'Perf_Rapid', 
        'Perf_UltraBullet', 'White_Rating', 'White_UltraBullet_Rating', 
        'White_Bullet_Rating', 'White_Blitz_Rating', 'White_Rapid_Rating',
       'White_Classical_Rating', 'White_Correspondence_Rating', 'Black_Rating',
        'Black_UltraBullet_Rating', 'Black_Bullet_Rating', 'Black_Blitz_Rating',
       'Black_Rapid_Rating', 'Black_Classical_Rating', 'Black_Correspondence_Rating',
       'Clock_Initial', 'Clock_Increment', 'White_Clock', 'Black_Clock', 'Eval', 'Is_Mate']

# Apply the same column order to train and test features.
X_train = X_train[column_order]
X_test = X_test[column_order]

# Initial Modeling of Data

### Create Baseline Models

In [12]:
# Raw baseline table; the code below reads its 'Game Speed', 'Rating Difference',
# 'Total Points', 'Total Games' and 'Total Draws' columns.
# NOTE(review): absolute local path — consider a configurable data directory.
prob_baseline_raw = pd.read_csv(r'C:\Users\joesp\Documents\Projects\lichess\data\baseline-june.csv')

def columns_group(df, game_speed, round_num, number):
    """Aggregate the baseline rows of one game speed that fall into one rating bucket.

    A row belongs to the bucket when its 'Rating Difference', rounded to the
    nearest multiple of ``round_num``, equals ``number`` and its 'Game Speed'
    equals ``game_speed``.

    Returns the list ``[game_speed, number, sum of 'Total Points',
    sum of 'Total Games', sum of 'Total Draws']``.
    """
    in_bucket = round_to(df['Rating Difference'], round_num) == number
    same_speed = df["Game Speed"] == game_speed
    rows = df.loc[in_bucket & same_speed]

    totals = [rows[name].sum() for name in ('Total Points', 'Total Games', 'Total Draws')]
    return [game_speed, number] + totals

def round_to(x, to):
    """Round ``x`` to the nearest multiple of ``to``.

    The rounding itself is delegated to the built-in ``round``, so for plain
    numbers an exact half goes to the even multiple (``round_to(125, 50)`` is
    100). ``x`` may be anything that supports division and ``round``, such as
    a pandas Series.
    """
    whole_steps = round(x / to)
    return to * whole_steps

# Re-bin the raw baseline table into fixed-width rating-difference buckets per game speed.
bin_width = 50
baseline_rows = []

for gs in prob_baseline_raw['Game Speed'].unique():
    speed_diffs = prob_baseline_raw.loc[prob_baseline_raw['Game Speed'] == gs, 'Rating Difference']
    # Round the extremes with the same width as the buckets so the outermost data points
    # land in a generated bucket.
    minimum = int(round_to(speed_diffs.min(), bin_width))
    maximum = int(round_to(speed_diffs.max(), bin_width))
    # range() excludes its stop value, so extend it by one bucket to keep the top bucket.
    for num in range(minimum, maximum + bin_width, bin_width):
        baseline_rows.append(columns_group(prob_baseline_raw, gs, bin_width, num))

# Build the frame once instead of growing it one .loc row at a time.
prob_baseline = pd.DataFrame(baseline_rows, columns=prob_baseline_raw.columns)

# Empty buckets would divide by zero; .copy() makes the filtered frame safe to extend.
prob_baseline = prob_baseline[prob_baseline['Total Games'] != 0].copy()
prob_baseline['Winning_Percentage'] = prob_baseline['Total Points']/prob_baseline['Total Games']
prob_baseline = prob_baseline.drop(columns=['Total Points', 'Total Games', 'Total Draws'])

In [13]:
def baseline_rating_score(df, label):
    """Accuracy of the rule "the higher-rated player wins".

    A row is counted as correct when 'White_Rating' is greater than
    'Black_Rating' and the label is 0, or 'White_Rating' is smaller and the
    label is 1. Rows with equal ratings are always counted as wrong.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'White_Rating' and 'Black_Rating'.
    label : pandas.Series
        One label per row of ``df``, paired with the rows by position (the
        index values of ``df`` and ``label`` are not used).

    Returns
    -------
    float
        Share of correct rows.

    Raises
    ------
    ZeroDivisionError
        If ``df`` has no rows.
    """
    white = df['White_Rating'].to_numpy()
    black = df['Black_Rating'].to_numpy()
    winner = label.to_numpy()

    correct = ((white > black) & (winner == 0)) | ((white < black) & (winner == 1))
    return int(correct.sum()) / len(df)

def baseline_eval_score(df, label):
    """Accuracy of the rule "the side favoured by the evaluation wins".

    A row is counted as correct when 'Eval' is positive and the label is 0, or
    'Eval' is negative and the label is 1. Rows with an evaluation of exactly 0
    are always counted as wrong.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'Eval' column.
    label : pandas.Series
        One label per row of ``df``, paired with the rows by position (the
        index values of ``df`` and ``label`` are not used).

    Returns
    -------
    float
        Share of correct rows.

    Raises
    ------
    ZeroDivisionError
        If ``df`` has no rows.
    """
    evaluation = df['Eval'].to_numpy()
    winner = label.to_numpy()

    correct = ((evaluation > 0) & (winner == 0)) | ((evaluation < 0) & (winner == 1))
    return int(correct.sum()) / len(df)

### First Modeling

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

In [15]:
def rmse_rf(rf, data, labels):
    """Root-mean-squared error between a model's predicted probability and the labels.

    The probability used is the second entry (position 1) of every row
    returned by ``rf.predict_proba(data)``.
    """
    second_column_probs = [row[1] for row in rf.predict_proba(data)]
    squared_error = mean_squared_error(second_column_probs, labels)
    return math.sqrt(squared_error)

def rmse_baseline(data, labels, prob_baseline, game_speed='Bullet'):
    """Root-mean-squared error of the rating-difference baseline table.

    For every row, the White-minus-Black rating difference is rounded to the
    nearest multiple of 50 and looked up in ``prob_baseline``; the prediction
    is ``1 - Winning_Percentage`` of the matching bucket.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'White_Rating' and 'Black_Rating'.
    labels : array-like
        True labels, one per row of ``data``.
    prob_baseline : pandas.DataFrame
        Table with 'Game Speed', 'Rating Difference' and 'Winning_Percentage'.
    game_speed : str, default 'Bullet'
        Which 'Game Speed' rows of ``prob_baseline`` to use. The default keeps
        the previously hard-coded behaviour.

    Raises
    ------
    KeyError
        If a row's rating-difference bucket is missing from the table for
        ``game_speed``.
    """
    # Filter the table once and index it by bucket instead of re-filtering for every row.
    speed_rows = prob_baseline[prob_baseline['Game Speed'] == game_speed]
    win_pct_by_bucket = dict(zip(speed_rows['Rating Difference'], speed_rows['Winning_Percentage']))

    y_hat = []
    for _, row in data.iterrows():
        bucket = round_to(row['White_Rating'] - row['Black_Rating'], 50)
        if bucket not in win_pct_by_bucket:
            raise KeyError(f"no baseline entry for game speed {game_speed!r} and rating difference {bucket}")
        y_hat.append(1 - float(win_pct_by_bucket[bucket]))

    return math.sqrt(mean_squared_error(y_hat, labels))

In [19]:
# Fit a 500-tree random forest on the training features; %time reports how long the fit takes.
# NOTE(review): no random_state is passed, so the fitted model may differ between runs.
rf = RandomForestClassifier(n_estimators=500, max_features="log2", min_samples_leaf=5, n_jobs=1)
%time rf.fit(X_train, y_train)

Wall time: 24.4 s


RandomForestClassifier(max_features='log2', min_samples_leaf=5,
                       n_estimators=500, n_jobs=1)

### Comparison to Baseline

In [20]:
# Accuracy on the test split: the forest versus the two rule-based baselines.
print(f"Accuracy Scores")
print(f"Random Forest Accuracy: {rf.score(X_test, y_test)}")
print(f"Baseline Rating: {baseline_rating_score(X_test, y_test)}")
print(f"Baseline Eval: {baseline_eval_score(X_test, y_test)}")

# RMSE of predicted probabilities on the test split: the forest versus the baseline table.
print(f"\nLoss Scores")
print(f"Random Forest RMSE: {rmse_rf(rf, X_test, y_test)}")
print(f"Baseline RMSE: {rmse_baseline(X_test, y_test, prob_baseline)}")

Accuracy Scores
Random Forest Accuracy: 0.7242023928215354
Baseline Rating: 0.55259222333001
Baseline Eval: 0.638085742771685

Loss Scores
Random Forest RMSE: 0.4290871588832472
Baseline RMSE: 0.4937820615143324


### Save Model

In [21]:
# Persist the fitted forest to disk with joblib.
# NOTE(review): absolute local path; `load` is imported but not used in the visible cells.
from joblib import dump, load
dump(rf, r'C:\Users\joesp\Documents\Projects\lichess\models\rf.joblib') 

['C:\\Users\\joesp\\Documents\\Projects\\lichess\\models\\rf.joblib']