# Imports

In [20]:
import pandas as pd
import random
import math
import numpy as np
from sklearn.base import TransformerMixin

# Initial Loading of Data

In [21]:
# Read the raw games table, then build the working frame in one chain:
# keep rows whose 'winner' is not .5, discard rows with any missing value,
# and renumber the remaining rows from zero.
# NOTE(review): .5 presumably encodes a draw -- confirm against the data source.
raw_data = pd.read_csv(r'..\data\data.csv')
data = (
    raw_data[raw_data['winner'].ne(.5)]
    .dropna()
    .reset_index(drop=True)
)

### Verify that Clock Times and Evals Match

In [22]:
import ast

# The 'clocks' and 'evals' cells are read from the CSV as strings; parse each
# one into the Python object it spells out.
# NOTE(review): ast.literal_eval is used instead of eval() so that a malformed
# or malicious cell cannot execute code. It accepts only Python literals
# (lists, numbers, strings, ...) and raises ValueError/SyntaxError on anything
# else -- confirm the file never contains non-literal expressions.
for column in ('clocks', 'evals'):
    data[column] = data[column].apply(ast.literal_eval)

# Keep only the games whose 'clocks' and 'evals' lists have the same length.
data = data[data['clocks'].str.len() == data['evals'].str.len()].reset_index(drop=True)

### Correct Per-`perf` Rating Columns

In [23]:
# For every game, copy each side's rating for that game into the rating
# column named after the game's 'perf' value (e.g. 'white_<perf>_rating').
for row_label, game in data.iterrows():
    for side in ('white', 'black'):
        target_column = side + '_' + game['perf'] + '_rating'
        data.at[row_label, target_column] = game[side + '_rating']

### Create Train/Test Set

In [24]:
from sklearn.model_selection import train_test_split

def create_test_train(df, test_size=.2, random_state=None):
    """Split *df* row-wise into a training frame and a test frame.

    Parameters
    ----------
    df : DataFrame
        Table to split; it is not modified.
    test_size : float or int, default .2
        Forwarded to ``train_test_split``; the default holds out 20% of rows.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run; ``None`` keeps the split unseeded.

    Returns
    -------
    (train, test) : tuple of DataFrame
        Both frames are re-indexed from zero.
    """
    train, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    # reset_index returns new frames, so the split results are never mutated
    # in place.
    return train.reset_index(drop=True), test.reset_index(drop=True)

# Split whole games (one row per game at this point) into the two sets, so the
# per-game expansion that follows is applied to each set separately.
train_data, test_data = create_test_train(data)

### Expand on Evals/Clocks

In [25]:
class EvalClockExpander(TransformerMixin):
    """Expand each game row into randomly sampled position rows.

    ``transform`` repeats every input row ``num_rows_per_game`` times,
    shuffles the result, and for each row draws one random position from that
    row's ``clocks``/``evals`` lists. The drawn values are stored in the new
    ``white_clock``, ``black_clock``, ``eval`` and ``is_mate`` columns, and
    the ``clocks``/``evals`` list columns are dropped.

    Sampling is unseeded: positions come from the global ``random`` module and
    the shuffle from ``DataFrame.sample`` with its default random state.
    """

    def __init__(self, num_rows_per_game=1):
        # Number of sampled rows to emit for every input row.
        self.num_rows_per_game = num_rows_per_game

    @staticmethod
    def _snapshot(clocks, evals, index):
        """Return ``(white_clock, black_clock, eval, is_mate)`` for one draw.

        ``index`` ranges over ``0 .. len(clocks)`` inclusive; the value
        ``len(clocks)`` stands for the final position and reads the last
        entry of both lists.

        Raises ``IndexError`` when ``clocks`` is empty.
        """
        if len(clocks) == 0:
            raise IndexError('cannot sample a position: clocks list is empty')

        past_end = index >= len(clocks)
        ply = len(clocks) - 1 if past_end else index

        # Position 0 holds white's clock and position 1 black's, so entries
        # are taken to alternate white, black, white, ... The parity of the
        # sampled position (not of the list length) therefore decides whose
        # clock it holds; the other side's latest clock is the entry before.
        if ply % 2 == 0:
            white = clocks[ply]
            if ply > 0:
                black = clocks[ply - 1]
            else:
                # No earlier entry exists for black at position 0: use
                # black's first entry, or the only entry of a one-entry game.
                black = clocks[1 if len(clocks) > 1 else 0]
        else:
            white = clocks[ply - 1]
            black = clocks[ply]

        raw_eval = evals[-1] if past_end else evals[index]
        # A '#' marks the evaluation as a mate score; strip it and flag it.
        is_mate = '#' in raw_eval
        value = raw_eval.replace("#", "") if is_mate else raw_eval
        return white, black, value, is_mate

    def transform(self, df):
        """Return the expanded, shuffled frame; *df* itself is not modified.

        *df* must contain ``clocks`` and ``evals`` columns holding one
        sequence per row. The result has ``num_rows_per_game`` rows per input
        row (a count below one still yields one), indexed from zero.
        """
        repeats = max(self.num_rows_per_game, 1)
        X = pd.concat([df] * repeats, axis=0)
        X = X.sample(frac=1).reset_index(drop=True)

        white_clocks, black_clocks, eval_values, mate_flags = [], [], [], []
        for clocks, evals in zip(X['clocks'], X['evals']):
            # randint is inclusive on both ends; len(clocks) itself means
            # "final position" and is resolved inside _snapshot.
            index = random.randint(0, len(clocks))
            white, black, value, is_mate = self._snapshot(clocks, evals, index)
            white_clocks.append(white)
            black_clocks.append(black)
            eval_values.append(value)
            mate_flags.append(is_mate)

        X['white_clock'] = white_clocks
        X['black_clock'] = black_clocks
        X['eval'] = eval_values
        # Explicit dtype keeps the flag column boolean even for an empty frame.
        X['is_mate'] = pd.Series(mate_flags, index=X.index, dtype=bool)

        return X.drop(columns=['clocks', 'evals'])

    def fit(self, *_):
        """Do nothing and return ``self``; the expander has no fitted state."""
        return self

# Expand both splits with the same expander: ten sampled rows per game,
# training set first, then test set.
ece = EvalClockExpander(num_rows_per_game=10)
train_data, test_data = map(ece.transform, (train_data, test_data))

# Baseline Creation

In [26]:
# Raw baseline table. The code below reads its 'game_speed',
# 'rating_difference', 'total_points', 'total_games' and 'total_draws' columns.
prob_baseline_raw = pd.read_csv(r'..\data\baseline-june.csv')

def columns_group(df, game_speed, round_num, number):
    """Aggregate one (game speed, rating-difference bucket) cell of *df*.

    A row belongs to the bucket when its 'rating_difference', rounded to the
    nearest multiple of *round_num* via ``round_to``, equals *number* and its
    'game_speed' equals *game_speed*.

    Returns a five-item list:
    ``[game_speed, number, sum(total_points), sum(total_games), sum(total_draws)]``.
    """
    in_bucket = round_to(df['rating_difference'], round_num) == number
    same_speed = df["game_speed"] == game_speed
    rows = df.loc[in_bucket & same_speed]

    totals = [
        rows[column].sum()
        for column in ('total_points', 'total_games', 'total_draws')
    ]
    return [game_speed, number, *totals]

def round_to(x, to):
    """Return *x* rounded to the nearest multiple of *to*.

    The rounding is done by the built-in ``round`` on ``x / to``, so for plain
    floats an exact tie goes to the even multiple (``round_to(25, 10)`` is 20).
    """
    multiples = round(x / to)
    return to * multiples

# Start from an empty table with the same columns as the raw baseline and
# append one aggregated row per (game speed, 10-point rating bucket).
prob_baseline = pd.DataFrame(columns=prob_baseline_raw.columns)

for gs in prob_baseline_raw['game_speed'].unique():
    speed_diffs = prob_baseline_raw[prob_baseline_raw['game_speed'] == gs]['rating_difference']
    # NOTE(review): the bounds are rounded to the nearest 100 and range()
    # excludes its end, so buckets at or beyond the rounded maximum (and below
    # the rounded minimum) are never generated -- confirm this trimming of the
    # extremes is intended.
    maximum = round_to(max(speed_diffs), 100)
    minimum = round_to(min(speed_diffs), 100)
    for num in range(minimum, maximum, 10):
        bucket_row = columns_group(prob_baseline_raw, gs, 10, num)
        prob_baseline.loc[len(prob_baseline)] = bucket_row

# Discard buckets with no games (their ratio would divide by zero), turn the
# totals into a points-per-game ratio, and keep only that ratio.
has_games = prob_baseline['total_games'] != 0
prob_baseline = prob_baseline[has_games]
prob_baseline['winning_percentage'] = prob_baseline['total_points']/prob_baseline['total_games']
prob_baseline = prob_baseline.drop(columns=['total_points', 'total_games', 'total_draws'])

# Write to Files

In [27]:
# Remove the username columns from both splits, in place, before writing.
for split in (train_data, test_data):
    split.drop(columns=['white_username', 'black_username'], inplace=True)

In [28]:
# Write the two splits and the baseline table as CSV files without the index
# column, in the order train, test, baseline.
outputs = (
    (train_data, r'..\data\data_train.csv'),
    (test_data, r'..\data\data_test.csv'),
    (prob_baseline, r'..\data\baseline.csv'),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)