In [None]:
import chess
from ChessWrapper import ChessWrapper
from copy import deepcopy
from evaluation import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

In [None]:
%load_ext autoreload
%autoreload 2

### Read in the evaluation dataset, parse it, and create the features

In [None]:
# Load the raw position table from a CSV in the working directory. Later cells
# read its 'FEN' and 'Evaluation' columns.
sf_data = pd.read_csv('chessData2.csv', encoding='utf-8')

In [None]:
def parse_eval(ev):
    """Convert one raw 'Evaluation' entry into a numeric score.

    Entries that start with '#' (checkmate) have the marker removed and '000'
    appended before conversion, so '#+5' becomes 5000 and '#-3' becomes -3000.
    Every other entry is parsed as a plain signed number.

    Args:
        ev: The raw entry, normally a string such as '+56', '-120' or '#+5'.
            A value that is already an integer is returned unchanged, so the
            function can be applied again to an already-parsed column.

    Returns:
        int for integer text; float if the text is a decimal number.

    Raises:
        ValueError: If the text is not a number.
    """
    import numbers

    # Already parsed (e.g. the cell was re-run on the converted column).
    if isinstance(ev, numbers.Integral):
        return int(ev)

    # Tolerate surrounding whitespace and a stray byte-order mark in the text.
    text = ev.strip().lstrip('\ufeff').strip()

    # if checkmate, produce large evaluation
    if text.startswith('#'):
        text = text[1:] + '000'

    # Parse with int()/float() rather than eval(): the text comes from a data
    # file and must never be executed as code. int() also accepts zero-padded
    # numbers, which eval() rejects as invalid literals.
    try:
        return int(text)
    except ValueError:
        return float(text)

In [None]:
# Replace every raw evaluation entry with its parsed numeric score.
sf_data['Evaluation'] = sf_data['Evaluation'].map(parse_eval)

In [None]:
# Preview the first 500 parsed rows.
sf_data.iloc[:500]

In [None]:
def create_features(data_df):
    """Compute the hand-crafted evaluation features for every position.

    Each row's FEN is wrapped in a ChessWrapper and passed to the feature
    functions. Most features are stored as a difference between the value
    computed for chess.WHITE and the value computed for chess.BLACK.

    Args:
        data_df: DataFrame with a 'FEN' column and an 'Evaluation' column.

    Returns:
        DataFrame with a fresh 0..n-1 index and the columns 'FEN',
        'tapered_eval', 'king_atk', 'mobility', 'pawn_shield', 'pawn_islands',
        'doubled_pawns', 'passed_pawns' and 'sf_evaluation' (the row's
        'Evaluation' value), one row per input row in input order.
    """
    columns = [
        'FEN', 'tapered_eval', 'king_atk', 'mobility', 'pawn_shield',
        'pawn_islands', 'doubled_pawns', 'passed_pawns', 'sf_evaluation',
    ]

    # Collect plain dicts and build the frame once at the end. Concatenating a
    # one-row frame per position copies the whole table on every iteration.
    records = []
    positions = zip(data_df['FEN'], data_df['Evaluation'])
    for counter, (fen, sf_eval) in enumerate(positions, start=1):
        board = ChessWrapper(fen)

        # NOTE(review): the names suggest king_safety(board, side) returns
        # (attack by the opponent, pawn shield of `side`) -- confirm against
        # the evaluation module.
        b_atk, w_ps = king_safety(board, chess.WHITE)
        w_atk, b_ps = king_safety(board, chess.BLACK)

        records.append(
            {
                'FEN': fen,
                'tapered_eval': tapered_eval(board),
                'king_atk': w_atk - b_atk,
                'mobility': mobility(board),
                'pawn_shield': w_ps - b_ps,
                'pawn_islands': pawn_islands(board, chess.WHITE) - pawn_islands(board, chess.BLACK),
                'doubled_pawns': doubled_pawns(board, chess.WHITE) - doubled_pawns(board, chess.BLACK),
                'passed_pawns': passers(board, chess.WHITE) - passers(board, chess.BLACK),
                'sf_evaluation': sf_eval,
            }
        )

        # Progress indicator every 1000 positions.
        if counter % 1000 == 0:
            print(counter)

    return pd.DataFrame(records, columns=columns)

In [None]:
# Fixed-seed random subset used to build the regression features. The sample
# size is capped at the table length so a dataset with fewer than 200000 rows
# does not raise.
reg_data = sf_data.sample(n=min(200000, len(sf_data)), random_state=0)

In [None]:
# Reuse the cached feature table when it exists; otherwise compute and cache it.
# Only a missing cache file triggers recomputation. Any other failure
# (interrupt, malformed file, ...) surfaces instead of being silently swallowed.
try:
    reg_feat = pd.read_csv('reg_feat.csv')
except FileNotFoundError:
    reg_feat = create_features(reg_data)
    reg_feat.to_csv('reg_feat.csv')
else:
    # to_csv above also writes the index, which read_csv brings back as an
    # 'Unnamed: 0' column. Tolerate a cache that was written without it.
    reg_feat = reg_feat.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Display the feature table (loaded from the cache or freshly computed).
reg_feat

In [None]:
# The FEN string is an identifier, not a numeric feature, so remove it before
# modelling. errors='ignore' keeps this cell safe to re-run after the column
# is already gone.
reg_feat = reg_feat.drop(columns=['FEN'], errors='ignore')

In [None]:
# Summary statistics for every remaining feature column and the target.
reg_feat.describe()

In [None]:
# 2nd percentile of the target: the lower tail of sf_evaluation.
reg_feat['sf_evaluation'].quantile(.02)

In [None]:
# 98th percentile of the target: the upper tail of sf_evaluation.
reg_feat['sf_evaluation'].quantile(.98)

In [None]:
# Limit the target to the range [-1500, 1500] so extreme scores do not dominate the fit.
reg_feat['sf_evaluation'] = reg_feat['sf_evaluation'].clip(-1500, 1500)

In [None]:
# Apply the same [-1500, 1500] limit to the tapered_eval feature.
reg_feat['tapered_eval'] = reg_feat['tapered_eval'].clip(-1500, 1500)

In [None]:
# try squared features
# Each '<name>_2' column is the signed square sign(x) * x**2, so the new
# feature keeps the direction of the original difference.
for feature_name in ('mobility', 'pawn_islands', 'doubled_pawns',
                     'passed_pawns', 'pawn_shield', 'king_atk'):
    column = reg_feat[feature_name]
    reg_feat[feature_name + '_2'] = np.square(column) * np.sign(column)

### Run Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# try all features
# Hold out 25% of the rows (fixed seed) and fit on every feature column.
all_inputs = reg_feat.drop(columns='sf_evaluation')
all_targets = reg_feat['sf_evaluation']
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    all_inputs, all_targets, test_size=.25, random_state=42
)

lin_reg = LinearRegression().fit(X_train_reg, y_train_reg)

y_train_pred_linreg = lin_reg.predict(X_train_reg)
y_test_pred_linreg = lin_reg.predict(X_test_reg)

# Mean absolute error: training split first, then the held-out split.
for actual, predicted in ((y_train_reg, y_train_pred_linreg),
                          (y_test_reg, y_test_pred_linreg)):
    print(mean_absolute_error(actual, predicted))

In [None]:
# Plot and list the coefficients of the all-features model, sorted ascending.
# The inputs are not standardised before fitting, so coefficient sizes reflect
# each feature's units as well as its influence.
features = X_train_reg.columns.values
importances = lin_reg.coef_
indices = np.argsort(importances)

bar_positions = range(len(indices))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, list(features[indices]))
plt.xlabel('coefficients')
plt.show()

for coefficient, feature_label in zip(importances[indices], features[indices]):
    print(coefficient, feature_label)

In [None]:
# Use only the degree 2 features
# Dropping the original difference columns leaves tapered_eval, the signed
# squares and the target.
degree1_columns = ['mobility', 'king_atk', 'pawn_shield',
                   'passed_pawns', 'doubled_pawns', 'pawn_islands']
reg_feat2 = reg_feat.drop(columns=degree1_columns)

In [None]:
# Use only the degree 1 features
# Dropping the signed-square columns leaves tapered_eval, the original
# difference columns and the target.
degree2_columns = [base_name + '_2' for base_name in
                   ('mobility', 'king_atk', 'pawn_shield',
                    'passed_pawns', 'doubled_pawns', 'pawn_islands')]
reg_feat1 = reg_feat.drop(columns=degree2_columns)

In [None]:
# train on only degree 2 features
# Same 25% hold-out and seed as the all-features model.
degree2_inputs = reg_feat2.drop(columns='sf_evaluation')
degree2_targets = reg_feat2['sf_evaluation']
X_train_reg2, X_test_reg2, y_train_reg2, y_test_reg2 = train_test_split(
    degree2_inputs, degree2_targets, test_size=.25, random_state=42
)

lin_reg2 = LinearRegression().fit(X_train_reg2, y_train_reg2)

y_train_pred_linreg2 = lin_reg2.predict(X_train_reg2)
y_test_pred_linreg2 = lin_reg2.predict(X_test_reg2)

# Mean absolute error: training split first, then the held-out split.
for actual, predicted in ((y_train_reg2, y_train_pred_linreg2),
                          (y_test_reg2, y_test_pred_linreg2)):
    print(mean_absolute_error(actual, predicted))

In [None]:
# Mean squared error of the degree-2 model: training split, then held-out split.
for actual, predicted in ((y_train_reg2, y_train_pred_linreg2),
                          (y_test_reg2, y_test_pred_linreg2)):
    print(mean_squared_error(actual, predicted))

In [None]:
# Plot and list the coefficients of the degree-2 model, sorted ascending.
# The inputs are not standardised before fitting, so coefficient sizes reflect
# each feature's units as well as its influence.
features = X_train_reg2.columns.values
importances = lin_reg2.coef_
indices = np.argsort(importances)

bar_positions = range(len(indices))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, list(features[indices]))
plt.xlabel('coefficients')
plt.show()

for coefficient, feature_label in zip(importances[indices], features[indices]):
    print(coefficient, feature_label)

In [None]:
# try only the degree 1 features
# Same 25% hold-out and seed as the other models.
degree1_inputs = reg_feat1.drop(columns='sf_evaluation')
degree1_targets = reg_feat1['sf_evaluation']
X_train_reg1, X_test_reg1, y_train_reg1, y_test_reg1 = train_test_split(
    degree1_inputs, degree1_targets, test_size=.25, random_state=42
)

lin_reg1 = LinearRegression().fit(X_train_reg1, y_train_reg1)

y_train_pred_linreg1 = lin_reg1.predict(X_train_reg1)
y_test_pred_linreg1 = lin_reg1.predict(X_test_reg1)

# Mean absolute error: training split first, then the held-out split.
for actual, predicted in ((y_train_reg1, y_train_pred_linreg1),
                          (y_test_reg1, y_test_pred_linreg1)):
    print(mean_absolute_error(actual, predicted))

In [None]:
# Plot and list the coefficients of the degree-1 model, sorted ascending.
# The inputs are not standardised before fitting, so coefficient sizes reflect
# each feature's units as well as its influence.
features = X_train_reg1.columns.values
importances = lin_reg1.coef_
indices = np.argsort(importances)

bar_positions = range(len(indices))
plt.title('Feature Importances')
plt.barh(bar_positions, importances[indices], color='b', align='center')
plt.yticks(bar_positions, list(features[indices]))
plt.xlabel('coefficients')
plt.show()

for coefficient, feature_label in zip(importances[indices], features[indices]):
    print(coefficient, feature_label)

In [None]:
# save the linear regression model
# The context manager closes (and flushes) the file even if pickling fails.
# A bare open() inside the call leaves the handle to the garbage collector.
with open('lr_eval.pkl', 'wb') as model_file:
    pickle.dump(lin_reg1, model_file)

In [None]:
# Column names, in order, of the inputs the saved model (lin_reg1) was fitted on.
print(X_train_reg1.columns.values)

In [None]:
# Display the training inputs of the degree-1 model.
X_train_reg1

In [None]:
# Baseline: score the raw tapered_eval feature directly against the target
# (no fitted model), on the same train / test split as the degree-1 model.
# Prints MSE (train, test) followed by MAE (train, test).
baseline_splits = ((y_train_reg1, X_train_reg1['tapered_eval']),
                   (y_test_reg1, X_test_reg1['tapered_eval']))
for error_metric in (mean_squared_error, mean_absolute_error):
    for target_values, baseline_values in baseline_splits:
        print(error_metric(target_values, baseline_values))