In [23]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import PrecisionRecallDisplay, mean_squared_error, mean_absolute_error

import seaborn
import matplotlib.pyplot as plt

%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [24]:
# Load the cleaned chess games table from the CSV in the working directory.
chess_games_cleaned = pd.read_csv('chess_games_cleaned.csv')
# Render the frame in the notebook output.
# NOTE(review): the rendered table includes "Unnamed: 0.1" and "Unnamed: 0" columns; these look
# like index columns written out by an earlier to_csv call -- confirm whether they are needed.
display(chess_games_cleaned)

Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base (min),Increment (sec),Win_Rate
0,0,1-0,1901,1896,5,D10,slav defense,300+5,Time forfeit,5.0,5,0.490376
1,1,0-1,1641,1627,14,C20,king's pawn opening: 2.b3,300+0,Normal,5.0,0,0.412060
2,2,1-0,1647,1688,-41,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.558550
3,3,0-1,1945,1900,45,B90,"sicilian defense: najdorf, lipnitsky attack",180+0,Time forfeit,3.0,0,0.444695
4,4,0-1,1773,1809,-36,C27,vienna game,180+0,Normal,3.0,0,0.579278
...,...,...,...,...,...,...,...,...,...,...,...,...
601600,601600,0-1,1798,1753,45,B06,modern defense,60+0,Time forfeit,1.0,0,0.480550
601601,601601,0-1,1711,1578,133,B08,pirc defense: classical variation,300+0,Normal,5.0,0,0.457798
601602,601602,1-0,1762,1683,79,C00,st. george defense,300+4,Normal,5.0,4,0.539982
601603,601603,1-0,2023,1742,281,A45,indian game,180+0,Normal,3.0,0,0.450352


In [25]:
# Rename "Base (min)" to "Base" so the column is filterable in SQL.
chess_games_cleaned = chess_games_cleaned.rename(columns = {"Base (min)": "Base"})

# Split the games on base time: strictly under 10 vs. 10 or more.
# Each `<<` stores the query result in the named local variable; SqlMagic.autopandas is
# enabled in the setup cell, so the results are presumably pandas DataFrames.
%sql constraint_under_10 << SELECT * FROM chess_games_cleaned WHERE Base < 10
%sql constraint_over_10 << SELECT * FROM chess_games_cleaned WHERE Base >= 10

# Preview the first rows of the 10-and-over subset.
constraint_over_10.head()

Returning data to local variable constraint_under_10
Returning data to local variable constraint_over_10


Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base,Increment (sec),Win_Rate
0,9,0-1,1649,1638,11,C57,"italian game: two knights defense, traxler cou...",900+3,Normal,15.0,3,0.408377
1,22,0-1,1747,1731,16,C46,three knights opening #2,600+0,Normal,10.0,0,0.477459
2,27,1-0,1624,1614,10,C42,russian game: classical attack,600+0,Normal,10.0,0,0.554545
3,31,1-0,1408,1405,3,C00,french defense: normal variation,600+0,Normal,10.0,0,0.550811
4,36,1-0,2215,2072,143,C17,"french defense: winawer variation, advance var...",600+0,Normal,10.0,0,0.543689


In [26]:
# Hold out 30% of each time-control subset; both splits use the same fixed seed
# so the partition is reproducible.
(under_ten_train, under_ten_test), (over_ten_train, over_ten_test) = (
    train_test_split(games_subset, test_size=0.30, random_state=2950)
    for games_subset in (constraint_under_10, constraint_over_10)
)

In [34]:
#taken from my (Aarya's) HW
def run_regression(name_list, training_set, testing_set, target_name):
    """Fit a linear regression on the training split and print its fit statistics.

    Parameters
    ----------
    name_list : str or sequence of str
        Name of the predictor column, or several column names for a
        multi-feature regression.
    training_set, testing_set : pandas.DataFrame
        Train and test splits; each must contain the predictor column(s)
        and the ``target_name`` column.
    target_name : str
        Name of the column to predict. All error metrics are computed
        against this column.

    Returns
    -------
    None
        Coefficients, intercept, and train/test RMSE and MAE are printed.
    """
    # A single name selects a 1-D column; a list of names selects a 2-D table.
    feature_key = name_list if isinstance(name_list, str) else list(name_list)

    def _design_matrix(frame):
        # The estimator expects (n_samples, n_features). Only a 1-D selection needs
        # reshaping; reshaping a multi-column selection would stack all features
        # into one long column that no longer lines up with the target.
        values = np.asarray(frame[feature_key])
        return values.reshape(-1, 1) if values.ndim == 1 else values

    train_X = _design_matrix(training_set)
    test_X = _design_matrix(testing_set)

    # Score against the requested target rather than a hard-coded column name.
    train_y = training_set[target_name]
    test_y = testing_set[target_name]

    regression = LinearRegression().fit(train_X, train_y)

    y_hat_train = regression.predict(train_X)
    y_hat_test = regression.predict(test_X)

    print("Coefficients: ")
    print(regression.coef_)
    print("Intercept: ")
    print(regression.intercept_)

    print("Train RMSE: ")
    train_rmse = np.sqrt(mean_squared_error(train_y, y_hat_train))
    print(train_rmse)
    print("Test RMSE: ")
    test_rmse = np.sqrt(mean_squared_error(test_y, y_hat_test))
    print(test_rmse)

    print("Train MAE: ")
    train_mae = mean_absolute_error(train_y, y_hat_train)
    print(train_mae)
    print("Test MAE: ")
    test_mae = mean_absolute_error(test_y, y_hat_test)
    print(test_mae)

In [35]:
# Run the same single-feature regression on each time-control subset,
# printing a heading per run and a spacer line between runs.
regression_runs = [
    ("Under 10 Regression", under_ten_train, under_ten_test),
    ("Over 10 Regression", over_ten_train, over_ten_test),
]
for run_position, (heading, train_split, test_split) in enumerate(regression_runs):
    if run_position > 0:
        print(" ")
    print(heading)
    run_regression("WhiteRatingDiff", train_split, test_split, "Win_Rate")

Under 10 Regression
Coefficients: 
[4.25035546e-05]
Intercept: 
0.4952985473891122
Train RMSE: 
0.05465571661028837
Test RMSE: 
0.054917892081754
Train MAE: 
0.04334140998195629
Test MAE: 
0.04352380451038552
 
Over 10 Regression
Coefficients: 
[5.41939105e-05]
Intercept: 
0.49977540173284063
Train RMSE: 
0.059807349561586926
Test RMSE: 
0.05957777252627818
Train MAE: 
0.04738652185389413
Test MAE: 
0.04713244285624221


In [38]:
# Ratio of the under-10 slope to the over-10 slope, using the coefficient values
# printed by the regression cell above.
under_ten_coefficient = 4.25035546 * (10 ** -5)
over_ten_coefficient = 5.41939105 * (10 ** -5)
print(under_ten_coefficient / over_ten_coefficient)

0.7842865408282357
