# Phase 4 - Hypothesis Test

In [1]:
#import requests
#from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import time

import seaborn
from matplotlib import pyplot

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import PrecisionRecallDisplay, mean_squared_error, mean_absolute_error

import statsmodels.api as sm

%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

In [2]:
chess_games_cleaned = pd.read_csv('chess_games_cleaned.csv')
display(chess_games_cleaned)

Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base (min),Increment (sec),Win_Rate
0,0,1-0,1901,1896,5,D10,slav defense,300+5,Time forfeit,5.0,5,0.490376
1,1,0-1,1641,1627,14,C20,king's pawn opening: 2.b3,300+0,Normal,5.0,0,0.412060
2,2,1-0,1647,1688,-41,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.558550
3,3,0-1,1945,1900,45,B90,"sicilian defense: najdorf, lipnitsky attack",180+0,Time forfeit,3.0,0,0.444695
4,4,0-1,1773,1809,-36,C27,vienna game,180+0,Normal,3.0,0,0.579278
...,...,...,...,...,...,...,...,...,...,...,...,...
601600,601600,0-1,1798,1753,45,B06,modern defense,60+0,Time forfeit,1.0,0,0.480550
601601,601601,0-1,1711,1578,133,B08,pirc defense: classical variation,300+0,Normal,5.0,0,0.457798
601602,601602,1-0,1762,1683,79,C00,st. george defense,300+4,Normal,5.0,4,0.539982
601603,601603,1-0,2023,1742,281,A45,indian game,180+0,Normal,3.0,0,0.450352


# Hypothesis Test 1
#### Elo rating differential will be a more influential factor than opening played in predicting win probability (βelo > βopening)'. 

* Do a regression and test if your coefficients are significant
* Multivariate logistic regression (win/not win), using opening played, elo rating differential, interaction
* Then test coefficients to see if they have a significant p-value


In [3]:
#CLASSIFICATION FOR EACH OPENING --> Open Game, Semi-open game, Semi-Closed game, Closed gam

#Created a column OpenGame - True if open game type, False otherwise
chess_games_cleaned["OpenGame"] = chess_games_cleaned["Opening"].str.contains("portuguese opening") | \
chess_games_cleaned["Opening"].str.contains("centre pawn opening") | \
chess_games_cleaned["Opening"].str.contains("vienna game") | \
chess_games_cleaned["Opening"].str.contains("bishop's opening") | \
chess_games_cleaned["Opening"].str.contains("danish gambit") | \
chess_games_cleaned["Opening"].str.contains("center game") | \
chess_games_cleaned["Opening"].str.contains("alapin's opening") | \
chess_games_cleaned["Opening"].str.contains("ruy lopez") | \
chess_games_cleaned["Opening"].str.contains("ponziani opening") | \
chess_games_cleaned["Opening"].str.contains("three knights game") | \
chess_games_cleaned["Opening"].str.contains("four knights game") | \
chess_games_cleaned["Opening"].str.contains("italian game") | \
chess_games_cleaned["Opening"].str.contains("giuoco piano") | \
chess_games_cleaned["Opening"].str.contains("evans gambit") | \
chess_games_cleaned["Opening"].str.contains("hungarian defense") | \
chess_games_cleaned["Opening"].str.contains("two knights defense") | \
chess_games_cleaned["Opening"].str.contains("scotch game") | \
chess_games_cleaned["Opening"].str.contains("inverted hungarian opening") | \
chess_games_cleaned["Opening"].str.contains("konstantinopolsky opening") | \
chess_games_cleaned["Opening"].str.contains("elephant gambit") | \
chess_games_cleaned["Opening"].str.contains("philidor defense") | \
chess_games_cleaned["Opening"].str.contains("latvian gambit") | \
chess_games_cleaned["Opening"].str.contains("damiano defense") | \
chess_games_cleaned["Opening"].str.contains("petrov's defense") | \
chess_games_cleaned["Opening"].str.contains("greco defense") | \
chess_games_cleaned["Opening"].str.contains("napoleon opening") | \
chess_games_cleaned["Opening"].str.contains("king's gambit") | \
chess_games_cleaned["Opening"].str.contains("king's pawn opening") | \
chess_games_cleaned["Opening"].str.contains("danvers opening") | \
chess_games_cleaned["Opening"].str.contains("bongcloud attack")

#Created a column SemiOpenGame - True if semi-open game type, False otherwise
chess_games_cleaned["SemiOpenGame"] = chess_games_cleaned["Opening"].str.contains("corn stalk defense") | \
chess_games_cleaned["Opening"].str.contains("st. george defense") | \
chess_games_cleaned["Opening"].str.contains("lemming defense") | \
chess_games_cleaned["Opening"].str.contains("owen's defense") | \
chess_games_cleaned["Opening"].str.contains("sicilian defense") | \
chess_games_cleaned["Opening"].str.contains("caro-kann defense") | \
chess_games_cleaned["Opening"].str.contains("nimzowitch defense") | \
chess_games_cleaned["Opening"].str.contains("scandinavian defense") | \
chess_games_cleaned["Opening"].str.contains("balogh defense") | \
chess_games_cleaned["Opening"].str.contains("pirc defense") | \
chess_games_cleaned["Opening"].str.contains("french defense") | \
chess_games_cleaned["Opening"].str.contains("fred defense") | \
chess_games_cleaned["Opening"].str.contains("barnes defense") | \
chess_games_cleaned["Opening"].str.contains("alehkine's defense") | \
chess_games_cleaned["Opening"].str.contains("borg opening") | \
chess_games_cleaned["Opening"].str.contains("modern defense") | \
chess_games_cleaned["Opening"].str.contains("goldsmith defense") | \
chess_games_cleaned["Opening"].str.contains("carr defense") | \
chess_games_cleaned["Opening"].str.contains("adams defense")

#Created a column SemiClosedGame - True if semi-closed game type, False otherwise
chess_games_cleaned["SemiClosedGame"] = chess_games_cleaned["Opening"].str.contains("polish defense") | \
chess_games_cleaned["Opening"].str.contains("benoni defense") | \
chess_games_cleaned["Opening"].str.contains("queen's knight defense") | \
chess_games_cleaned["Opening"].str.contains("wade defense") | \
chess_games_cleaned["Opening"].str.contains("englund gambit") | \
chess_games_cleaned["Opening"].str.contains("english defense") | \
chess_games_cleaned["Opening"].str.contains("keres defense") | \
chess_games_cleaned["Opening"].str.contains("dutch defense") | \
chess_games_cleaned["Opening"].str.contains("indian game") | \
chess_games_cleaned["Opening"].str.contains("nimzo-indian defense") | \
chess_games_cleaned["Opening"].str.contains("queen's indian defense") | \
chess_games_cleaned["Opening"].str.contains("bogo–indian defense") | \
chess_games_cleaned["Opening"].str.contains("blumenfeld countergambit") | \
chess_games_cleaned["Opening"].str.contains("catalan opening") | \
chess_games_cleaned["Opening"].str.contains("king's indian defense") | \
chess_games_cleaned["Opening"].str.contains("benoni defense") | \
chess_games_cleaned["Opening"].str.contains("benko gambit") | \
chess_games_cleaned["Opening"].str.contains("old indian defense") | \
chess_games_cleaned["Opening"].str.contains("budapest gambit") | \
chess_games_cleaned["Opening"].str.contains("modern benoni") | \
chess_games_cleaned["Opening"].str.contains("queen's gambit declined")

#Created a column ClosedGame - True if closed game type, False otherwise
    # NOTE: "Queens Gambit" includes both Queen's gambit accepted and Queen's gambit declined
chess_games_cleaned["ClosedGame"] = chess_games_cleaned["Opening"].str.contains("queen's pawn") | \
chess_games_cleaned["Opening"].str.contains("closed game") | \
chess_games_cleaned["Opening"].str.contains("queen's gambit") | \
chess_games_cleaned["Opening"].str.contains("slav defense") | \
chess_games_cleaned["Opening"].str.contains("stonewall attack") | \
chess_games_cleaned["Opening"].str.contains("colle system") | \
chess_games_cleaned["Opening"].str.contains("richter-veresov attack") | \
chess_games_cleaned["Opening"].str.contains("torre attack") | \
chess_games_cleaned["Opening"].str.contains("symmetrical defense") | \
chess_games_cleaned["Opening"].str.contains("chigorin defense") | \
chess_games_cleaned["Opening"].str.contains("baltic defense") | \
chess_games_cleaned["Opening"].str.contains("marshall defense") | \
chess_games_cleaned["Opening"].str.contains("blackmar-diemer gambit") | \
chess_games_cleaned["Opening"].str.contains("colle system") | \
chess_games_cleaned["Opening"].str.contains("london system")

#display(chess_games.head(50))

chess_games_cleaned.loc[(chess_games_cleaned['SemiClosedGame'] == True), 'GameType'] = 'Semi Closed Game'
chess_games_cleaned.loc[(chess_games_cleaned['SemiOpenGame'] == True), 'GameType'] = 'Semi Open Game'
chess_games_cleaned.loc[(chess_games_cleaned['ClosedGame'] == True), 'GameType'] = 'Closed Game'
chess_games_cleaned.loc[(chess_games_cleaned['OpenGame'] == True), 'GameType'] = 'Open Game'

chess_games_cleaned = chess_games_cleaned.drop(['OpenGame', 'SemiOpenGame', 'SemiClosedGame', 'ClosedGame'], axis = 1)

display(chess_games_cleaned.head(50))

Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base (min),Increment (sec),Win_Rate,GameType
0,0,1-0,1901,1896,5,D10,slav defense,300+5,Time forfeit,5.0,5,0.490376,Closed Game
1,1,0-1,1641,1627,14,C20,king's pawn opening: 2.b3,300+0,Normal,5.0,0,0.41206,Open Game
2,2,1-0,1647,1688,-41,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.55855,Semi Open Game
3,3,0-1,1945,1900,45,B90,"sicilian defense: najdorf, lipnitsky attack",180+0,Time forfeit,3.0,0,0.444695,Semi Open Game
4,4,0-1,1773,1809,-36,C27,vienna game,180+0,Normal,3.0,0,0.579278,Open Game
5,5,0-1,1895,1886,9,B10,caro-kann defense: two knights attack,180+0,Time forfeit,3.0,0,0.52122,Semi Open Game
6,6,1-0,2155,2356,-201,D02,queen's pawn game: london system,180+0,Normal,3.0,0,0.499276,Closed Game
7,7,0-1,2010,2111,-101,A45,indian game,300+0,Normal,5.0,0,0.450352,Semi Closed Game
8,8,1-0,1764,1773,-9,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.55855,Semi Open Game
9,9,0-1,1649,1638,11,C57,"italian game: two knights defense, traxler cou...",900+3,Normal,15.0,3,0.408377,Open Game


In [4]:
#Changing Results to Binary
chess_games_cleaned.loc[(chess_games_cleaned['Result'] == '1-0'), 'Result_Binary'] = 1
chess_games_cleaned.loc[(chess_games_cleaned['Result'] == '0-1'), 'Result_Binary'] = 0
#chess_games_cleaned.loc[(chess_games_cleaned['Result'] == '1/2-1/2'), 'Result_Binary'] = 2

chess_games_cleaned

Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base (min),Increment (sec),Win_Rate,GameType,Result_Binary
0,0,1-0,1901,1896,5,D10,slav defense,300+5,Time forfeit,5.0,5,0.490376,Closed Game,1.0
1,1,0-1,1641,1627,14,C20,king's pawn opening: 2.b3,300+0,Normal,5.0,0,0.412060,Open Game,0.0
2,2,1-0,1647,1688,-41,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.558550,Semi Open Game,1.0
3,3,0-1,1945,1900,45,B90,"sicilian defense: najdorf, lipnitsky attack",180+0,Time forfeit,3.0,0,0.444695,Semi Open Game,0.0
4,4,0-1,1773,1809,-36,C27,vienna game,180+0,Normal,3.0,0,0.579278,Open Game,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
601600,601600,0-1,1798,1753,45,B06,modern defense,60+0,Time forfeit,1.0,0,0.480550,Semi Open Game,0.0
601601,601601,0-1,1711,1578,133,B08,pirc defense: classical variation,300+0,Normal,5.0,0,0.457798,Semi Open Game,0.0
601602,601602,1-0,1762,1683,79,C00,st. george defense,300+4,Normal,5.0,4,0.539982,Semi Open Game,1.0
601603,601603,1-0,2023,1742,281,A45,indian game,180+0,Normal,3.0,0,0.450352,Semi Closed Game,1.0


## Significance of Coefficients for Elo Rating Difference and Opening Played

* Testing to see whether coefficients for Elo Rating Difference and Opening Played are significant in predicting win probability

In [5]:
#Drop NaNs in GameType and Result_columns
chess_games_cleaned = chess_games_cleaned.dropna(subset = ['GameType', 'Result_Binary'])

print(chess_games_cleaned['Result_Binary'].unique())
print(chess_games_cleaned['GameType'].unique())

#Creating dummy variables for each of the game type then dropping 1 game type
gametype_dummies = pd.get_dummies(data = chess_games_cleaned['GameType'])
chess_games_cleaned = pd.concat([chess_games_cleaned, gametype_dummies], axis = 1)
chess_games_cleaned = chess_games_cleaned.drop(['Open Game'], axis = 1)
display(chess_games_cleaned.head())

[1. 0.]
['Closed Game' 'Open Game' 'Semi Open Game' 'Semi Closed Game']


Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base (min),Increment (sec),Win_Rate,GameType,Result_Binary,Closed Game,Semi Closed Game,Semi Open Game
0,0,1-0,1901,1896,5,D10,slav defense,300+5,Time forfeit,5.0,5,0.490376,Closed Game,1.0,1,0,0
1,1,0-1,1641,1627,14,C20,king's pawn opening: 2.b3,300+0,Normal,5.0,0,0.41206,Open Game,0.0,0,0,0
2,2,1-0,1647,1688,-41,B01,scandinavian defense: mieses-kotroc variation,180+0,Time forfeit,3.0,0,0.55855,Semi Open Game,1.0,0,0,1
3,3,0-1,1945,1900,45,B90,"sicilian defense: najdorf, lipnitsky attack",180+0,Time forfeit,3.0,0,0.444695,Semi Open Game,0.0,0,0,1
4,4,0-1,1773,1809,-36,C27,vienna game,180+0,Normal,3.0,0,0.579278,Open Game,0.0,0,0,0


In [6]:
#creating a train/test split
chess_train, chess_test = train_test_split(chess_games_cleaned, test_size = 0.2, random_state = 2950)

#define a function for running logistic regressions
def run_logistic_regression (input_var, train, test, predicted_var):
    #creating input and output arrays for train and test set
    train_x = train[input_var]
    train_y = train[predicted_var]
    test_y = test[predicted_var]
    
    #training the model with input and output arrays for train set
    model = LogisticRegression().fit(train_x, train_y)
    
    #found the model's coefficient and intercept
    coefficient = model.coef_
    intercept = model.intercept_
    
    #making predictions of the ouput for train and test set
    train_prediction = model.predict(train[input_var])
    test_prediction = model.predict(test[input_var])

    #calculated accuracy, precision, recall, and f1 scores for train and test sets
    train_accuracy = metrics.accuracy_score(train_y, train_prediction)
    test_accuracy = metrics.accuracy_score(test_y, test_prediction)
    
    train_precision = metrics.precision_score(train_y, train_prediction)
    test_precision = metrics.precision_score(test_y, test_prediction)
    
    train_recall = metrics.recall_score(train_y, train_prediction)
    test_recall = metrics.recall_score(test_y, test_prediction)
    
    train_f1 = metrics.f1_score(train_y, train_prediction)
    test_f1 = metrics.f1_score(test_y, test_prediction)
    
    #printed all required elements with appropriate rounding
    print('Input Variables: ' + str(input_var))
    print('Coefficient: ' + str(np.round(coefficient,6)))
    print('Intercept: ' + str(np.round(intercept,3)[0]))
    print('Train Accuracy: ' + str(np.round(train_accuracy,3)))
    print('Test Accuracy: ' + str(np.round(test_accuracy,3)))
    print('Train Precision: ' + str(np.round(train_precision,3)))
    print('Test Precision: ' + str(np.round(test_precision, 3)))
    print('Train Recall: ' + str(np.round(train_recall, 3)))
    print('Test Recall: ' + str(np.round(test_recall, 3)))
    print('Train_F1: ' + str(np.round(train_f1, 3)))
    print('Test_F1: ' + str(np.round(test_f1,3)))

In [7]:
#running a logistic regression on Elo rating differential in predicting win outcome (win or loss)
run_logistic_regression(['WhiteRatingDiff'], chess_train, chess_test, 'Result_Binary')

Input Variables: ['WhiteRatingDiff']
Coefficient: [[0.004797]]
Intercept: 0.074
Train Accuracy: 0.654
Test Accuracy: 0.651
Train Precision: 0.656
Test Precision: 0.653
Train Recall: 0.696
Test Recall: 0.692
Train_F1: 0.676
Test_F1: 0.672


In [8]:
#running a logistic regression on the different opening types in predicting win outcome (win or loss)
run_logistic_regression(['Closed Game', 'Semi Closed Game', 'Semi Open Game'], chess_train, chess_test, 'Result_Binary')

Input Variables: ['Closed Game', 'Semi Closed Game', 'Semi Open Game']
Coefficient: [[-0.02472  -0.16083  -0.223192]]
Intercept: 0.19
Train Accuracy: 0.525
Test Accuracy: 0.524
Train Precision: 0.538
Test Precision: 0.536
Train Recall: 0.578
Test Recall: 0.576
Train_F1: 0.557
Test_F1: 0.555


In order to predict which model is a better determinant of win probability (rating differential or opening type), we need to look at the accuracy rating of each model. Comparing the accuracy of the rating differential model (65.1%), it exceeds the accuracy of the game type model (52.4%). Furthermore, in looking at the other 3 metrics, precision, recall, and f1, which indicates the effectiveness of the model, the metrics are all higher for the rating differential model (65.3%, 69.2%, 67.2% respectively) compared to the accuracy model (53.6%, 57.6%, 55.5%).

In [9]:
#running a logistic regression on the opening type played in predicting win outcome (win or loss)
#run_logistic_regression(['WhiteRatingDiff','Closed Game', 'Semi Closed Game', 'Semi Open Game'], chess_train, chess_test, 'Result_Binary')

In [23]:
def run_OLS (input_var, train, test, predicted_var):
    #creating input and output arrays for train and test set
    train_x = train[input_var]
    train_y = train[predicted_var]
    test_y = test[predicted_var]
    
    #training the model with input and output arrays for train set
    model =  sm.OLS(train_y, train_x).fit()
    print(model.summary())
    

In [24]:
run_OLS(['WhiteRatingDiff'], chess_train, chess_test, 'Result_Binary')

                                 OLS Regression Results                                
Dep. Variable:          Result_Binary   R-squared (uncentered):                   0.071
Model:                            OLS   Adj. R-squared (uncentered):              0.071
Method:                 Least Squares   F-statistic:                          2.567e+04
Date:                Mon, 21 Nov 2022   Prob (F-statistic):                        0.00
Time:                        19:48:04   Log-Likelihood:                     -3.5142e+05
No. Observations:              333912   AIC:                                  7.028e+05
Df Residuals:                  333911   BIC:                                  7.029e+05
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

In [20]:
run_OLS(['Closed Game', 'Semi Closed Game', 'Semi Open Game'], chess_train, chess_test, 'Result_Binary')

                                 OLS Regression Results                                
Dep. Variable:          Result_Binary   R-squared (uncentered):                   0.372
Model:                            OLS   Adj. R-squared (uncentered):              0.372
Method:                 Least Squares   F-statistic:                          6.591e+04
Date:                Mon, 21 Nov 2022   Prob (F-statistic):                        0.00
Time:                        19:47:15   Log-Likelihood:                     -2.8613e+05
No. Observations:              333912   AIC:                                  5.723e+05
Df Residuals:                  333909   BIC:                                  5.723e+05
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                       coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------

# Hypothesis Test 2
#### A time constraint less than 10 minutes will amplify the impact of rating differential in predicting win probability by at least 1.5 times compared to games with time constraints greater than or equal to 10 minutes. (βrating diff with constraint < 10 Minutes / βrating diff with constraint > 10 Minutes >= 1.5)

* Single var regression for elo difference vs win prob, data =  constraint < 10 minutes
* Single var regression for elo difference vs win prob, data = constraint > 10 minutes
* Compare coefficients


In [10]:
#makingthe base column filterable in sql
chess_games_cleaned = chess_games_cleaned.rename(columns = {"Base (min)": "Base"})

#filter for over/under 10 min
%sql constraint_under_10 << SELECT * FROM chess_games_cleaned WHERE Base < 10
%sql constraint_over_10 << SELECT * FROM chess_games_cleaned WHERE Base >= 10

constraint_over_10.head()

Returning data to local variable constraint_under_10
Returning data to local variable constraint_over_10


Unnamed: 0.1,Unnamed: 0,Result,WhiteElo,BlackElo,WhiteRatingDiff,ECO,Opening,TimeControl,Termination,Base,Increment (sec),Win_Rate,GameType,Result_Binary,Closed Game,Semi Closed Game,Semi Open Game
0,9,0-1,1649,1638,11,C57,"italian game: two knights defense, traxler cou...",900+3,Normal,15.0,3,0.408377,Open Game,0.0,0,0,0
1,31,1-0,1408,1405,3,C00,french defense: normal variation,600+0,Normal,10.0,0,0.550811,Semi Open Game,1.0,0,0,1
2,36,1-0,2215,2072,143,C17,"french defense: winawer variation, advance var...",600+0,Normal,10.0,0,0.543689,Semi Open Game,1.0,0,0,1
3,37,1-0,1580,1571,9,D02,queen's pawn game: zukertort variation,600+0,Normal,10.0,0,0.540344,Closed Game,1.0,1,0,0
4,41,0-1,2053,2056,-3,D00,queen's pawn game: chigorin variation,600+0,Normal,10.0,0,0.514093,Closed Game,0.0,1,0,0


In [11]:
under_ten_train, under_ten_test = train_test_split(constraint_under_10, test_size = 0.30, random_state = 2950) 

over_ten_train, over_ten_test = train_test_split(constraint_over_10, test_size = 0.30, random_state = 2950)

In [12]:
#taken from my (Aarya's) HW
def run_regression(name_list, training_set, testing_set, target_name):
    #display(training_set)
   
    X = training_set[name_list]
    #print(X)
    array_X = np.array(X)
    #print(array_X.shape)
    array_X = array_X.reshape(-1,1)
    #print(array_X.shape)
    #print(array_X)
    y = training_set[target_name]
    #print(y)
    
    test_X = testing_set[name_list]
    #print(test_X)
    array_test_X = np.array(test_X)
    #print(array_test_X.shape)
    array_test_X = array_test_X.reshape(-1,1)

    
    diamonds_regression = LinearRegression().fit(array_X, y)
    
    y_hat_train = diamonds_regression.predict(array_X)
    y_hat_test = diamonds_regression.predict(array_test_X)
    
    train_mse = mean_squared_error(training_set["Result_Binary"], y_hat_train)
    test_mse = mean_squared_error(testing_set["Result_Binary"], y_hat_test)

    #print("List of variable names: " + name_list)
    
    print("Coefficients: ")
    print(diamonds_regression.coef_)
    print("Intercept: ")
    print(diamonds_regression.intercept_)
    
    print("Train RMSE: ")
    train_rmse = np.sqrt(train_mse)
    print(train_rmse)
    print("Test RMSE: ")
    test_rmse = np.sqrt(test_mse)
    print(test_rmse)
    
    print("Train MAE: ")
    train_mae = mean_absolute_error(training_set["Result_Binary"], y_hat_train)
    print(train_mae)
    print("Test MAE: ")
    test_mae = mean_absolute_error(testing_set["Result_Binary"], y_hat_test)
    print(test_mae)
    

In [13]:
print("Under 10 Regression")
run_regression("WhiteRatingDiff", under_ten_train, under_ten_test, "Result_Binary")
print(" ")
print("Over 10 Regression")
run_regression("WhiteRatingDiff", over_ten_train, over_ten_test, "Result_Binary")

Under 10 Regression
Coefficients: 
[0.00093595]
Intercept: 
0.5143653083178812
Train RMSE: 
0.4627760742421617
Test RMSE: 
0.46182001883823326
Train MAE: 
0.43285154254747127
Test MAE: 
0.43179486533282896
 
Over 10 Regression
Coefficients: 
[0.00091936]
Intercept: 
0.5210196082072163
Train RMSE: 
0.46516605535446004
Test RMSE: 
0.4647181644082476
Train MAE: 
0.43633627357955007
Test MAE: 
0.4356447052867652


In [14]:
print(0.00093595/0.00091936)

1.0180451618517228
