In [146]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.linear_model import LinearRegression

In [18]:
# Load the processed data written by the EDA notebook, indexed by `game_id`
bgg = pd.read_csv(
    'bgg_processed.csv',
    index_col='game_id',
)

In [17]:
# Separate the feature columns from the target column (`avg_rating`)
X = bgg.drop(columns='avg_rating').copy()
y = bgg['avg_rating']

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## Initial linear regression

First let's see how simple linear regression fares. There are 2000 entries in the original data, so the train/test split leaves 1600 rows in the training data. 8 splits for K-Fold cross validation should be sufficient, so each slice is 200 rows. The rows were already shuffled by the train/test split, so they don't need to be shuffled again.

In [137]:
# Evaluate plain linear regression with 8-fold cross validation
cv = KFold(n_splits=8)
model = LinearRegression()

# The scorer returns the negated MSE of each fold
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')

# Average of the per-fold RMSEs
rmse = np.sqrt(np.absolute(scores)).mean()
rmse

20290175055.225716

Clearly, something isn't optimal. The range between the minimum score and maximum score is 2.7 points. We'd want the average RMSE to be smaller than this range, not 20 billion.

In [155]:
# Look at the RMSE of each split.
# `enumerate` yields the true fold position; looking the value up with
# `list(scores).index(...)` would report the first matching fold whenever two
# folds share the same score, and rescans the list on every iteration.
for fold, neg_mse in enumerate(scores):
    # Scores are negated MSE values, so take the magnitude before the root
    score = np.sqrt(np.absolute(neg_mse))
    print(f'Slice {fold} RMSE: {score}')

Slice 0 RMSE: 0.44547528136469133
Slice 1 RMSE: 0.3831761390119257
Slice 2 RMSE: 0.3741414727970673
Slice 3 RMSE: 0.36228473775631453
Slice 4 RMSE: 0.31812747589049367
Slice 5 RMSE: 0.3616080264109322
Slice 6 RMSE: 0.39168740199763735
Slice 7 RMSE: 162321400439.16922


There is something in the 8th slice that is going wrong. Let's dig in a little to see what it is. First, we need to see what the CV is actually predicting:

In [192]:
# Collect the out-of-fold prediction for every training row, reusing the same model and splitter
preds = cross_val_predict(model, X_train, y_train, cv=cv)

In [194]:
# Show the extremes of the cross-validated predictions
print(f'Max prediction: {preds.max()}\nMin prediction: {preds.min()}')

Max prediction: 11.409906691914584
Min prediction: -2295571259637.283


It looks like the highest prediction is within reason. Ideally, we'd want it below 10, but it isn't too far off. The lowest prediction is causing a problem.

In [207]:
# Find entry of smallest prediction
X_train.iloc[[list(preds).index(np.min(preds))]]

Unnamed: 0_level_0,min_players,max_players,avg_time,weight,age,Acting,Action Drafting,Action Points,Action Queue,Action Retrieval,...,World War II,Zombies,Abstract Games,Children's Games,Customizable Games,Family Games,Party Games,Strategy Games,Thematic Games,Wargames
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46213,3.064573,0.451541,-0.217288,-1.790141,0.301476,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [224]:
# Load the names df
names = pd.read_csv('bgg_names.csv', index_col='game_id')

# Find name of game causing the issue (game_id 46213, the row shown above).
# Use `.iloc` for the positional lookup: plain `[1]` on a label-indexed Series
# relies on a deprecated positional fallback.
names.loc[46213].iloc[1]

'Telestrations'

In [228]:
# Repeat the evaluation with `cross_validate`, which also returns training scores - may delete later
kf = KFold(n_splits=8)
model = LinearRegression()

scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
    return_train_score=True,
)


In [229]:
# Root of the mean test MSE across folds (not the mean of per-fold RMSEs,
# so the value differs from the `rmse` computed from `cross_val_score`)
np.sqrt(np.abs(scores['test_score'].mean()))

57389281491.1168

In [230]:
# Display the full `cross_validate` result: fit/score times plus the per-fold
# test and train scores (negated MSE)
scores

{'fit_time': array([0.01361346, 0.01625967, 0.01560736, 0.01716471, 0.01502013,
        0.01451659, 0.0158937 , 0.01537204]),
 'score_time': array([0.00245476, 0.00214791, 0.00206614, 0.00219345, 0.00214076,
        0.00240612, 0.00197291, 0.0019629 ]),
 'test_score': array([-1.98448226e-01, -1.46823954e-01, -1.39981842e-01, -1.31250231e-01,
        -1.01205091e-01, -1.30760365e-01, -1.53419021e-01, -2.63482370e+22]),
 'train_score': array([-0.0816828 , -0.08181892, -0.07979048, -0.08114058, -0.08439798,
        -0.0810409 , -0.07993021, -0.08299983])}