In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.linear_model import LinearRegression

from helper_functions import find_outlier_games

In [2]:
# Read the processed dataset produced by the EDA notebook, indexed by `game_id`
bgg = pd.read_csv(
    'bgg_processed.csv',
    index_col='game_id',
)

In [3]:
# Separate the regression target (`avg_rating`) from the remaining feature columns.
# `X` is an independent copy, so later edits to it cannot touch `bgg`.
X = bgg.drop(columns='avg_rating').copy()
y = bgg['avg_rating']

In [4]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Initial linear regression

First let's see how simple linear regression fares. There are 2000 entries in the original data, so the train/test split leaves 1600 rows in the training data. 8 splits for K-Fold cross validation should be sufficient, so that each slice is 200 rows. The rows were already shuffled by the train/test split, so they don't need to be shuffled again.

In [5]:
# Baseline: plain linear regression evaluated with 8-fold cross validation
# (no shuffling inside KFold).
model = LinearRegression()
cv = KFold(n_splits=8)

scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')

# The scores are negated MSEs: take the magnitude and the square root of each
# fold to get per-fold RMSEs, then average them.
rmse = np.sqrt(np.abs(scores)).mean()
rmse

4825688010.338199

Clearly, something isn't right. The range between the minimum score and maximum score is 2.7 points. We'd want the average RMSE to be much smaller than this range, not nearly 5 billion.

In [6]:
# Look at scores from each split.
# `enumerate` supplies the fold number directly; looking the value up with
# `list(scores).index(...)` would report the wrong slice if two folds ever
# produced the same score, and rebuilds the list on every iteration.
for fold, neg_mse in enumerate(scores):
    # Each entry is a negated MSE, so take its magnitude before the root
    score = np.sqrt(np.absolute(neg_mse))
    print(f'Slice {fold} RMSE: {score}')

Slice 0 RMSE: 0.4454752813646904
Slice 1 RMSE: 0.3831761390119256
Slice 2 RMSE: 0.37414147279706716
Slice 3 RMSE: 0.36228473775631465
Slice 4 RMSE: 0.3181274758904935
Slice 5 RMSE: 0.36160802641093226
Slice 6 RMSE: 0.39168740199763713
Slice 7 RMSE: 38605504080.06909


There is something in the 8th slice that is going wrong. Let's dig in a little to see what it is. First, we need to see what the CV is actually predicting:

In [7]:
# Collect the prediction for every training row, reusing the same
# model and fold definition as the scoring run above
preds = cross_val_predict(model, X_train, y_train, cv=cv)

In [8]:
# Display the cross-validated predictions (NumPy abbreviates the long array)
preds

array([7.18031593, 7.14230428, 6.73584719, ..., 7.52143596, 7.22272989,
       7.19186905])

In [9]:
# Report the largest and smallest cross-validated predictions
print(f'Max prediction: {preds.max()}\nMin prediction: {preds.min()}')

Max prediction: 545964274530.2242
Min prediction: 5.799880689399389


It looks like the lowest prediction (about 5.8) is within reason. The highest prediction is what is causing the problem: ideally we'd want every prediction to be below 10, but this one is in the hundreds of billions.

In [10]:
# Find the game_id of the largest prediction (the runaway value seen above).
# `preds` lines up row-for-row with `X_train`, so the position of the maximum
# can be used directly against the training index.
X_train.index[np.argmax(preds)]

46213

In [11]:
# Load the names df
names = pd.read_csv('bgg_names.csv', index_col='game_id')

# Find name of game causing the issue.
# `.iloc[1]` selects the second field of the row by position explicitly; a bare
# `[1]` on a label-indexed Series relies on the integer-position fallback that
# pandas has deprecated.
outlier_id = 46213  # game_id reported by the previous cell
names.loc[outlier_id].iloc[1]

'Telestrations'

The model is predicting a highly anomalous result for the game called "Telestrations". Is this result specific to using 8 K-Fold splits, or does it also appear with other numbers of folds?

In [12]:
# Call `find_outlier_games` once for each fold count from 2 through 8
# and print whatever it returns for that setting
for n_folds in range(2, 9):
    outliers = find_outlier_games(X_train, y_train, names, n_folds)
    print(f'{n_folds} folds:')
    print('\tOutlier games:')
    print('\t', outliers)

2 folds:
	Outlier games:
	 {}
3 folds:
	Outlier games:
	 {}
4 folds:
	Outlier games:
	 {}
5 folds:
	Outlier games:
	 {}
6 folds:
	Outlier games:
	 {}
7 folds:
	Outlier games:
	 {}
8 folds:
	Outlier games:
	 {}


In [13]:
# Use `cross_validate` instead - may delete later
# NOTE: this rebinds `scores` from the earlier array of fold scores to the
# result returned by `cross_validate`, which also carries the training scores.
model = LinearRegression()
kf = KFold(n_splits=8)

scores = cross_validate(
    model,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=kf,
    return_train_score=True,
)


In [14]:
# Root of the mean test MSE across folds. This averages the MSEs first and
# then takes the root, so it is not the same figure as the earlier mean of
# per-fold RMSEs.
np.sqrt(np.abs(np.mean(scores['test_score'])))

13649106863.070892

In [15]:
# Show the full `cross_validate` result: fit/score times plus per-fold test and train scores
scores

{'fit_time': array([0.12012172, 0.10081029, 0.09701681, 0.09751463, 0.0964694 ,
        0.09616995, 0.09746647, 0.09464836]),
 'score_time': array([0.00398755, 0.00398874, 0.00398946, 0.00398898, 0.00322652,
        0.00456572, 0.00406599, 0.00398946]),
 'test_score': array([-1.98448226e-01, -1.46823954e-01, -1.39981842e-01, -1.31250231e-01,
        -1.01205091e-01, -1.30760365e-01, -1.53419021e-01, -1.49038495e+21]),
 'train_score': array([-0.0816828 , -0.08181892, -0.07979048, -0.08114058, -0.08439798,
        -0.0810409 , -0.07993021, -0.08299909])}