In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
import os
os.chdir('/Users/justinross/Documents/BYU/stat486/CallofDuty')

In [2]:
# read in data
cod = pd.read_csv("cod.csv")
cod = cod.drop('name', axis=1)

# feature engineering

# Calculate Accuracy by dividing hits by shots, handling division by zero
mask = cod['shots'] != 0  # Create a mask for non-zero shots
cod.loc[mask, 'Accuracy'] = cod['hits'] / cod['shots']
cod.loc[~mask, 'Accuracy'] = pd.NA  # Set Accuracy to pd.NA for zero shots

# Calculate Headshot Ratio by dividing headshots by kills, handling division by zero
mask = cod['kills'] != 0  # Create a mask for non-zero shots
cod.loc[mask, 'Headshot Ratio'] = cod['headshots'] / cod['kills']
cod.loc[~mask, 'Headshot Ratio'] = pd.NA  # Set Accuracy to pd.NA for zero shots

X = cod.drop('wins', axis=1)
y = cod['wins']

# create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=102)

In [3]:
# grid search to tune hyperparameters

pipe = Pipeline([
  ('impute', SimpleImputer()),
  ('poly', PolynomialFeatures(include_bias = True)),
  ('standard', StandardScaler()),
  ('model', KNeighborsRegressor())
])

params = {
  'impute__strategy':('mean','median'), 
  'poly__degree':(1,2,3),
  'model__n_neighbors': list(range(5, 101, 5)),
  'model__weights': ['uniform', 'distance']
}

gs = GridSearchCV(pipe, param_grid = params, scoring = 'neg_mean_squared_error', cv = 10)
gs.fit(X_train, y_train)

# best hyperparameter combinations
best_params = gs.best_params_
print(f'Best parameters: {best_params} \n')

# best MSE
best_mse = -gs.best_score_ 
print(f'Best MSE: {best_mse} \n')

best_model = gs.best_estimator_
preds = best_model.predict(X_test)

# test MSE
test_mse = mean_squared_error(y_test, preds)
print(f'Test MSE: {test_mse}')
print(f'Test RMSE: {np.sqrt(test_mse)}')

Best parameters: {'impute__strategy': 'median', 'model__n_neighbors': 5, 'model__weights': 'uniform', 'poly__degree': 1} 

Best MSE: 12651.044518420278 

Test MSE: 11609.73558974359
Test RMSE: 107.74848300437269


In [4]:
# implement the model with the tuned hyperparameters
pipe = Pipeline([
  ('impute', SimpleImputer(strategy = 'mean')),
  ('standard', StandardScaler()),
  ('model', KNeighborsRegressor(n_neighbors=5, weights='uniform'))
])

# fit pipeline to training data
pipe.fit(X_train, y_train)

# training MSE
train_preds = pipe.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Using KNN, the Train MSE is: {mse_train}")

# test MSE
test_preds = pipe.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Using KNN, the Test MSE is: {mse_test}")
print(f"Using KNN, the Test RMSE is: {np.sqrt(mse_test)} \n")

print(f"The variance of y_test is {np.var(y_test)}")
print(f"The sd of y_test is {np.std(y_test)} \n")

Using KNN, the Train MSE is: 8123.9631849315065
Using KNN, the Test MSE is: 11612.901333333331
Using KNN, the Test RMSE is: 107.76317243536091 

The variance of y_test is 74958.43498356345
The sd of y_test is 273.78538124517064 



This KNN model is overfitting, as the train MSE (8125) is lower than the Test MSE (11609). I will now try some regularization methods to combat overfitting.

In [5]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV

process = Pipeline([
    ('impute', SimpleImputer(strategy = 'mean')),
    ('standard', StandardScaler())
])

X_train = process.fit_transform(X_train)
X_test = process.transform(X_test)

alpha_values = np.logspace(-6, 6, 30)
ridge = RidgeCV(alphas = alpha_values, cv=10)
ridge.fit(X_train, y_train)

#(b) What is the optimal value for alpha?
print(f"Using Ridge, the optimal alpha value is {ridge.alpha_}")

# What is the train MSE?
preds = ridge.predict(X_train)
trainMSE = mean_squared_error(y_train, preds)
print(f"Using Ridge, the Train MSE is {trainMSE}")
print(f"Using Ridge, the Train RMSE is: {np.sqrt(trainMSE)} \n")

#(c) What is the test MSE?
preds = ridge.predict(X_test)
testMSE = mean_squared_error(y_test, preds)
print(f"Using Ridge, the Test MSE is: {testMSE}")
print(f"Using Ridge, the Test RMSE is: {np.sqrt(testMSE)} \n")
coefficients = ridge.coef_
feature_names = X.columns.tolist()

for feature_name, coefficient in zip(feature_names, coefficients):
    print(f"{feature_name}: {coefficient:.4f}")

Using Ridge, the optimal alpha value is 10.82636733874054
Using Ridge, the Train MSE is 4814.805932744393
Using Ridge, the Train RMSE is: 69.38880264671234 

Using Ridge, the Test MSE is: 7282.26717740987
Using Ridge, the Test RMSE is: 85.33620086112265 

kills: 37.6757
kdRatio: 0.3005
killstreak: 6.9900
level: 13.1615
losses: -7.4831
prestige: -1.2794
hits: -12.2836
timePlayed: 180.7026
headshots: -10.9353
averageTime: -10.9021
gamesPlayed: 116.6350
assists: -49.7065
misses: 14.2236
xp: -42.2006
scorePerMinute: 3.9169
shots: 9.1478
deaths: 48.2931
Accuracy: 3.1387
Headshot Ratio: -0.8850


In [6]:
lasso = LassoCV(cv=10)
lasso.fit(X_train, y_train)

# What is the optimal value for alpha?
print()
print(f"Using Lasso, the optimal alpha value is {lasso.alpha_}")

#(c) What is the train MSE?
preds = lasso.predict(X_train)
trainMSE = mean_squared_error(y_train, preds)
print(f"Using Lasso, the Train MSE is {trainMSE}")
print(f"Using Lasso, the Train RMSE is: {np.sqrt(trainMSE)} \n")

# What is the test MSE?
preds = lasso.predict(X_test)
testMSE = mean_squared_error(y_test, preds)
print(f"Using Lasso, the Test MSE is {testMSE}")
print(f"Using Lasso, the Test RMSE is: {np.sqrt(testMSE)} \n")


Using Lasso, the optimal alpha value is 0.7227887153497466
Using Lasso, the Train MSE is 4847.7136067630945
Using Lasso, the Train RMSE is: 69.62552410404602 

Using Lasso, the Test MSE is 7172.186372166999
Using Lasso, the Test RMSE is: 84.68876178199206 



In [7]:
# lets try SVM
from sklearn.svm import SVC
pipe = Pipeline([
  ('impute', SimpleImputer(strategy = 'mean')),
  ('standard', StandardScaler()),
  ('model', SVC())
])

pipe.fit(X_train, y_train)
# training MSE
train_preds = pipe.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Train MSE: {mse_train} \n")

# test MSE
test_preds = pipe.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Test MSE: {mse_test} \n")
print(f"Test RMSE: {np.sqrt(mse_test)} \n")

print(f"The variance of y_test is {np.var(y_test)} \n")
print(f"The sd of y_test is {np.std(y_test)} \n")

Train MSE: 19222.342465753423 

Test MSE: 53643.071794871794 

Test RMSE: 231.60974028497117 

The variance of y_test is 74958.43498356345 

The sd of y_test is 273.78538124517064 



In [8]:
# grid search to tune hyperparameters

pipe = Pipeline([
  ('impute', SimpleImputer()),
  ('poly', PolynomialFeatures(include_bias = True)),
  ('standard', StandardScaler()),
  ('model', SVC())
])

params = {
    'impute__strategy': ('mean', 'median'),
    'model__C': [0.05, 0.1, 1],  # Regularization parameter
    'model__kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'model__gamma': ['scale', 'auto']  # Kernel coefficient (for 'rbf' and 'poly')
}

gs = GridSearchCV(pipe, param_grid = params, scoring = 'neg_mean_squared_error', cv = 10)
gs.fit(X_train, y_train)

# best hyperparameter combinations
best_params = gs.best_params_
print(f'Best parameters: {best_params} \n')

# best MSE
best_mse = -gs.best_score_ 
print(f'Best MSE: {best_mse} \n')

best_model = gs.best_estimator_
preds = best_model.predict(X_test)

# test MSE
test_mse = mean_squared_error(y_test, preds)
print(f'Test MSE: {test_mse}')
print(f'Test RMSE: {np.sqrt(test_mse)}')



Best parameters: {'impute__strategy': 'mean', 'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'linear'} 

Best MSE: 21896.415237253164 

Test MSE: 16350.502564102564
Test RMSE: 127.86908369149505
