In [2]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
import os
os.chdir('/Users/justinross/Documents/BYU/stat486/CallofDuty')

In [3]:
# read in data
cod = pd.read_csv("cod.csv")
cod = cod.drop('name', axis=1)

# feature engineering

# Calculate Accuracy by dividing hits by shots, handling division by zero
mask = cod['shots'] != 0  # Create a mask for non-zero shots
cod.loc[mask, 'Accuracy'] = cod['hits'] / cod['shots']
cod.loc[~mask, 'Accuracy'] = pd.NA  # Set Accuracy to pd.NA for zero shots

# Calculate Headshot Ratio by dividing headshots by kills, handling division by zero
mask = cod['kills'] != 0  # Create a mask for non-zero shots
cod.loc[mask, 'Headshot Ratio'] = cod['headshots'] / cod['kills']
cod.loc[~mask, 'Headshot Ratio'] = pd.NA  # Set Accuracy to pd.NA for zero shots

# Calculate the first quartile (Q1) and third quartile (Q3)
Q1 = cod['gamesPlayed'].quantile(0.25)
Q3 = cod['gamesPlayed'].quantile(0.75)

# Calculate the IQR (Interquartile Range)
IQR = Q3 - Q1

# Define the upper and lower bounds to identify outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out rows where data points are outside the bounds
cod = cod[(cod['gamesPlayed'] >= lower_bound) & (cod['gamesPlayed'] <= upper_bound)]
cod = cod[cod['gamesPlayed'] != 0]

X = cod.drop('wins', axis=1)
y = cod['wins']

# create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=102)

In [3]:
# grid search to tune hyperparameters

pipe = Pipeline([
  ('impute', SimpleImputer()),
  ('poly', PolynomialFeatures(include_bias = True)),
  ('standard', StandardScaler()),
  ('model', KNeighborsRegressor())
])

params = {
  'impute__strategy':('mean','median'), 
  'poly__degree':(1,2,3),
  'model__n_neighbors': list(range(5, 101, 5)),
  'model__weights': ['uniform', 'distance']
}

gs = GridSearchCV(pipe, param_grid = params, scoring = 'neg_mean_squared_error', cv = 10)
gs.fit(X_train, y_train)

# best hyperparameter combinations
best_params = gs.best_params_
print(f'Best parameters: {best_params} \n')

# best MSE
best_mse = -gs.best_score_ 
print(f'Best MSE: {best_mse} \n')

best_model = gs.best_estimator_
preds = best_model.predict(X_test)

# test MSE
test_mse = mean_squared_error(y_test, preds)
print(f'Test MSE: {test_mse}')
print(f'Test RMSE: {np.sqrt(test_mse)}')

Best parameters: {'impute__strategy': 'mean', 'model__n_neighbors': 10, 'model__weights': 'distance', 'poly__degree': 2} 

Best MSE: 6069.731047376114 

Test MSE: 4577.798187477631
Test RMSE: 67.65942792750786


In [4]:
# implement the model with the tuned hyperparameters
pipe = Pipeline([
  ('impute', SimpleImputer(strategy = 'mean')),
  ('standard', StandardScaler()),
  ('model', KNeighborsRegressor(n_neighbors=5, weights='uniform'))
])

# fit pipeline to training data
pipe.fit(X_train, y_train)

# training MSE
train_preds = pipe.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Using KNN, the Train MSE is: {mse_train}")

# test MSE
test_preds = pipe.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Using KNN, the Test MSE is: {mse_test}")
print(f"Using KNN, the Test RMSE is: {np.sqrt(mse_test)} \n")

print(f"The variance of y_test is {np.var(y_test)}")
print(f"The sd of y_test is {np.std(y_test)} \n")

Using KNN, the Train MSE is: 4090.8223076923086
Using KNN, the Test MSE is: 5006.149885057472
Using KNN, the Test RMSE is: 70.75415100937522 

The variance of y_test is 19105.80724005813
The sd of y_test is 138.2237578712796 



This KNN model is overfitting, as the train MSE (8125) is lower than the Test MSE (11609). I will now try some regularization methods to combat overfitting.

In [4]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV

process = Pipeline([
    ('impute', SimpleImputer(strategy = 'mean')),
    ('standard', StandardScaler())
])

X_train = process.fit_transform(X_train)
X_test = process.transform(X_test)

alpha_values = np.logspace(-6, 6, 30)
ridge = RidgeCV(alphas = alpha_values, cv=10)
ridge.fit(X_train, y_train)

#(b) What is the optimal value for alpha?
print(f"Using Ridge, the optimal alpha value is {ridge.alpha_}")

# What is the train MSE?
preds = ridge.predict(X_train)
trainMSE = mean_squared_error(y_train, preds)
print(f"Using Ridge, the Train MSE is {trainMSE}")
print(f"Using Ridge, the Train RMSE is: {np.sqrt(trainMSE)} \n")

#(c) What is the test MSE?
preds = ridge.predict(X_test)
testMSE = mean_squared_error(y_test, preds)
print(f"Using Ridge, the Test MSE is: {testMSE}")
print(f"Using Ridge, the Test RMSE is: {np.sqrt(testMSE)} \n")
coefficients = ridge.coef_
feature_names = X.columns.tolist()

for feature_name, coefficient in zip(feature_names, coefficients):
    print(f"{feature_name}: {coefficient:.4f}")

Using Ridge, the optimal alpha value is 28.072162039411758
Using Ridge, the Train MSE is 3984.2251012928277
Using Ridge, the Train RMSE is: 63.12071847890222 

Using Ridge, the Test MSE is: 3559.019784713429
Using Ridge, the Test RMSE is: 59.65752077243429 

kills: 10.3148
kdRatio: -5.9287
killstreak: 0.3404
level: 3.8123
losses: -1.2889
prestige: -2.1947
hits: -3.0676
timePlayed: 51.3434
headshots: 11.1368
averageTime: -8.1048
gamesPlayed: 39.5710
assists: -18.5048
misses: 13.7022
xp: 7.8497
scorePerMinute: -12.0105
shots: 10.6821
deaths: 16.6452
Accuracy: 1.0303
Headshot Ratio: -0.9177


In [6]:
lasso = LassoCV(cv=10)
lasso.fit(X_train, y_train)

# What is the optimal value for alpha?
print()
print(f"Using Lasso, the optimal alpha value is {lasso.alpha_}")

#(c) What is the train MSE?
preds = lasso.predict(X_train)
trainMSE = mean_squared_error(y_train, preds)
print(f"Using Lasso, the Train MSE is {trainMSE}")
print(f"Using Lasso, the Train RMSE is: {np.sqrt(trainMSE)} \n")

# What is the test MSE?
preds = lasso.predict(X_test)
testMSE = mean_squared_error(y_test, preds)
print(f"Using Lasso, the Test MSE is {testMSE}")
print(f"Using Lasso, the Test RMSE is: {np.sqrt(testMSE)} \n")


Using Lasso, the optimal alpha value is 3.1748294840954134
Using Lasso, the Train MSE is 4108.0082213274345
Using Lasso, the Train RMSE is: 64.09374557105735 

Using Lasso, the Test MSE is 3587.5581582975506
Using Lasso, the Test RMSE is: 59.89622824767475 



In [5]:
# lets try SVM
from sklearn.svm import SVC
from sklearn.svm import SVR
pipe = Pipeline([
  ('impute', SimpleImputer(strategy = 'mean')),
  ('standard', StandardScaler()),
  ('model', SVR())
])

pipe.fit(X_train, y_train)
# training MSE
train_preds = pipe.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Train MSE: {mse_train} \n")

# test MSE
test_preds = pipe.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Test MSE: {mse_test} \n")
print(f"Test RMSE: {np.sqrt(mse_test)} \n")

print(f"The variance of y_test is {np.var(y_test)} \n")
print(f"The sd of y_test is {np.std(y_test)} \n")

Train MSE: 15923.2902709813 

Test MSE: 14243.800088918537 

Test RMSE: 119.34739246803231 

The variance of y_test is 19105.80724005813 

The sd of y_test is 138.2237578712796 



In [6]:
# grid search to tune hyperparameters

pipe = Pipeline([
  ('impute', SimpleImputer()),
  ('poly', PolynomialFeatures(include_bias = True)),
  ('standard', StandardScaler()),
  ('model', SVR())
])

params = {
    'impute__strategy': ('mean', 'median'),
    'model__C': [0.05, 0.1, 1],  # Regularization parameter
    'model__kernel': ['linear', 'rbf', 'poly'],  # Kernel type
    'model__gamma': ['scale', 'auto']  # Kernel coefficient (for 'rbf' and 'poly')
}

gs = GridSearchCV(pipe, param_grid = params, scoring = 'neg_mean_squared_error', cv = 10)
gs.fit(X_train, y_train)

# best hyperparameter combinations
best_params = gs.best_params_
print(f'Best parameters: {best_params} \n')

# best MSE
best_mse = -gs.best_score_ 
print(f'Best MSE: {best_mse} \n')

best_model = gs.best_estimator_
preds = best_model.predict(X_test)

# test MSE
test_mse = mean_squared_error(y_test, preds)
print(f'Test MSE: {test_mse}')
print(f'Test RMSE: {np.sqrt(test_mse)}')

Best parameters: {'impute__strategy': 'mean', 'model__C': 0.1, 'model__gamma': 'scale', 'model__kernel': 'linear'} 

Best MSE: 6788.430917656832 

Test MSE: 3588.211598917541
Test RMSE: 59.90168277200183
