In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
cod = pd.read_csv("cod.csv")
cod = cod.drop('name', axis=1)
# Calculate the first quartile (Q1) and third quartile (Q3)
Q1 = cod['gamesPlayed'].quantile(0.25)
Q3 = cod['gamesPlayed'].quantile(0.75)

# Calculate the IQR (Interquartile Range)
IQR = Q3 - Q1

# Define the upper and lower bounds to identify outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out rows where data points are outside the bounds
cod = cod[(cod['gamesPlayed'] >= lower_bound) & (cod['gamesPlayed'] <= upper_bound)]
cod = cod[cod['gamesPlayed'] != 0]
print(cod.head())
print(cod.describe())

    wins  kills   kdRatio  killstreak  level  losses  prestige  hits  \
6      4    162  0.632812           4      6       2         0   568   
7    186   1898  0.569628          13     37       7         2  5111   
9     26    349  0.444020           7     12       4         0   996   
12   188   1949  1.495779          22     53       4        57  3333   
14    15    132  0.631579           7      5       4         0   549   

    timePlayed  headshots  averageTime  gamesPlayed  assists  misses      xp  \
6            8         35     2.000000            4       68    4836   24485   
7          550        485     3.666667          150      488   39978  458269   
9           44         40     2.933333           15      138    4844   72765   
12         409        536    16.360000           25      150   10511  909543   
14          21         25     3.500000            6       31    3153   18430   

    scorePerMinute  shots  deaths  
6       265.500000   5404     256  
7       180.37

In [3]:
# Calculate Accuracy by dividing hits by shots, handling division by zero
mask = cod['shots'] != 0
cod.loc[mask, 'Accuracy'] = cod['hits'] / cod['shots']
cod.loc[~mask, 'Accuracy'] = pd.NA

# Calculate Headshot Ratio by dividing headshots by kills, handling division by zero
mask = cod['kills'] != 0
cod.loc[mask, 'Headshot Ratio'] = cod['headshots'] / cod['kills']
cod.loc[~mask, 'Headshot Ratio'] = pd.NA

print(cod.columns)

Index(['wins', 'kills', 'kdRatio', 'killstreak', 'level', 'losses', 'prestige',
       'hits', 'timePlayed', 'headshots', 'averageTime', 'gamesPlayed',
       'assists', 'misses', 'xp', 'scorePerMinute', 'shots', 'deaths',
       'Accuracy', 'Headshot Ratio'],
      dtype='object')


In [4]:
X = cod.drop('wins', axis=1)
y = cod['wins']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=102)

# Define preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)),
    ('standard', StandardScaler())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 0.9, 1.0]
}

# Create GridSearchCV pipeline
grid_pipe = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('model', GradientBoostingRegressor(random_state=102))
])

In [5]:
# Perform GridSearchCV
grid_search = GridSearchCV(grid_pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [6]:
# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the best model on train and test data
train_preds = best_estimator.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Train MSE: {mse_train} \n")

test_preds = best_estimator.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Test MSE: {mse_test} \n")
print(f"Test RMSE: {np.sqrt(mse_test)} \n")

Best Parameters: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 300, 'model__subsample': 1.0}
Train MSE: 345.8191206478285 

Test MSE: 4262.710326448052 

Test RMSE: 65.28943502932195 



In [16]:
best_params['model__n_estimators'] = 500  # Increase the number of estimators
best_params['model__learning_rate'] = 0.01  # Lower the learning rate
best_params['model__max_depth'] = 5  # Keep the maximum depth
best_params['model__subsample'] = 0.8  # Apply a subsample

# Update the model with the adjusted parameters
best_estimator.set_params(**best_params)

# Evaluate the updated model on train and test data
train_preds = best_estimator.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Train MSE: {mse_train} \n")

test_preds = best_estimator.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Test MSE: {mse_test} \n")
print(f"Test RMSE: {np.sqrt(mse_test)} \n")

Train MSE: 62221.066749621794 

Test MSE: 50935.39986545725 

Test RMSE: 225.68872339010926 

