In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
cod = pd.read_csv("cod.csv")
cod = cod.drop('name', axis=1)
print(cod.head())
print(cod.describe())

   wins  kills  kdRatio  killstreak  level  losses  prestige  hits  \
0     0      0  0.00000           0      1       0         0     0   
1     0      0  0.00000           0      1       0       110     0   
2     0     66  1.03125           0      9       0       110     0   
3     3      2  0.40000           0      1       0         0     0   
4     0      2  0.20000           0      1       0       110     0   

   timePlayed  headshots  averageTime  gamesPlayed  assists  misses     xp  \
0           0          0          0.0            0        0       0      0   
1           7          0          7.0            0        0       0    700   
2          32         16         32.0            0        1       0  48300   
3           3          0          3.0            0        0       0   1150   
4           5          1          5.0            0        0       0   1000   

   scorePerMinute  shots  deaths  
0             0.0      0       0  
1             0.0      0      16  
2    

In [5]:
# Calculate Accuracy by dividing hits by shots, handling division by zero
mask = cod['shots'] != 0
cod.loc[mask, 'Accuracy'] = cod['hits'] / cod['shots']
cod.loc[~mask, 'Accuracy'] = pd.NA

# Calculate Headshot Ratio by dividing headshots by kills, handling division by zero
mask = cod['kills'] != 0
cod.loc[mask, 'Headshot Ratio'] = cod['headshots'] / cod['kills']
cod.loc[~mask, 'Headshot Ratio'] = pd.NA

print(cod.columns)

Index(['wins', 'kills', 'kdRatio', 'killstreak', 'level', 'losses', 'prestige',
       'hits', 'timePlayed', 'headshots', 'averageTime', 'gamesPlayed',
       'assists', 'misses', 'xp', 'scorePerMinute', 'shots', 'deaths',
       'Accuracy', 'Headshot Ratio'],
      dtype='object')


In [7]:
X = cod.drop('wins', axis=1)
y = cod['wins']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=102)

# Define preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=1, include_bias=False)),
    ('standard', StandardScaler())
])

# Define the parameter grid for GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 0.9, 1.0]
}

# Create GridSearchCV pipeline
grid_pipe = Pipeline([
    ('preprocess', preprocessing_pipeline),
    ('model', GradientBoostingRegressor(random_state=102))
])

In [8]:
# Perform GridSearchCV
grid_search = GridSearchCV(grid_pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [9]:
# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Parameters:", best_params)

# Evaluate the best model on train and test data
train_preds = best_estimator.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Train MSE: {mse_train} \n")

test_preds = best_estimator.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Test MSE: {mse_test} \n")
print(f"Test RMSE: {np.sqrt(mse_test)} \n")

Best Parameters: {'model__learning_rate': 0.05, 'model__max_depth': 5, 'model__n_estimators': 100, 'model__subsample': 1.0}
Train MSE: 315.6292906742566 

Test MSE: 6809.872708995792 

Test RMSE: 82.52195289131124 



In [16]:
best_params['model__n_estimators'] = 500  # Increase the number of estimators
best_params['model__learning_rate'] = 0.01  # Lower the learning rate
best_params['model__max_depth'] = 5  # Keep the maximum depth
best_params['model__subsample'] = 0.8  # Apply a subsample

# Update the model with the adjusted parameters
best_estimator.set_params(**best_params)

# Evaluate the updated model on train and test data
train_preds = best_estimator.predict(X_train)
mse_train = mean_squared_error(y_train, train_preds)
print(f"Train MSE: {mse_train} \n")

test_preds = best_estimator.predict(X_test)
mse_test = mean_squared_error(y_test, test_preds)
print(f"Test MSE: {mse_test} \n")
print(f"Test RMSE: {np.sqrt(mse_test)} \n")

Train MSE: 62221.066749621794 

Test MSE: 50935.39986545725 

Test RMSE: 225.68872339010926 

