In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import csv as pandas dataframe
df = pd.read_csv("high_diamond_ranked_10min.csv")
df

Unnamed: 0,gameId,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,4519157822,0,28,2,1,9,6,11,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,4523371949,0,12,1,0,5,5,5,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,4521474530,0,15,0,0,7,11,4,1,1,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,4524384067,0,43,1,0,4,5,5,1,0,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,4436033771,0,75,4,0,6,6,6,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9874,4527873286,1,17,2,1,7,4,5,1,1,...,0,15246,6.8,16498,229,34,-2519,-2469,22.9,1524.6
9875,4527797466,1,54,0,0,6,4,8,1,1,...,0,15456,7.0,18367,206,56,-782,-888,20.6,1545.6
9876,4527713716,0,23,1,0,6,7,5,0,0,...,0,18319,7.4,19909,261,60,2416,1877,26.1,1831.9
9877,4527628313,0,14,4,1,2,3,3,1,1,...,0,15298,7.2,18314,247,40,839,1085,24.7,1529.8


In [3]:
# columns selected based on correlation calculated in the previous analysis
cols = ['blueTotalGold', 'blueTotalExperience', 'redTotalGold', 'redTotalExperience']
cols

['blueTotalGold', 'blueTotalExperience', 'redTotalGold', 'redTotalExperience']

In [4]:
df_clean = df[cols]
df_clean.head()

Unnamed: 0,blueTotalGold,blueTotalExperience,redTotalGold,redTotalExperience
0,17210,17039,16567,17047
1,14712,16265,17620,17438
2,16113,16221,17285,17254
3,15157,17954,16478,17961
4,16400,18543,17404,18313


In [5]:
# split data
from sklearn.model_selection import train_test_split
y = df["blueWins"].values.reshape(-1, 1)
target_names = ["blueWins", "redWins"]
X = df_clean
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify = y)

In [6]:
X_train.shape,X_test.shape,y_train.shape, y_test.shape

((7409, 4), (2470, 4), (7409, 1), (2470, 1))

In [7]:
# train the model
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(max_iter = 1000)
logistic_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
print(f"Training Data Score: {logistic_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logistic_model.score(X_test, y_test)}")

Training Data Score: 0.7300580375219328
Testing Data Score: 0.719838056680162


In [9]:
# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
logistic_param_grid = {'C':[1,2,3,4,5,6,7,8,9,10], 'penalty':['l2','none']}
logistic_grid = GridSearchCV(logistic_model, logistic_param_grid)

In [10]:
logistic_grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'penalty': ['l2', 'none']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [11]:
print(logistic_grid.best_params_)
print(logistic_grid.best_score_)

{'C': 1, 'penalty': 'l2'}
0.730058200089118


In [12]:
logistic_predictions = logistic_grid.predict(X_test)

In [13]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, logistic_predictions,
                            target_names = ["blueWins", "redWins"]))

              precision    recall  f1-score   support

    blueWins       0.72      0.72      0.72      1237
     redWins       0.72      0.72      0.72      1233

    accuracy                           0.72      2470
   macro avg       0.72      0.72      0.72      2470
weighted avg       0.72      0.72      0.72      2470



In [14]:
# Save the model for later use.
import pickle
filename = 'LogisticRegression_model.pkl'
pickle.dump(logistic_grid, open(filename, 'wb'))

In [15]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.719838056680162
