In [58]:
# import libraries
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, ElasticNet, ElasticNetCV 
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor

sns.set_style()

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.2f}'.format #suppress scientific notations when using the Describe function

In [59]:
train = pd.read_csv("train.csv")

train = train.iloc[:, 1:] #remove unnamed columns
train

Unnamed: 0,assists,creeps_stacked,deaths,denies,duration,hero_damage,kills,last_hits,levels,name,net,obs_placed,sen_placed,tower_damage,xpm
0,56,10.00,10,50,34.40,93866.00,35,1297,109,Royal Never Give Up,83794.00,17.00,32.00,15696.00,3231
1,24,3.00,35,29,34.40,55124.00,10,910,87,Team Aster,53947.00,17.00,28.00,1414.00,2203
2,77,16.00,28,49,44.33,102842.00,26,968,107,Team Liquid,83236.00,16.00,22.00,11884.00,2295
3,86,14.00,28,21,44.33,100528.00,28,800,105,OG,70296.00,17.00,16.00,11822.00,2168
4,35,18.19,7,25,18.70,81486.76,20,334,57,Team Secret,41901.00,13.50,19.03,9776.29,2064
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459,41,4.00,29,22,33.20,66631.00,18,597,81,Fnatic,41416.00,14.00,28.00,960.00,1877
3460,8,17.00,20,23,25.42,44724.00,4,610,66,Royal Never Give Up,40010.00,11.00,14.00,2231.00,1892
3461,38,22.00,4,46,25.42,55940.00,20,768,84,Team Aster,58134.00,13.00,16.00,11663.00,2706
3462,70,21.00,18,35,35.26,74985.00,25,1049,107,Evil Geniuses,80865.00,15.00,25.00,9341.00,2996


## One-hot encoding
---
11 new features were created following one-hot encoding with get_dummies.

In [60]:
train_dummies = pd.get_dummies(train, drop_first = True) #onehotencoding the team_names
train_dummies

Unnamed: 0,assists,creeps_stacked,deaths,denies,duration,hero_damage,kills,last_hits,levels,net,obs_placed,sen_placed,tower_damage,xpm,name_Evil Geniuses,name_Fnatic,name_Nigma Galaxy,name_OG,name_PSG.LGD,name_Royal Never Give Up,name_Team Aster,name_Team Liquid,name_Team Secret,name_Team Spirit,name_Thunder Awaken,name_Tundra Esports
0,56,10.00,10,50,34.40,93866.00,35,1297,109,83794.00,17.00,32.00,15696.00,3231,0,0,0,0,0,1,0,0,0,0,0,0
1,24,3.00,35,29,34.40,55124.00,10,910,87,53947.00,17.00,28.00,1414.00,2203,0,0,0,0,0,0,1,0,0,0,0,0
2,77,16.00,28,49,44.33,102842.00,26,968,107,83236.00,16.00,22.00,11884.00,2295,0,0,0,0,0,0,0,1,0,0,0,0
3,86,14.00,28,21,44.33,100528.00,28,800,105,70296.00,17.00,16.00,11822.00,2168,0,0,0,1,0,0,0,0,0,0,0,0
4,35,18.19,7,25,18.70,81486.76,20,334,57,41901.00,13.50,19.03,9776.29,2064,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3459,41,4.00,29,22,33.20,66631.00,18,597,81,41416.00,14.00,28.00,960.00,1877,0,1,0,0,0,0,0,0,0,0,0,0
3460,8,17.00,20,23,25.42,44724.00,4,610,66,40010.00,11.00,14.00,2231.00,1892,0,0,0,0,0,1,0,0,0,0,0,0
3461,38,22.00,4,46,25.42,55940.00,20,768,84,58134.00,13.00,16.00,11663.00,2706,0,0,0,0,0,0,1,0,0,0,0,0
3462,70,21.00,18,35,35.26,74985.00,25,1049,107,80865.00,15.00,25.00,9341.00,2996,1,0,0,0,0,0,0,0,0,0,0,0


## Train-test-split

In [61]:
x = train_dummies.loc[:, train_dummies.columns != 'xpm']
y = train_dummies[['xpm']]

print(x.shape)
print(y.shape)

# train test split
xtest, xtrain, ytest, ytrain = train_test_split(x, y, test_size = 0.3, random_state = 42)
    
print(xtrain.shape)
print(ytrain.shape)

(3464, 25)
(3464, 1)
(1040, 25)
(1040, 1)


## Scaling Data
---
Pertaining to scaling data, `StandardScaler` was chosen for its standard way to scale data for regularised regressions.

In [62]:
# scale data - standardscaler
scaler = StandardScaler()

xtrain_scaled = scaler.fit_transform(xtrain)
xtest_scaled = scaler.transform(xtest)

pickle.dump(scaler, open('./scaler.bin', 'wb')) #save it in a pickle for prediction based on input later


## Hyperparameter Tuning

### Ridge Alpha

In [63]:
r_alpha = np.logspace (0,5,200)

# fits multiple alphas
ridgecv = RidgeCV(alphas = r_alpha, cv = 5)
ridgecv = ridgecv.fit(xtrain_scaled, ytrain)

print('optimal ridge alpha: ', ridgecv.alpha_)
print('best ridge R2: ', ridgecv.score(xtrain_scaled, ytrain))

optimal ridge alpha:  1.0
best ridge R2:  0.956702883342574


### Lasso Alpha

In [64]:
l_alpha = np.arange(0.001,0.15,0.0025)

# fits multiple alphas
lassocv = LassoCV(alphas = l_alpha, cv = 5)
lassocv = lassocv.fit(xtrain_scaled, ytrain)

print('optimal lasso alpha: ', lassocv.alpha_)
print('best lasso R2: ', lassocv.score(xtrain_scaled, ytrain))

optimal lasso alpha:  0.1485
best lasso R2:  0.9566818226056244


### Elastic Net Lambda and Alpha

In [65]:
enet_alpha = np.arange(0, 1, 0.005)
enet_ratio = [.01, .1, .2, .3, .5, .7, .9, .95, .99, 1]

# fits multiple alphas and rhos
enetcv = ElasticNetCV(alphas = enet_alpha, l1_ratio = enet_ratio, cv = 5)
enetcv = enetcv.fit(xtrain_scaled, ytrain)

print('optimal enet alpha: ', enetcv.alpha_)
print('optimal enet lambda: ', enetcv.l1_ratio_)
print('best elastic net R2: ', enetcv.score(xtrain_scaled, ytrain))

optimal enet alpha:  0.295
optimal enet lambda:  1.0
best elastic net R2:  0.9566279353314161


The following models were tested with cross validation on training data:

* Ordinary linear regression
* Ridge regression
* Lasso regression
* Elastic net regression

Based on the results, the elastic net model showed the best performance with a mean r2 score of 0.95, with the regularised models trailing not too far off.

In [66]:
# instantiate models with previously selected hyperparameters
linmod = LinearRegression()
ridge = Ridge(alpha = ridgecv.alpha_)
lasso = Lasso(alpha = lassocv.alpha_)
enet = ElasticNet(alpha = enetcv.alpha_, l1_ratio = enetcv.l1_ratio_)

In [67]:
# define CV function
nfolds = 5 
np.random.seed(100)

def crossval(model, x, y):
    kf = KFold(nfolds, shuffle = True, random_state = 7)
    rmse = np.sqrt(-cross_val_score(model, x, y, cv = kf, scoring = 'neg_mean_squared_error'))
    r2 = cross_val_score(model, x, y, cv = kf)
    return 'mean CV R2:', r2.mean(), \
            'mean CV RMSE:', rmse.mean(), \
            'CV R2 variance:', r2.var(), \
            'CV RMSE variance:', rmse.var()

In [68]:
# baseline model
# use dummy regressor to predict using mean
dummy_regressor = DummyRegressor()
print('Baseline model: \n', crossval(dummy_regressor, xtrain_scaled, ytrain))

# ordinary linear regression
print('SLR: \n', crossval(linmod, xtrain_scaled, ytrain))

# ridge regression
print('RIDGE: \n', crossval(ridge, xtrain_scaled, ytrain))

# lasso regression
print('LASSO: \n', crossval(lasso, xtrain_scaled, ytrain))

# elastic net regression
print('ELASTIC NET: \n', crossval(enet, xtrain_scaled, ytrain))

Baseline model: 
 ('mean CV R2:', -0.005266404790943424, 'mean CV RMSE:', 557.9014992350312, 'CV R2 variance:', 2.4002602106548374e-05, 'CV RMSE variance:', 735.7258484891729)
SLR: 
 ('mean CV R2:', 0.9529147224696508, 'mean CV RMSE:', 119.7645133282335, 'CV R2 variance:', 7.353105953726703e-05, 'CV RMSE variance:', 45.005913437568566)
RIDGE: 
 ('mean CV R2:', 0.9529620857473731, 'mean CV RMSE:', 119.70473455642009, 'CV R2 variance:', 7.250302452173034e-05, 'CV RMSE variance:', 42.31544056217514)
LASSO: 
 ('mean CV R2:', 0.9529499824051308, 'mean CV RMSE:', 119.71280078935527, 'CV R2 variance:', 7.357295650319343e-05, 'CV RMSE variance:', 44.110555704953185)
ELASTIC NET: 
 ('mean CV R2:', 0.9529658682529606, 'mean CV RMSE:', 119.68904218612538, 'CV R2 variance:', 7.361229456538298e-05, 'CV RMSE variance:', 43.54446401619252)


In [81]:
# --------------------------------------- DEFINE ERROR METRICS ---------------------------------------
def rmse(ytest, ypred):
    return np.sqrt(mean_squared_error(ytest, ypred))

# ----------------------------------------- FIT BASELINE MODEL ----------------------------------------
# use dummy regressor to predict using mean
dummy_regressor = DummyRegressor()
baseline_mod = dummy_regressor.fit(xtrain_scaled, ytrain)
baseline_pred = dummy_regressor.predict(xtest_scaled)

print(f'baseline R2: {dummy_regressor.score(xtest_scaled, ytest)}')
print(f'baseline RMSE: {rmse(ytest, baseline_pred)}')

# ------------------------------------------- FIT SLR MODEL -------------------------------------------

# fit model to train data
linmod = linmod.fit(xtrain_scaled, ytrain)
# predict on test data
ypred = linmod.predict(xtest_scaled)
# evaluate model performance
print('linmod test R2: ', linmod.score(xtest_scaled, ytest))
print('linmod test RMSE: ', rmse(ytest, ypred))

# ------------------------------------------ FIT RIDGE MODEL ------------------------------------------

# fit model to train data
ridge_mod = ridge.fit(xtrain_scaled, ytrain)
# predict on test data
ypred = ridge_mod.predict(xtest_scaled)
# evaluate model performance
print('ridge test R2: ', ridge_mod.score(xtest_scaled, ytest))
print('ridge test RMSE: ', rmse(ytest, ypred))

# ------------------------------------------ FIT LASSO MODEL ------------------------------------------

# fit model to train data
lasso_mod = lasso.fit(xtrain_scaled, ytrain)
# predict on test data
ypred = lasso_mod.predict(xtest_scaled)
# evaluate model performance
print('lasso test R2: ', lasso_mod.score(xtest_scaled, ytest))
print('lasso test RMSE: ', rmse(ytest, ypred))

# --------------------------------------- FIT ELASTIC NET MODEL ---------------------------------------
# fit model to train data
enet_mod = enet.fit(xtrain_scaled, ytrain)
# predict on test data
ypred = enet_mod.predict(xtest_scaled)
# evaluate model performance
print('elastic net test R2: ', enet_mod.score(xtest_scaled, ytest))
print('elastic net test RMSE: ', rmse(ytest, ypred))

baseline R2: -0.0007781525622858609
baseline RMSE: 538.8998500383698
linmod test R2:  0.9508377060967533
linmod test RMSE:  119.44149321982387
ridge test R2:  0.9509025108954104
ridge test RMSE:  119.36274451026176
lasso test R2:  0.9509745030419723
lasso test RMSE:  119.27520100516588
elastic net test R2:  0.9510012516198646
elastic net test RMSE:  119.24265796631788


## Model Evaluation
---
### Summary

|Model           |  R2   | RMSE |
|----------------|-------|------|
|Baseline model  |-0.0078|538.90|
|Ordinary linear | 0.9508|119.44|
|Ridge Regression| 0.9509|119.36|
|Lasso Regression| 0.9509|119.28|
|<b>Elastic Net     |<b>0.9510|<b>119.24|</b>

Next, we will use our best model to predict the XPM for Team Spirit based on user input.

In [83]:
team_name = "Tundra Esports" #enter a team name (e.g. PSG.LGD, Evil Geniuses, Team Liquid)

In [84]:
for each_key, _ in train_dummies.items():
    if team_name in each_key:
        team_name_df = train_dummies[train_dummies[f'name_{team_name}'] == 1]

In [85]:
input_team('Team Spirit')

(1, 25)
(1, 1)


2596

In [86]:
custom_test_df = pd.DataFrame(data = dict(zip(train_dummies.keys(), team_name_df.mean())), index = [0])

In [87]:
x_custom = custom_test_df.loc[:, custom_test_df.columns != 'xpm']
y_custom = custom_test_df[['xpm']]

print(x_custom.shape)
print(y_custom.shape)

(1, 25)
(1, 1)


In [88]:
# load the scaler that was saved as a pickle earlier
scaler = pickle.load(open('./scaler.bin', 'rb'))
x_custom_scaled = scaler.transform(x_custom)

In [102]:
ypred = enet_mod.predict(x_custom_scaled)
int(ypred[0])
int(y_custom.xpm[0])

print("The model predicts {}'s XPM to be {}. Actual XPM is {}." .format(team_name,int(ypred[0]), int(y_custom.xpm[0])))

The model predicts Tundra Esports's XPM to be 2777. Actual XPM is 2775


Based on the above, we can see that the model's prediction is extremely accurate, with a delta of just 2 between the predicted XPM and actual XPM.

# Conclusion and Limitations

The elastic net regression model proved to be the best predictive performance on XPM (experience gained/min), and outperformed the other linear models tested. As a regularised regression method, it was able to reveal which features influenced said feature the most. Features such as `levels`, `kills`, `assists` and `net` are the most important factors affecting the experience gained per minute. 

In the near future, perhaps the idea of parsing stats from live matches could be explored to determine a team's chance at winning based on the current XPM of all the members of the team. 

Limitations of this project are firstly, survivorship bias. As the matches spanned between 2012 and 2022, the success that the previous teams have achieved in the former year may not be a good representation of the games played currently, or to be played in the future. Secondly, due to time constraints for this project, fields/columns such as `items` were not explored, which could have also played a part in affecting XPM. Lastly, due to the COVID-19 pandemic, teams were stuck playing in their own region for a year, which if were not the case then would have provided more invaluable data, and in which case other indicators of a winning team can be considered.

In reality however, Dota is a game that can swing in anyone's favor - a slip-up from the leading team could result in a swift comeback, and is also dependent on the player's psyche, performance and psychological factors in any match. That is not even taking into account the possibility of match-fixing. Therefore this may never be the perfect model as these information may never be found in a dataset, however the aim of this model is to provide a guideline to our stakeholders to make an informed decision in finding the ideal team to sponsor.

