# Your Title Here

**Name(s)**: (your name(s) here)

**Website Link**: (your website link)

## Code

In [23]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

### Cleaning and EDA

In [24]:
# Reading the data
from tqdm.notebook import tqdm

pd.set_option('display.max_columns', None)

directory = r'data'
# Only these columns are needed downstream; restricting them keeps the load light.
usecols = ['gameid','datacompleteness', 'league', 'year', 'date', 'patch', 
           'side', 'position', 'playername', 'teamname', 'champion', 'gamelength', 'result',
           'barons', 'opp_barons', 'dragons', 'elders', 'opp_elders']

# Load every CSV in `directory` (presumably one file per season - TODO confirm).
# - sorted(): os.listdir returns entries in arbitrary order, so sort for a reproducible row order.
# - .csv filter: stray entries such as .ipynb_checkpoints would otherwise be passed to read_csv.
csv_files = sorted(name for name in os.listdir(directory) if name.lower().endswith('.csv'))

# Read all files first and concatenate once; calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
frames = [
    pd.read_csv(os.path.join(directory, filename), usecols=usecols)
    for filename in tqdm(csv_files)
]

# The per-file row index is kept as-is (no ignore_index), so index labels repeat across files.
df = pd.concat(frames)

  0%|          | 0/10 [00:00<?, ?it/s]

In [25]:
# Data Cleaning
# Games on or after this date are removed (the current league is still ongoing).
time_cutoff = pd.to_datetime('2023-10-01')

# Built as a single chain so that no column is assigned onto a filtered slice,
# which is what can trigger pandas' SettingWithCopyWarning.
df = (
    df
    # Convert date to datetime
    .assign(date=lambda d: pd.to_datetime(d['date']))
    .loc[lambda d: d['date'] < time_cutoff]
    # Human-readable label for the 0/1 result column
    .assign(**{'Win?': lambda d: d['result'].map({0: 'Loss', 1: 'Win'})})
    # Filter out incomplete data; the flag column is not needed afterwards
    .loc[lambda d: d['datacompleteness'] == 'complete']
    .drop(columns=['datacompleteness'])
)

# Team-level summary rows: the player-specific columns carry no information there.
df_teams = (
    df.query('position == "team"')
    .drop(columns=['gameid','position', 'playername', 'champion'])
)

# Player-level rows. Missing player names are filled with 'unknown player',
# the sentinel that the player summary cell filters on.
df_players = (
    df.query('position != "team"')
    .assign(playername=lambda d: d['playername'].fillna('unknown player'))
)


In [26]:
# Render the first rows of df_teams as a Markdown table; print() is used so the
# table's line breaks are shown as text rather than as an escaped string repr.
print(df_teams.head().to_markdown())

|    | league   |   year | date                |   patch | side   | teamname          |   gamelength |   result |   dragons |   elders |   opp_elders |   barons |   opp_barons | Win?   |
|---:|:---------|-------:|:--------------------|--------:|:-------|:------------------|-------------:|---------:|----------:|---------:|-------------:|---------:|-------------:|:-------|
| 10 | EU LCS   |   2014 | 2014-01-14 17:52:02 |    3.15 | Blue   | Fnatic            |         1924 |        1 |         0 |        0 |            0 |        1 |            0 | Win    |
| 11 | EU LCS   |   2014 | 2014-01-14 17:52:02 |    3.15 | Red    | Gambit Gaming     |         1924 |        0 |         0 |        0 |            0 |        0 |            1 | Loss   |
| 22 | EU LCS   |   2014 | 2014-01-14 19:16:29 |    3.15 | Blue   | Copenhagen Wolves |         2474 |        1 |         0 |        0 |            0 |        1 |            0 | Win    |
| 23 | EU LCS   |   2014 | 2014-01-14 19:16:29 |    3.15 | Red   

In [27]:
# Write the Markdown preview of df_teams to df_teams.md in the working directory.
# NOTE(review): a later cell writes the same table to assets/df_teams.md - confirm this copy is still needed.
df_teams.head().to_markdown('df_teams.md')

In [28]:
# Fraction of team rows per league whose dragon count is missing, lowest first.
missing_dragons_by_league = (
    df_teams
    .assign(missing_dragons=lambda teams: teams['dragons'].isna())
    .groupby('league')['missing_dragons']
    .mean()
    .to_frame()
    .sort_values('missing_dragons')
)
missing_dragons_by_league

Unnamed: 0_level_0,missing_dragons
league,Unnamed: 1_level_1
AL,0.000000
NASG,0.000000
NACL,0.000000
NA LCS,0.000000
NA CS,0.000000
...,...
EUM,0.000000
IC,0.000000
WLDs,0.000000
OTBLX,0.122222


In [49]:
# Save the Markdown preview of df_teams for the website.
# to_markdown returns None when it is given a target path, so wrapping the call
# in print() only ever printed "None".
df_teams.head().to_markdown('assets/df_teams.md')

None


In [50]:
# Write the Markdown preview of the first df_teams rows to assets/df_teams.md.
df_teams.head().to_markdown('assets/df_teams.md')

In [51]:
# Win percentage for each side of the map.
univ = df_teams.groupby('side')['result'].mean().mul(100)

fig = px.pie(
    univ,
    values='result',
    names=univ.index,
    title='Win Rate by Side',
    template='plotly_dark',
)
fig.update_layout(showlegend=False, hovermode='x')
# Put the label and share inside each slice; the hover text shows the winrate to two decimals.
fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    hovertemplate='%{label} side winrate: %{value:.2f}%',
)
fig.write_html('assets/univariate.html', include_plotlyjs='cdn')
fig.show()

In [52]:
def convert_minutes(s):
    """Format a duration in whole seconds as a 'M:SS' string.

    Parameters
    ----------
    s : int
        Duration in seconds. Must be an integer type, because the seconds
        part is rendered with the ``02d`` format code.

    Returns
    -------
    str
        Minutes (not zero-padded), a colon, then seconds zero-padded to two digits.
    """
    minutes, seconds = divmod(s, 60)
    return f'{minutes}:{seconds:02d}'

In [96]:
# Split the gamelength column into 3 bins
gamelength_bins = pd.cut(df_teams['gamelength'], bins=[900, 1800, 2700, np.inf], labels=['15-30 Minutes', '31-45 Minutes', '45+ Minutes'])

# Winrate (in %) for each game-length bin and side, rounded to two decimals.
# observed=False is spelled out so every bin category is kept regardless of the pandas default.
wr_by_side = (
    df_teams.assign(gamelength_bins=gamelength_bins)
    .groupby(['gamelength_bins', 'side'], observed=False)['result']
    .mean()
    .mul(100)
    .round(2)
    .unstack('side')
)
# Blue-minus-Red winrate, in percentage points, per bin.
wr_length = (wr_by_side['Blue'] - wr_by_side['Red']).rename('blue_wr_advantage')

# Number of games per bin for the hover text.
# - df_teams holds one row per side of every game, so only Blue-side rows are
#   counted; counting all rows would report twice the number of games.
# - value_counts() orders by frequency, so the counts are re-ordered to follow
#   the bars (wr_length.index); otherwise each bar gets another bin's numbers.
is_blue = (df_teams['side'] == 'Blue').to_numpy()
games_per_bin = gamelength_bins[is_blue].value_counts().reindex(wr_length.index)
customdata = np.stack([games_per_bin, games_per_bin / games_per_bin.sum() * 100], axis=-1)

fig = px.bar(wr_length, title='Blue Side Winrate Advantage by Game Length', template='plotly_dark')
# Change it to categorical
fig.update_xaxes(type='category', title='Game Length')
fig.update_yaxes(title='Blue Side Winrate % Difference')
fig.update_layout(hovermode='x')
# Update the hover label text to show the bin, the winrate advantage and the game counts
fig.update_traces(customdata=customdata, 
                  hovertemplate='Length: %{x}<br>Blue Winrate Advantage: %{y}%<br>%{customdata[0]:,} Games<br>(%{customdata[1]:.2f}% of all Games)')
# Add a text above each bar stating the winrate advantage of that bin
fig.update_layout(showlegend=False, annotations=[
    dict(
        x=length_bin,
        y=advantage,
        text=f"{advantage:.2f}% Higher Winrate",
        showarrow=False,
        font=dict(size=12, color='white'),
        xanchor='center',
        yanchor='bottom'
    )
    for length_bin, advantage in zip(wr_length.index, wr_length.values)
])
fig.write_html('assets/bivariate.html', include_plotlyjs='cdn')
fig.show()
print(wr_length)

gamelength_bins
15-30 Minutes    10.49
31-45 Minutes     4.34
45+ Minutes       0.72
Name: blue_wr_advantage, dtype: float64


In [120]:
# Game length in seconds with everything above 2700 s (45 minutes) capped at 2700.
df_teams['gamelength'].clip(upper=2700)

10        1924
11        1924
22        2474
23        2474
34        2629
          ... 
122615    2571
122626    1773
122627    1773
122638    1550
122639    1550
Name: gamelength, Length: 120544, dtype: int64

In [122]:
# Game length in seconds, capped at 2700 s (45 minutes) so longer games share one value.
gamelength_capped = df_teams['gamelength'].clip(upper=2700)

In [140]:
# List the columns available on the player-level frame.
df_players.columns

Index(['gameid', 'league', 'year', 'date', 'patch', 'side', 'position',
       'playername', 'teamname', 'champion', 'gamelength', 'result', 'dragons',
       'elders', 'opp_elders', 'barons', 'opp_barons', 'Win?'],
      dtype='object')

In [None]:
def most_freq(df):
    """Return the most frequent value of a Series.

    Missing values are replaced by the string 'unknown' before counting, so
    'unknown' is returned when missing values are the most common entry.

    Parameters
    ----------
    df : pandas.Series
        Values to count (despite the parameter name, a Series is expected).

    Returns
    -------
    The value with the highest count.

    Raises
    ------
    IndexError
        If the Series is empty.
    """
    counts = df.fillna('unknown').value_counts()
    # value_counts orders by descending frequency, so the first label is the mode.
    return counts.index[0]

# Per-player summary (games played, usual team/role/champion, winrate),
# showing the 20 players with the most games.
named_players = df_players.query('playername != "unknown player"')
player_summary = named_players.groupby('playername').agg({
    'gameid': 'count',
    'teamname': most_freq,
    'position': most_freq,
    'champion': most_freq,
    'result': lambda x: f"{x.mean()*100:.2f}%",
})
column_labels = {'gameid': 'Games Played', 'teamname': "Team",'result': "Winrate", 'position': "Main role", 'champion': 'Most frequent champion'}
# Sort on the raw count column before it is renamed for display.
player_summary.sort_values("gameid", ascending=False).rename(columns=column_labels).head(20)

In [123]:
# Winrate of each side for every whole minute of (capped) game length.
minutes_played = gamelength_capped // 60
winrate_by_minute = (
    df_teams
    .assign(minutes=minutes_played)
    .groupby(['minutes', 'side'])[['result']]
    .mean()
)
winrate_by_minute

Unnamed: 0_level_0,Unnamed: 1_level_0,result
minutes,side,Unnamed: 2_level_1
13,Blue,1.000000
13,Red,0.000000
15,Blue,0.588235
15,Red,0.352941
16,Blue,0.636364
...,...,...
43,Red,0.495674
44,Blue,0.503682
44,Red,0.496318
45,Blue,0.503434


In [54]:
# Objective counts per team row, together with the game-length bin of the game.
objective_cols = ['side', 'result', 'patch', 'elders', 'barons', 'opp_elders', 'opp_barons']
df_obj_by_gl = df_teams[objective_cols].assign(gamelength_bins=gamelength_bins)

# Advantage = own objective count minus the opponent's.
df_obj_by_gl = df_obj_by_gl.assign(**{
    'elder advantage': df_obj_by_gl['elders'] - df_obj_by_gl['opp_elders'],
    'baron advantage': df_obj_by_gl['barons'] - df_obj_by_gl['opp_barons'],
})

# Right-closed edges placed just below each whole number, so a negative
# advantage, 0, 1 and 2-or-more each land in their own bin.
advantage_edges = [-np.inf, -0.01, 0.99, 1.99, np.inf]
baron_bins = pd.cut(
    df_obj_by_gl['baron advantage'],
    bins=advantage_edges,
    labels=['Less Barons', 'Same Barons', '1 More Baron', '2+ More Barons'],
    right=True,
)
elder_bins = pd.cut(
    df_obj_by_gl['elder advantage'],
    bins=advantage_edges,
    labels=['Less Elders', 'Same Elders', '1 More Elder', '2+ More Elders'],
    right=True,
)

# Attach the bins, drop every row that has any missing value, and keep only the analysis columns.
final_cols = ['side', 'result', 'patch', 'gamelength_bins', 'baron advantage','baron_bins', 'elder advantage', 'elder_bins']
df_obj_by_gl = (
    df_obj_by_gl
    .assign(baron_bins=baron_bins, elder_bins=elder_bins)
    .dropna(axis=0, how='any')
    .loc[:, final_cols]
)

# Winrate per game-length bin, one column per side.
winrate_by_length_side = df_obj_by_gl.groupby(['gamelength_bins', 'side'])[['result']].mean()
winrate_by_length_side.unstack()

Unnamed: 0_level_0,result,result
side,Blue,Red
gamelength_bins,Unnamed: 1_level_2,Unnamed: 2_level_2
15-30 Minutes,0.552209,0.447518
31-45 Minutes,0.521526,0.478474
45+ Minutes,0.501235,0.498765


In [55]:
# Inspect the rows ordered from the largest elder deficit to the largest elder lead.
df_obj_by_gl.sort_values('elder advantage')

Unnamed: 0,side,result,patch,gamelength_bins,baron advantage,baron_bins,elder advantage,elder_bins
1534,Blue,1,8.01,45+ Minutes,-3.0,Less Barons,-4.0,Less Elders
39191,Red,1,6.13,45+ Minutes,3.0,2+ More Barons,-4.0,Less Elders
7078,Blue,0,8.02,45+ Minutes,-4.0,Less Barons,-4.0,Less Elders
16918,Blue,0,7.04,45+ Minutes,-5.0,Less Barons,-4.0,Less Elders
12959,Red,0,7.03,45+ Minutes,-1.0,Less Barons,-3.0,Less Elders
...,...,...,...,...,...,...,...,...
80686,Blue,1,9.15,45+ Minutes,1.0,1 More Baron,3.0,2+ More Elders
16919,Red,1,7.04,45+ Minutes,5.0,2+ More Barons,4.0,2+ More Elders
1535,Red,0,8.01,45+ Minutes,3.0,2+ More Barons,4.0,2+ More Elders
7079,Red,1,8.02,45+ Minutes,4.0,2+ More Barons,4.0,2+ More Elders


In [56]:
# Number of rows of df_obj_by_gl in each game-length bin.
df_obj_by_gl['gamelength_bins'].value_counts()

gamelength_bins
31-45 Minutes    70332
15-30 Minutes    43958
45+ Minutes       5670
Name: count, dtype: int64

In [57]:
# Average baron / elder advantage held by the winning team, per game-length bin.
winner_rows = df_obj_by_gl.query('result == 1')
winner_advantage = (
    winner_rows
    .groupby(['gamelength_bins'])[['baron advantage', 'elder advantage']]
    .mean()
)
winner_advantage.rename(
    columns={
        'baron advantage': 'Winner baron advantage',
        'elder advantage': 'Winner elder advantage',
    }
)

Unnamed: 0_level_0,Winner baron advantage,Winner elder advantage
gamelength_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
15-30 Minutes,0.885268,0.002594
31-45 Minutes,0.974691,0.103822
45+ Minutes,0.672663,0.26455


In [58]:
# Winrate (%) per baron-advantage bin, one column per side, rounded to two decimals.
baron_winrate = (
    df_obj_by_gl
    .groupby(['baron_bins', 'side'])[['result']]
    .mean()
    .unstack()
    .rename(columns={'result': 'Winrate'})
)
baron_winrate.mul(100).round(2)

Unnamed: 0_level_0,Winrate,Winrate
side,Blue,Red
baron_bins,Unnamed: 1_level_2,Unnamed: 2_level_2
Less Barons,9.56,7.18
Same Barons,57.86,42.09
1 More Baron,92.81,89.79
2+ More Barons,92.82,91.95


In [59]:
# Winrate (%) per elder-advantage bin, one column per side, rounded to two decimals.
elder_winrate = (
    df_obj_by_gl
    .groupby(['elder_bins', 'side'])[['result']]
    .mean()
    .unstack()
    .rename(columns={'result': 'Winrate'})
)
elder_winrate.mul(100).round(2)

Unnamed: 0_level_0,Winrate,Winrate
side,Blue,Red
elder_bins,Unnamed: 1_level_2,Unnamed: 2_level_2
Less Elders,17.97,19.51
Same Elders,53.76,46.23
1 More Elder,80.33,82.01
2+ More Elders,82.53,82.26


### Assessment of Missingness

In [60]:
# Count the missing values in every column of df_teams.
df_teams.isna().sum()

league               0
year                 0
date                 0
patch              358
side                 0
gamelength           0
result               0
dragons             68
elders              68
opp_elders          68
barons             156
opp_barons         156
Win?                 0
missing_dragons      0
dtype: int64

Unnamed: 0,league,year,date,patch,side,gamelength,result,dragons,elders,opp_elders,barons,opp_barons,Win?,missing_dragons
154,NA LCS,2014,2014-01-17 20:04:22,,Blue,2848,1,0.0,0.0,0.0,3.0,0.0,Win,False
155,NA LCS,2014,2014-01-17 20:04:22,,Red,2848,0,0.0,0.0,0.0,0.0,3.0,Loss,False
166,NA LCS,2014,2014-01-17 21:14:53,,Blue,1815,0,0.0,0.0,0.0,0.0,1.0,Loss,False
167,NA LCS,2014,2014-01-17 21:14:53,,Red,1815,1,0.0,0.0,0.0,1.0,0.0,Win,False
178,NA LCS,2014,2014-01-17 22:12:45,,Blue,2297,1,0.0,0.0,0.0,1.0,1.0,Win,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2267,EU CS,2014,2014-03-12 22:09:11,,Red,3361,0,0.0,0.0,0.0,1.0,0.0,Loss,False
2278,NA CS,2014,2014-03-14 00:45:31,,Blue,2322,0,0.0,0.0,0.0,0.0,2.0,Loss,False
2279,NA CS,2014,2014-03-14 00:45:31,,Red,2322,1,0.0,0.0,0.0,2.0,0.0,Win,False
2290,NA CS,2014,2014-03-14 01:50:19,,Blue,1912,1,0.0,0.0,0.0,1.0,0.0,Win,False


### Hypothesis Testing

In [31]:
# Number of distinct game ids in the cleaned data.
df['gameid'].nunique()

60266

0.5320546854260685

In [84]:
# Test statistic: Mean Difference (MD) in Winrate between Blue and Red side 
# = Blue side winrate - Red side winrate = Blue side winrate - (1 - Blue side winrate) = 2 * Blue side winrate - 1 
# Null Hypothesis: MD <= 0: Blue side does not have an advantage over red side
# Alternative Hypothesis: MD > 0: Blue side wins more often than red side

# # of simulations: 100,000
# Significance level: 0.01

number_games = df['gameid'].nunique()
observed_md = (df_teams.groupby('side')['result'].mean()['Blue']) * 2 - 1

# Seeded generator so the simulated distribution (and the saved figure built
# from it) is identical on every Restart & Run All.
rng = np.random.default_rng(42)
n_simulations = 100_000

# Under the null each game is a fair coin flip, so the number of Blue wins in
# `number_games` games is Binomial(number_games, 0.5).
simulated_blue_wins = rng.binomial(number_games, 0.5, size=n_simulations)
# Convert simulated Blue winrates to the same MD statistic as observed_md.
random_generations = simulated_blue_wins / number_games * 2 - 1
# One-sided p-value: share of simulations at least as extreme as the observation.
p_value_overall = (random_generations >= observed_md).mean()

print(f"Observed statistic: {observed_md} \np-value: {p_value_overall}")

# A printed p-value of 0.0 means none of the 100,000 simulated statistics reached
# the observed one, i.e. p < 1/100,000. That is below the 0.01 significance level,
# so we reject the null hypothesis that blue side does not have an advantage over red side.


Observed statistic: 0.06410937085213697 
p-value: 0.0


In [90]:
# Distribution of the simulated statistic under the null hypothesis.
fig = px.histogram(random_generations, title='Empirical Distribution of MD', template='plotly_dark', nbins=20, labels={'value': 'Blue Advantage'})
# Normalise the bar heights to a probability density
fig.update_traces(histnorm='probability density')
# Label the y-axis to match that normalisation (the heights are densities, not percentages)
fig.update_yaxes(title='Probability Density')
# Plot the observed blue advantage as a vertical line
fig.add_vline(x=observed_md, line_width=3, line_color='red', annotation_text=f'Observed Blue Advantage = {observed_md:.3f}   ', annotation_position='top left')
fig.write_html('assets/MD_blue_adv.html', include_plotlyjs='cdn')
fig.show()
