# Your Title Here

**Name(s)**: (your name(s) here)

**Website Link**: (your website link)

## Code

In [2]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

### Cleaning and EDA

In [3]:
# Reading the data
fp = r"data\2014_LoL_esports_match_data_from_OraclesElixir.csv"
df = pd.read_csv(fp, index_col=0)
from tqdm.notebook import tqdm
import os

pd.set_option('display.max_columns', None)

directory = r'data'
usecols = ['gameid','datacompleteness', 'league', 'year', 'date', 'patch', 
           'side', 'position', 'playername', 'teamname', 'champion', 'gamelength', 'result',
           'barons', 'opp_barons', 'dragons', 'elders', 'opp_elders']

df = pd.DataFrame()
for filename in tqdm(os.listdir(directory)):
    new_df = pd.read_csv(os.path.join(directory, filename), usecols=usecols)
    df = pd.concat([df, new_df])

  0%|          | 0/10 [00:00<?, ?it/s]

In [4]:
# Data Cleaning
# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

# Remove games after October 2023 (the current league is still ongoing)
time_cutoff = pd.to_datetime('2023-10-01')

df = df[df['date'] < time_cutoff]
df['Win?'] = df['result'].map({0: 'Loss', 1: 'Win'})

# Filter out incomplete data, and fill in NA player names with 'unknown player'
df = df[df['datacompleteness'] == 'complete']
df = df.drop(columns=['datacompleteness'])
df_teams = df.query('position == "team"')
df_teams = df_teams.drop(columns=['gameid','position', 'playername', 'champion'])
df_players = df.query('position != "team"')


In [5]:
print(df_teams.head().to_markdown())

|    | league   |   year | date                |   patch | side   | teamname          |   gamelength |   result |   dragons |   elders |   opp_elders |   barons |   opp_barons | Win?   |
|---:|:---------|-------:|:--------------------|--------:|:-------|:------------------|-------------:|---------:|----------:|---------:|-------------:|---------:|-------------:|:-------|
| 10 | EU LCS   |   2014 | 2014-01-14 17:52:02 |    3.15 | Blue   | Fnatic            |         1924 |        1 |         0 |        0 |            0 |        1 |            0 | Win    |
| 11 | EU LCS   |   2014 | 2014-01-14 17:52:02 |    3.15 | Red    | Gambit Gaming     |         1924 |        0 |         0 |        0 |            0 |        0 |            1 | Loss   |
| 22 | EU LCS   |   2014 | 2014-01-14 19:16:29 |    3.15 | Blue   | Copenhagen Wolves |         2474 |        1 |         0 |        0 |            0 |        1 |            0 | Win    |
| 23 | EU LCS   |   2014 | 2014-01-14 19:16:29 |    3.15 | Red   

In [10]:
univ = df_teams.groupby('side')['result'].mean()*100
fig = px.pie(univ, values='result', names=univ.index, title='Win Rate by Side', template='plotly_dark')
fig.update_layout(showlegend=False, hovermode='x')
fig.update_traces(textposition='inside', textinfo='percent+label', hovertemplate='%{label} side winrate: %{value:.2f}%')
fig.write_html('assets/univariate.html', include_plotlyjs='cdn')
fig.show()

In [11]:
def convert_minutes(s):
    m = s // 60
    s = s % 60
    return f'{m}:{s:02d}'

In [12]:
# Split the gamelength column into 3 bins
gamelength_bins = pd.cut(df_teams['gamelength'], bins=[900, 1800, 2700, np.inf], labels=['15-30 Minutes', '31-45 Minutes', '45+ Minutes'])

wr_length = (df_teams.assign(gamelength_bins=gamelength_bins)
 .groupby(["gamelength_bins", 'side'])
 .agg({'result': lambda x: np.round(x.mean()*100,2)})
 .unstack()
 .assign(blue_wr_advantage = lambda x: (x['result']['Blue'] - x['result']['Red']))
 ['blue_wr_advantage']
 )

customdata = np.stack([gamelength_bins.value_counts(), gamelength_bins.value_counts(normalize=True)*100], axis=-1)
fig = px.bar(wr_length, title='Blue Side Winrate Advantage by Game Length', template='plotly_dark')

# Change x-axis to categorical
fig.update_xaxes(type='category', title='Game Length')
fig.update_yaxes(title='Blue Side Winrate % Difference')
fig.update_layout(hovermode='x')

# Update the hover label text to show the year, and the winrate advantage
fig.update_traces(customdata=customdata, 
                  hovertemplate='Length: %{x}<br>Blue Winrate Advantage: %{y}%<br>%{customdata[0]:,} Games<br>(%{customdata[1]:.2f}% of all Games)')

# Add a text above each bar with the number of games in that bin, as stored in customdata
fig.update_layout(showlegend=False, annotations=[
    dict(
        x=bin,
        y=advantage,
        text=f"{advantage:.2f}% Higher Winrate",
        showarrow=False,
        font=dict(size=12, color='white'),
        xanchor='center',
        yanchor='bottom'
    )
    for bin, advantage, games in zip(wr_length.index, wr_length.values, customdata[:, 0])
])

fig.write_html('assets/bivariate.html', include_plotlyjs='cdn')
fig.show()
print(wr_length)
wr_length = []

gamelength_bins
15-30 Minutes    10.49
31-45 Minutes     4.34
45+ Minutes       0.72
Name: blue_wr_advantage, dtype: float64


In [18]:
df_obj_by_gl = (df_teams[['side', 'result', 'patch', 'elders', 'barons', 'opp_elders', 'opp_barons']]
 .assign(gamelength_bins=gamelength_bins)
)

df_obj_by_gl['elder advantage'] = df_obj_by_gl['elders'] - df_obj_by_gl['opp_elders']
df_obj_by_gl['baron advantage'] = df_obj_by_gl['barons'] - df_obj_by_gl['opp_barons']

baron_bins = pd.cut(df_obj_by_gl['baron advantage'], bins=[-np.inf, -0.01, 0.99, 1.99, np.inf], labels=['Less Barons', 'Same Barons', '1 More Baron', '2+ More Barons'], right=True)
elder_bins = pd.cut(df_obj_by_gl['elder advantage'], bins=[-np.inf, -0.01, 0.99, 1.99, np.inf], labels=['Less Elders', 'Same Elders', '1 More Elder', '2+ More Elders'], right=True)
df_obj_by_gl['baron_bins'] = baron_bins
df_obj_by_gl['elder_bins'] = elder_bins
df_obj_by_gl = df_obj_by_gl.dropna(axis=0, how='any')
df_obj_by_gl = df_obj_by_gl[['side', 'result', 'patch', 'gamelength_bins', 'baron advantage','baron_bins', 'elder advantage', 'elder_bins']]


(df_obj_by_gl
 .groupby(['gamelength_bins', 'side'])
 [['result']].mean().unstack()
)

Unnamed: 0_level_0,result,result
side,Blue,Red
gamelength_bins,Unnamed: 1_level_2,Unnamed: 2_level_2
15-30 Minutes,0.552209,0.447518
31-45 Minutes,0.521526,0.478474
45+ Minutes,0.501235,0.498765


In [19]:
df_obj_by_gl.sort_values('elder advantage')

Unnamed: 0,side,result,patch,gamelength_bins,baron advantage,baron_bins,elder advantage,elder_bins
1534,Blue,1,8.01,45+ Minutes,-3.0,Less Barons,-4.0,Less Elders
39191,Red,1,6.13,45+ Minutes,3.0,2+ More Barons,-4.0,Less Elders
7078,Blue,0,8.02,45+ Minutes,-4.0,Less Barons,-4.0,Less Elders
16918,Blue,0,7.04,45+ Minutes,-5.0,Less Barons,-4.0,Less Elders
12959,Red,0,7.03,45+ Minutes,-1.0,Less Barons,-3.0,Less Elders
...,...,...,...,...,...,...,...,...
80686,Blue,1,9.15,45+ Minutes,1.0,1 More Baron,3.0,2+ More Elders
16919,Red,1,7.04,45+ Minutes,5.0,2+ More Barons,4.0,2+ More Elders
1535,Red,0,8.01,45+ Minutes,3.0,2+ More Barons,4.0,2+ More Elders
7079,Red,1,8.02,45+ Minutes,4.0,2+ More Barons,4.0,2+ More Elders


In [20]:
df_obj_by_gl['gamelength_bins'].value_counts()

gamelength_bins
31-45 Minutes    70332
15-30 Minutes    43958
45+ Minutes       5670
Name: count, dtype: int64

In [21]:
# Elder advantage by game length
(df_obj_by_gl.query('result == 1')
 .groupby(['gamelength_bins'])
 [['baron advantage', 'elder advantage']].mean()
 .rename(columns={'baron advantage': 'Winner baron advantage', 'elder advantage': 'Winner elder advantage'})
)

Unnamed: 0_level_0,Winner baron advantage,Winner elder advantage
gamelength_bins,Unnamed: 1_level_1,Unnamed: 2_level_1
15-30 Minutes,0.885268,0.002594
31-45 Minutes,0.974691,0.103822
45+ Minutes,0.672663,0.26455


In [22]:
(df_obj_by_gl
 .groupby(['baron_bins', 'side'])
 [['result']].mean().unstack()
 .rename(columns={'result': 'Winrate'}) * 100
).round(2)

Unnamed: 0_level_0,Winrate,Winrate
side,Blue,Red
baron_bins,Unnamed: 1_level_2,Unnamed: 2_level_2
Less Barons,9.56,7.18
Same Barons,57.86,42.09
1 More Baron,92.81,89.79
2+ More Barons,92.82,91.95


In [23]:
(df_obj_by_gl
 .groupby(['elder_bins', 'side'])
 [['result']].mean().unstack()
  .rename(columns={'result': 'Winrate'}) * 100
).round(2)

Unnamed: 0_level_0,Winrate,Winrate
side,Blue,Red
elder_bins,Unnamed: 1_level_2,Unnamed: 2_level_2
Less Elders,17.97,19.51
Same Elders,53.76,46.23
1 More Elder,80.33,82.01
2+ More Elders,82.53,82.26


### Assessment of Missingness

In [111]:
df_players['player_missing'] = df_players['playername'].isna()

# Missingness test for playerid by league
n_repetitions = 500
shuffled = df_players.copy()[['league', 'player_missing']]

new = (df_players.pivot_table(index='league', columns='player_missing', aggfunc='size').apply(lambda x: x / x.sum())).fillna(0)
observed_tvd_league = new.diff(axis=1).iloc[:, -1].abs().sum() / 2

tvds_league = []

for i in tqdm(range(n_repetitions)):
    shuffled['league'] = np.random.permutation(shuffled['league'])
    
    pivoted = (
        shuffled
        .pivot_table(index='league', columns='player_missing', aggfunc='size')
        .apply(lambda x: x / x.sum())
    )

    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds_league.append(tvd)

p_val_league = np.mean(np.array(tvds_league) >= observed_tvd_league)
print(f'p-value for league: {p_val_league}\nobserved tvd: {observed_tvd_league}')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



  0%|          | 0/500 [00:00<?, ?it/s]

0.0

In [139]:
fig = px.histogram(tvds_league, title='Empirical Distribution of TVD (By League)', template='plotly_dark', nbins=20)

# Density plot
fig.update_traces(histnorm='probability')

# Change y-axis to show percentage
fig.update_yaxes(title='Probability')
fig.update_layout(showlegend=False)

# Plot the observed difference as a vertical line
fig.add_vline(x=observed_tvd_league, line_width=3, line_color='red', 
              annotation_text=f'Observed Rift Difference = {observed_tvd_league:.3f}   ', 
              annotation_position='left')

fig.write_html('assets/missingness_league.html', include_plotlyjs='cdn')
fig.show()


In [155]:
# Missingness test for playerid by position
shuffled = df_players.copy()[['side', 'player_missing']]
new = (df_players.pivot_table(index='side', columns='player_missing', aggfunc='size').apply(lambda x: x / x.sum())).fillna(0)
observed_tvd_side = new.diff(axis=1).iloc[:, -1].abs().sum() / 2
tvds_side = []

for i in tqdm(range(n_repetitions)):
    shuffled['side'] = np.random.permutation(shuffled['side'])
    
    pivoted = (
        shuffled
        .pivot_table(index='side', columns='player_missing', aggfunc='size')
        .apply(lambda x: x / x.sum())
    )

    tvd = pivoted.diff(axis=1).iloc[:, -1].abs().sum() / 2
    tvds_side.append(tvd)

p_val_side = np.mean(np.array(tvds_side) >= observed_tvd_side)
print(f'p-value for league: {p_val_side}\nobserved tvd: {observed_tvd_side}')

  0%|          | 0/500 [00:00<?, ?it/s]

p-value for league: 0.716
observed tvd: 0.1250016591672308


In [157]:
fig = px.histogram(tvds_side, title='Empirical Distribution of TVD (By Side)', template='plotly_dark', nbins=20)

# Density plot
fig.update_traces(histnorm='probability')

# Change y-axis to show percentage
fig.update_yaxes(title='Probability')
fig.update_layout(showlegend=False)

# Plot the observed difference as a vertical line
fig.add_vline(x=observed_tvd_side, line_width=3, line_color='red', 
              annotation_text=f'Observed Difference = {observed_tvd_side:.3f}   ', 
              annotation_position='left')

fig.write_html('assets/missingness_position.html', include_plotlyjs='cdn')
fig.show()


### Hypothesis Testing

In [25]:
# Test statistic: Mean Difference (MD) in Winrate between Blue and Red side 
# = Blue side winrate - Red side winrate = Blue side winrate - (1 - Blue side winrate) = 2 * Blue side winrate - 1 
# Null Hypothesis: MD <= 0: Blue side does not have an advantage over red side
# Alternative Hypothesis: MD > 0: Blue side wins more often than red side

# # of simulations: 100,000
# Significance level: 0.01

number_games = df['gameid'].nunique()
observed_md = (df_teams.groupby('side')['result'].mean()['Blue']) * 2 - 1

random_generations = np.random.multinomial(number_games, [0.5, 0.5], size=100_000)[:, 0] / number_games
random_generations = random_generations * 2 - 1
p_value_overall = (random_generations >= observed_md).mean()

print(f"Observed statistic: {observed_md} \np-value: {p_value_overall}")



Observed statistic: 0.06410937085213697 
p-value: 0.0


In [132]:
fig = px.histogram(random_generations, title='Empirical Distribution of MD (Advantage)', template='plotly_dark', nbins=20, labels={'value': 'Blue Advantage'})

# Density plot
fig.update_traces(histnorm='probability')

# Change y-axis to show percentage
fig.update_yaxes(title='Probability')

# Plot the observed blue advantage as a vertical line
fig.add_vline(x=observed_md, line_width=3, line_color='red', annotation_text=f'Observed Blue Advantage = {observed_md:.3f}   ', annotation_position='left')
fig.update_layout(showlegend=False)
fig.write_html('assets/MD_blue_adv.html', include_plotlyjs='cdn')
fig.show()


### Hypothesis test 2:
#### Rift Herald Win Rate

In [63]:
# For this question, we are looking at if the introduction of the rift herald in patch 5.22 lowered the blue side winrate.
# Which is the reason why it was added to the game, we will check if it succeded in this goal
# Null hypothesis: the blue side winrate distribution is the same before and after the rift herald was introduced
# Alternative hypothesis: the blue side winrate is lower after the rift herald was introduced

rift_patch = 5.22
only_results = df[df.get('champion').isna() == True][['patch', 'side', 'result']]
only_results_blue = only_results[only_results.get('side') == 'Blue']
new = only_results_blue.copy()
new['is_before_rift_herald'] = new['patch'].apply(lambda x: x < rift_patch)

g = new.groupby(['is_before_rift_herald', 'side'])['result'].mean()
bluewr_b4_rift, bluewr_after_rift = g[1], g[0]
perms = []
obs_rift_diff = bluewr_after_rift - bluewr_b4_rift
obs_rift_diff

-0.027001964909418752

In [68]:
for i in tqdm(range(10000)):
    # Shuffle the is_before_rift_herald column
    with_shuffled = new.assign(Shuffled_Weights=np.random.permutation(new['is_before_rift_herald']))

    # Find the difference in winrate between the two groups
    # Since we have filtered it to only blue side, we only need to look at the blue side winrate
    # to infer the red side winrate
    group_means = with_shuffled.groupby(['Shuffled_Weights'])['result'].mean()
    
    # Append the difference in winrate to the list of permutations
    difference = group_means[False] - group_means[True]
    perms.append(difference)

p_value_rift = (perms <= obs_rift_diff).mean()


#The P value is 0.0031 which is less than our significance threshold of 0.01, so we can reject the null hypothesis. 
#This means that the win rate distribution is different before and after the rift herald was introduced, 
#suggesting that the rift herald did indeed have an effect on the win rate between the two sides.

print(p_value_rift)

  0%|          | 0/10000 [00:00<?, ?it/s]

0.0047


In [133]:
fig = px.histogram(perms, title='Empirical Distribution of MD (Rift)', template='plotly_dark', nbins=20, labels={'value': 'Blue Advantage'})

# Density plot
fig.update_traces(histnorm='probability')

# Change y-axis to show percentage
fig.update_yaxes(title='Probability')
fig.update_layout(showlegend=False)

# Plot the observed difference as a vertical line
fig.add_vline(x=obs_rift_diff, line_width=3, line_color='red', annotation_text=f'Observed Rift Difference = {obs_rift_diff:.3f}   ', annotation_position='top left')
fig.write_html('assets/MD_rift_diff.html', include_plotlyjs='cdn')
fig.show()
