# League of Legends Predictor

**Name(s)**: Jawad Najar and Ali Boussi

**Website Link**: (your website link)

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

pd.options.plotting.backend = 'plotly'
pd.set_option('display.max_columns', None)

## Step 1: Introduction

###

In [None]:
raw_data = pd.read_csv('data.csv')
raw_data.head()

In [None]:
raw_data.shape

In [None]:
raw_data['gameid'].nunique()

## Step 2: Data Cleaning and Exploratory Data Analysis

#### Data Cleaning

In [None]:
df = raw_data.copy()

# filter for complete data
df = df[df['datacompleteness'] == 'complete']

# drop unnecessary columns (metadata)
meta_columns = ['participantid', 'playerid', 'teamid', 
                'datacompleteness', 'url', 'date',
                'teamname', 'playername']
df = df.drop(columns=meta_columns)

# seperate team and player stats
team_df = df[df['position'] == 'team']
player_df = df[df['position'] != 'team']

# Dropping all columnsn that are all NaN (it is a team/player stat, respectively)
player_df = player_df.dropna(axis=1, how='all')
team_df = team_df.dropna(axis=1, how='all')

# Team Adv columns 
team_df['tower_advantage'] = team_df['towers'] - team_df['opp_towers']
team_df['inhibitor_advantage'] = team_df['inhibitors'] - team_df['opp_inhibitors']
team_df['gamelengthmin'] = team_df['gamelength'] / 60

In [None]:
# Visualizing data before imputation (if we decide to do it )
# (only the winning games, otherwise the data will always be symmetric)

# Visualize the distribution of gold difference columns
gold_diff_columns = ['golddiffat10', 'golddiffat15', 'golddiffat20']
for col in gold_diff_columns:
    fig = px.histogram(
        team_df[team_df['result'] == 1],
        x=col,
        nbins=20,
        title=f'Distribution of {col}',
        labels={col: 'Gold Difference'},
        template='plotly_white',
        color_discrete_sequence=['gold']
    )
    fig.update_layout(
        bargap=0.1,
        xaxis_title='Gold Difference',
        yaxis_title='Frequency',
        font=dict(size=14)
    )
    fig.show()

# Visualize the distribution of XP difference columns
xp_diff_columns = ['xpdiffat10', 'xpdiffat15', 'xpdiffat20']
for col in xp_diff_columns:
    fig = px.histogram(
        team_df[team_df['result'] == 1],
        x=col,
        nbins=20,
        title=f'Distribution of {col}',
        labels={col: 'XP Difference'},
        template='plotly_white',
        color_discrete_sequence=['blue']
    )
    fig.update_layout(
        bargap=0.1,
        xaxis_title='XP Difference',
        yaxis_title='Frequency',
        font=dict(size=14)
    )
    fig.show()

In [None]:
# Deciding against imputation here, as NaN values represent that a game has ended prior to that time
# and we want to keep that information

# Creating 3 features that we will plot against one another in the future
player_df['avg_xpdiff'] = player_df[xp_diff_columns].mean(axis=1, skipna=True)
player_df['avg_golddiff'] = player_df[gold_diff_columns].mean(axis=1, skipna=True)
player_df['game_length_min'] = player_df['gamelength'] / 60

In [None]:
fig = px.histogram(
    team_df,
    x='tower_advantage',
    nbins=15,  
    title='Distribution of Tower Advantage',
    labels={'tower_advantage': 'Tower Advantage (Team - Opponent)'},
    color_discrete_sequence=['indianred'] 
)

fig.update_layout(
    bargap=0.1, 
    template='plotly_white',  
    xaxis_title='Tower Advantage',
    yaxis_title='Number of Games',
    font=dict(size=14)
)

fig.show()

In [None]:
fig = px.histogram(
    team_df,
    x='inhibitor_advantage',
    nbins=15, 
    title='Distribution of Inhibitor Advantage',
    labels={'inhibitor_advantage': 'Inhibitor Advantage (Team - Opponent)'},
    color_discrete_sequence=['skyblue'] 
)

fig.update_layout(
    bargap=0.1, 
    template='plotly_white', 
    xaxis_title='Inhibitor Advantage',
    yaxis_title='Number of Games',
    font=dict(size=14)
)

fig.show()

In [None]:
# Creating a copy df
gold_stats = team_df.copy()[['gameid', 'earnedgold', 'side', 'result']]

gold_stats['red_win'] = ((gold_stats['result'] == 1) & ((gold_stats['side'] == 'Red')))
red_side_gold = gold_stats.loc[gold_stats['side'] == 'Red', ['gameid', 'earnedgold', 'red_win']]
blue_side_gold =  gold_stats.loc[gold_stats['side'] == 'Blue', ['gameid', 'earnedgold']]
gold_final = red_side_gold.merge(blue_side_gold, on='gameid', suffixes=('_red', '_blue'))

fig = px.scatter(
    gold_final,
    x='earnedgold_red',
    y='earnedgold_blue',
    color='red_win',
    color_discrete_map={True: '#D2042D', False: '#0047AB'},
    opacity=0.6,
    hover_data={
        'earnedgold_red': ':.0f',
        'earnedgold_blue': ':.0f',
        'red_win': True
    },
    labels={
        'earnedgold_red': 'Red Team Gold',
        'earnedgold_blue': 'Blue Team Gold',
        'red_win': 'Red Team Won'
    },
    title='Red Gold Earned vs Blue Gold Earned by Game Outcome'
)

fig.update_traces(marker=dict(size=6))

fig.update_layout(
    template='plotly_white', 
    xaxis=dict(title='Red Team Earned Gold', tickformat=',', title_font_size=16),
    yaxis=dict(title='Blue Team Earned Gold', tickformat=',', title_font_size=16),
    legend_title_text='Red Team Win?',
    legend=dict(font=dict(size=12))
)

fig.show()

In [None]:
# Here, we will sample 10% of the games
wins = player_df[player_df['result'] == 1]
all_games = wins['gameid'].drop_duplicates()
sampled_games = all_games.sample(frac=0.1, random_state=98)
sampled_df = wins[wins['gameid'].isin(sampled_games)]

In [None]:
fig = px.scatter(
    sampled_df,
    x='game_length_min',
    y='avg_xpdiff',
    color='position',
    title='Game Length (minutes) vs Average XP Difference by position (Winning Games)',
    labels={'avg_xpdiff': 'Average XP Difference', 'game_length_min': 'Game Length (minutes)'},
    template='plotly_white',
    hover_data=['position']
)

fig.update_layout(
    xaxis_title='Game Length (minutes)',
    yaxis_title='Average XP Difference',
    font=dict(size=14)
)

fig.show()

In [None]:
fig = px.scatter(
    sampled_df,
    x='game_length_min',
    y='avg_golddiff',
    color='position',
    title='Game Length (minutes) vs Average Gold Difference by position (Winning Games)',
    labels={'game_length_min': 'Game Length (minutes)', 'avg_golddiff': 'Average Gold Difference'},
    template='plotly_white',
    hover_data=['position']
)

fig.update_layout(
    xaxis_title='Game Length (minutes)',
    yaxis_title='Average Gold Difference',
    font=dict(size=14)
)

fig.show()

In [None]:
# Correlation of Gold and XP Difference with Win by Position
gold_corr = player_df.groupby('position').apply(lambda g: g['avg_golddiff'].corr(g['result']))
xp_corr = player_df.groupby('position').apply(lambda g: g['avg_xpdiff'].corr(g['result']))

imp = (pd
       .DataFrame({'gold': gold_corr, 'xp': xp_corr})
       .reset_index()
       .melt(id_vars='position', var_name='metric', value_name='corr'))

fig = px.bar(
    imp,
    x='position',
    y='corr',
    color='metric',
    barmode='group',
    title='Correlation of Gold/XP Difference with Win by Position',
    labels={'corr':'Correlation with Win','position':'Position','metric':'Metric'},
    template='plotly_white'
)
fig.update_layout(font=dict(size=14))
fig.show()

In [None]:
sampled_games = team_df.sample(frac=0.1, random_state=98)
fig = px.scatter(
    sampled_games,
    x='totalgold',
    y='gamelengthmin',
    title='Total Gold vs Game Length (minutes)',
    labels={'totalgold': 'Total Gold', 'gamelengthmin': 'Game Length (minutes)'},
    template='plotly_white',
    hover_data=['position']
)

fig.update_layout(
    xaxis_title='Total Gold',
    yaxis_title='Game Length (minutes)',
    font=dict(size=14)
)

In [None]:
# Creating a pivot table for win rates by champion and position
# Example of how this reads: "Aatrox has a win rate of .535 in the top lane, not played in any other lane"
win_rates = player_df.groupby(['champion','position'])['result'] \
                    .agg(win_rate='mean', games='count') \
                    .reset_index()

pd.pivot_table(
    win_rates[win_rates['games'] >= 30],
    index='champion',
    columns='position',
    values='win_rate'
).sort_values(by='champion', ascending=True)

## Step 3: Framing a Prediction Problem

<p> When a team is down by, say, >5k gold at 20 minutes, predict the probability they still win. </p>

## Step 4: Baseline Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
features = [
    'totalgold',
    'earnedgold',
    'minionkills',
    'team kpm',
    'ckpm',
    'cspm',
    'deaths',
    'earned gpm',
    'visionscore',
    'monsterkills',
    'damagetochampions',
    'teamkills',
    'vspm',
    'wardsplaced',
    'dpm',
    'goldspent',
    'wpm'
]
X = team_df[features]
y = team_df['gamelength']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
len(features)

In [None]:
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))

## Step 5: Final Model

In [59]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, QuantileTransformer, FunctionTransformer
from sklearn.linear_model import Lasso
from sklearn.compose import make_column_transformer

In [None]:
# Loop through each feature and plot its distribution to see which are non-normal
for feature in features:
    fig = px.histogram(
        team_df, 
        x=feature, 
        nbins=30, 
        title=f'Distribution of {feature}', 
        labels={feature: feature.capitalize()},
        template='plotly_white'
    )
    fig.update_layout(
        xaxis_title=feature.capitalize(),
        yaxis_title='Frequency',
        font=dict(size=14)
    )
    fig.show()

In [70]:
# The columns that we will end up using a Quantile Transformer on
non_normal_features = [
    "team kpm",         # Right-skewed
    "ckpm",             # Right-skewed
    "deaths",           # Left-skewed
    "earned gpm",       # Bimodal
    "damagetochampions",# Right-skewed
    "teamkills"         # Left-skewed
]

col_trans = make_column_transformer(
    (QuantileTransformer(output_distribution='normal'), non_normal_features),
    remainder='passthrough'
)

svr_pipeline = make_pipeline(
    col_trans,
    StandardScaler(),
    Lasso(alpha=0.1, random_state=42),
)


In [None]:
# Hyperparameter tuning
param_grid = {
    "lasso__alpha":      np.logspace(-3, 1, 20),
    "lasso__selection":  ["cyclic", "random"],
    "lasso__fit_intercept": [True, False],
    "lasso__max_iter":   [10_000, 100_000],
    "lasso__tol":        [1e-4, 1e-5],
    "lasso__positive":   [False, True],
}

grid = GridSearchCV(svr_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid.fit(X_train, y_train)
print("Best alpha:", grid.best_params_)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [None]:
# Finding the most influential features
y_pred = grid.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))

best_pipeline = grid.best_estimator_
coefficients = best_pipeline.named_steps['lasso'].coef_
intercept = best_pipeline.named_steps['lasso'].intercept_
feature_names = X_train.columns

coef_df = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients
}).sort_values(by='coefficient', key=abs, ascending=False)

print(coef_df)


MSE: 0.05503596331011578
              feature  coefficient
6              deaths  1119.544629
7          earned gpm  -843.063351
8         visionscore     1.469283
9        monsterkills    -0.878420
10  damagetochampions     0.441720
11          teamkills     0.441409
4                ckpm     0.411317
15          goldspent     0.314599
14                dpm    -0.276253
12               vspm    -0.248528
1          earnedgold    -0.236624
13        wardsplaced     0.205633
2         minionkills     0.143612
5                cspm     0.124889
16                wpm    -0.114684
3            team kpm    -0.053335
0           totalgold    -0.030688
