In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# For preprocessing and splitting data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# For evaluating models
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score

# For deep learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow import keras

# xgboost
import xgboost as xgb

In [34]:
ranks = pd.read_csv('LeagueRanking.csv')
ranks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 66 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Position         470 non-null    object
 1   Region           470 non-null    object
 2   Player           470 non-null    object
 3   Total            471 non-null    object
 4   International    471 non-null    object
 5   Unnamed: 5       471 non-null    object
 6   Regional - East  471 non-null    object
 7   Unnamed: 7       471 non-null    object
 8   Unnamed: 8       471 non-null    object
 9   Unnamed: 9       471 non-null    object
 10  Regional - West  471 non-null    object
 11  Unnamed: 11      471 non-null    object
 12  Unnamed: 12      471 non-null    object
 13  Unnamed: 13      471 non-null    object
 14  LCK              471 non-null    object
 15  Unnamed: 15      471 non-null    object
 16  Unnamed: 16      471 non-null    object
 17  Unnamed: 17      471 non-null    ob

In [35]:
df = pd.read_csv('OEFULL.csv', low_memory=False)
df

Unnamed: 0,index,gameid,datacompleteness,url,league,year,split,playoffs,date,game,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
0,862222,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,...,760.0,4854.0,4919.0,-21.0,10.0,23.0,4.0,4.0,6.0,10.0
1,862214,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,...,225.0,621.0,733.0,8.0,1.0,5.0,1.0,1.0,2.0,0.0
2,862212,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,...,206.0,76.0,-512.0,-18.0,3.0,4.0,0.0,1.0,2.0,2.0
3,862213,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,...,140.0,-888.0,351.0,-42.0,0.0,5.0,3.0,2.0,1.0,1.0
4,862216,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,...,28.0,1780.0,2397.0,-19.0,0.0,7.0,0.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
872023,292715,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,...,870.0,2295.0,7637.0,-8.0,9.0,23.0,4.0,4.0,9.0,9.0
872024,292706,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,...,214.0,-716.0,-2609.0,-14.0,1.0,2.0,3.0,1.0,6.0,0.0
872025,292711,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,...,200.0,716.0,2609.0,14.0,1.0,6.0,0.0,1.0,2.0,3.0
872026,292712,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,...,264.0,-445.0,1274.0,-28.0,1.0,5.0,0.0,1.0,2.0,0.0


In [36]:
df = df.drop(columns=['index'])
df

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
0,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,760.0,4854.0,4919.0,-21.0,10.0,23.0,4.0,4.0,6.0,10.0
1,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,225.0,621.0,733.0,8.0,1.0,5.0,1.0,1.0,2.0,0.0
2,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,206.0,76.0,-512.0,-18.0,3.0,4.0,0.0,1.0,2.0,2.0
3,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,140.0,-888.0,351.0,-42.0,0.0,5.0,3.0,2.0,1.0,1.0
4,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,28.0,1780.0,2397.0,-19.0,0.0,7.0,0.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
872023,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,14.18,...,870.0,2295.0,7637.0,-8.0,9.0,23.0,4.0,4.0,9.0,9.0
872024,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,14.18,...,214.0,-716.0,-2609.0,-14.0,1.0,2.0,3.0,1.0,6.0,0.0
872025,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,14.18,...,200.0,716.0,2609.0,14.0,1.0,6.0,0.0,1.0,2.0,3.0
872026,LOLTMNT02_181861,complete,,HW,2024,,0,2024-11-03 21:08:36,5.0,14.18,...,264.0,-445.0,1274.0,-28.0,1.0,5.0,0.0,1.0,2.0,0.0


In [37]:
ranks = ranks[['Player', 'Total']]
ranks = ranks.dropna()
ranks

Unnamed: 0,Player,Total
1,Faker,1121
2,Knight,593
3,Chovy,572
4,Peanut,542
5,Keria,500
...,...,...
466,KiWiKiD,1
467,Keane,1
468,Hakuho,1
469,Elementz,1


In [38]:
df = df[df['playername'].isin(ranks['Player'])]
df


Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
1,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,225.0,621.0,733.0,8.0,1.0,5.0,1.0,1.0,2.0,0.0
3,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,140.0,-888.0,351.0,-42.0,0.0,5.0,3.0,2.0,1.0,1.0
4,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,28.0,1780.0,2397.0,-19.0,0.0,7.0,0.0,0.0,1.0,3.0
5,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,188.0,-76.0,512.0,18.0,1.0,2.0,2.0,3.0,4.0,0.0
6,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,161.0,3265.0,1950.0,50.0,6.0,2.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871869,LOLTMNT03_161676,complete,,LAS,2024,Champ 2,0,2024-11-03 05:56:58,2.0,14.21,...,27.0,1031.0,2095.0,12.0,0.0,10.0,3.0,0.0,6.0,2.0
871883,LOLTMNT03_162490,complete,,LAS,2024,Champ 2,0,2024-11-03 05:59:29,2.0,14.21,...,33.0,271.0,-232.0,-9.0,0.0,7.0,3.0,0.0,7.0,1.0
871928,LOLTMNT03_162500,complete,,LAS,2024,Champ 2,0,2024-11-03 07:08:41,1.0,14.21,...,25.0,927.0,2373.0,16.0,2.0,4.0,2.0,2.0,3.0,3.0
871953,LOLTMNT03_161683,complete,,LAS,2024,Champ 2,0,2024-11-03 07:51:03,2.0,14.21,...,43.0,-116.0,-681.0,-13.0,1.0,7.0,1.0,1.0,5.0,2.0


In [39]:
df = df[~df['participantid'].isin([100, 200])]
df

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
1,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,225.0,621.0,733.0,8.0,1.0,5.0,1.0,1.0,2.0,0.0
3,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,140.0,-888.0,351.0,-42.0,0.0,5.0,3.0,2.0,1.0,1.0
4,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,28.0,1780.0,2397.0,-19.0,0.0,7.0,0.0,0.0,1.0,3.0
5,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,188.0,-76.0,512.0,18.0,1.0,2.0,2.0,3.0,4.0,0.0
6,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,161.0,3265.0,1950.0,50.0,6.0,2.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871869,LOLTMNT03_161676,complete,,LAS,2024,Champ 2,0,2024-11-03 05:56:58,2.0,14.21,...,27.0,1031.0,2095.0,12.0,0.0,10.0,3.0,0.0,6.0,2.0
871883,LOLTMNT03_162490,complete,,LAS,2024,Champ 2,0,2024-11-03 05:59:29,2.0,14.21,...,33.0,271.0,-232.0,-9.0,0.0,7.0,3.0,0.0,7.0,1.0
871928,LOLTMNT03_162500,complete,,LAS,2024,Champ 2,0,2024-11-03 07:08:41,1.0,14.21,...,25.0,927.0,2373.0,16.0,2.0,4.0,2.0,2.0,3.0,3.0
871953,LOLTMNT03_161683,complete,,LAS,2024,Champ 2,0,2024-11-03 07:51:03,2.0,14.21,...,43.0,-116.0,-681.0,-13.0,1.0,7.0,1.0,1.0,5.0,2.0


In [40]:
# Remove columns with more than 50% NaN values
df = df.dropna(thresh=len(df) * 0.5, axis=1)
df

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
1,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,225.0,621.0,733.0,8.0,1.0,5.0,1.0,1.0,2.0,0.0
3,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,140.0,-888.0,351.0,-42.0,0.0,5.0,3.0,2.0,1.0,1.0
4,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,28.0,1780.0,2397.0,-19.0,0.0,7.0,0.0,0.0,1.0,3.0
5,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,188.0,-76.0,512.0,18.0,1.0,2.0,2.0,3.0,4.0,0.0
6,TRLH3/33,complete,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,2014-01-14 17:52:02,1.0,3.15,...,161.0,3265.0,1950.0,50.0,6.0,2.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871869,LOLTMNT03_161676,complete,,LAS,2024,Champ 2,0,2024-11-03 05:56:58,2.0,14.21,...,27.0,1031.0,2095.0,12.0,0.0,10.0,3.0,0.0,6.0,2.0
871883,LOLTMNT03_162490,complete,,LAS,2024,Champ 2,0,2024-11-03 05:59:29,2.0,14.21,...,33.0,271.0,-232.0,-9.0,0.0,7.0,3.0,0.0,7.0,1.0
871928,LOLTMNT03_162500,complete,,LAS,2024,Champ 2,0,2024-11-03 07:08:41,1.0,14.21,...,25.0,927.0,2373.0,16.0,2.0,4.0,2.0,2.0,3.0,3.0
871953,LOLTMNT03_161683,complete,,LAS,2024,Champ 2,0,2024-11-03 07:51:03,2.0,14.21,...,43.0,-116.0,-681.0,-13.0,1.0,7.0,1.0,1.0,5.0,2.0


We now have a df that has all the players in the ranking spreadsheet. Let us now clean it up. First, let's try to drop all of the @minute metrics, as they are quite redundant when we have the full game metrics, and would likely cause extra noise in the model

In [41]:
columns_with_at = [col for col in df.columns if 'at' in col]
print(columns_with_at)

['datacompleteness', 'date', 'patch', 'deaths', 'teamdeaths', 'damagemitigatedperminute', 'goldat10', 'xpat10', 'csat10', 'opp_goldat10', 'opp_xpat10', 'opp_csat10', 'golddiffat10', 'xpdiffat10', 'csdiffat10', 'killsat10', 'assistsat10', 'deathsat10', 'opp_killsat10', 'opp_assistsat10', 'opp_deathsat10', 'goldat15', 'xpat15', 'csat15', 'opp_goldat15', 'opp_xpat15', 'opp_csat15', 'golddiffat15', 'xpdiffat15', 'csdiffat15', 'killsat15', 'assistsat15', 'deathsat15', 'opp_killsat15', 'opp_assistsat15', 'opp_deathsat15', 'goldat20', 'xpat20', 'csat20', 'opp_goldat20', 'opp_xpat20', 'opp_csat20', 'golddiffat20', 'xpdiffat20', 'csdiffat20', 'killsat20', 'assistsat20', 'deathsat20', 'opp_killsat20', 'opp_assistsat20', 'opp_deathsat20', 'goldat25', 'xpat25', 'csat25', 'opp_goldat25', 'opp_xpat25', 'opp_csat25', 'golddiffat25', 'xpdiffat25', 'csdiffat25', 'killsat25', 'assistsat25', 'deathsat25', 'opp_killsat25', 'opp_assistsat25', 'opp_deathsat25']


In [42]:
columns_with_at.remove('patch')
columns_with_at.remove('deaths')
columns_with_at.remove('teamdeaths')
columns_with_at.remove('damagemitigatedperminute')
print(columns_with_at)

['datacompleteness', 'date', 'goldat10', 'xpat10', 'csat10', 'opp_goldat10', 'opp_xpat10', 'opp_csat10', 'golddiffat10', 'xpdiffat10', 'csdiffat10', 'killsat10', 'assistsat10', 'deathsat10', 'opp_killsat10', 'opp_assistsat10', 'opp_deathsat10', 'goldat15', 'xpat15', 'csat15', 'opp_goldat15', 'opp_xpat15', 'opp_csat15', 'golddiffat15', 'xpdiffat15', 'csdiffat15', 'killsat15', 'assistsat15', 'deathsat15', 'opp_killsat15', 'opp_assistsat15', 'opp_deathsat15', 'goldat20', 'xpat20', 'csat20', 'opp_goldat20', 'opp_xpat20', 'opp_csat20', 'golddiffat20', 'xpdiffat20', 'csdiffat20', 'killsat20', 'assistsat20', 'deathsat20', 'opp_killsat20', 'opp_assistsat20', 'opp_deathsat20', 'goldat25', 'xpat25', 'csat25', 'opp_goldat25', 'opp_xpat25', 'opp_csat25', 'golddiffat25', 'xpdiffat25', 'csdiffat25', 'killsat25', 'assistsat25', 'deathsat25', 'opp_killsat25', 'opp_assistsat25', 'opp_deathsat25']


In [43]:
df = df.drop(columns=columns_with_at)
df

Unnamed: 0,gameid,url,league,year,split,playoffs,game,patch,participantid,side,...,earnedgold,earned gpm,earnedgoldshare,goldspent,total cs,minionkills,monsterkills,monsterkillsownjungle,monsterkillsenemyjungle,cspm
1,TRLH3/33,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,1.0,3.15,3,Blue,...,10604.0,330.6861,0.267565,8750.0,276.0,247.0,29.0,26.0,3.0,8.6071
3,TRLH3/33,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,1.0,3.15,2,Blue,...,6153.0,191.8815,0.155260,6780.0,115.0,24.0,91.0,83.0,8.0,3.5863
4,TRLH3/33,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,1.0,3.15,5,Blue,...,5085.0,158.5759,0.128312,6390.0,11.0,11.0,0.0,0.0,0.0,0.3430
5,TRLH3/33,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,1.0,3.15,6,Red,...,6090.0,189.9168,0.239525,7920.0,232.0,213.0,19.0,13.0,6.0,7.2349
6,TRLH3/33,http://matchhistory.na.leagueoflegends.com/en/...,EU LCS,2014,Spring,0,1.0,3.15,4,Blue,...,10250.0,319.6466,0.258633,9260.0,282.0,250.0,32.0,28.0,4.0,8.7942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871869,LOLTMNT03_161676,,LAS,2024,Champ 2,0,2.0,14.21,10,Red,...,4667.0,145.4649,0.111322,7225.0,44.0,44.0,0.0,,,1.3714
871883,LOLTMNT03_162490,,LAS,2024,Champ 2,0,2.0,14.21,10,Red,...,5667.0,120.3185,0.098267,10850.0,59.0,55.0,4.0,,,1.2527
871928,LOLTMNT03_162500,,LAS,2024,Champ 2,0,1.0,14.21,10,Red,...,5744.0,117.7049,0.106231,11575.0,59.0,59.0,0.0,,,1.2090
871953,LOLTMNT03_161683,,LAS,2024,Champ 2,0,2.0,14.21,10,Red,...,3796.0,121.9272,0.119937,6750.0,37.0,37.0,0.0,,,1.1884


In [44]:
cols = df.columns
for col in cols:
    print(col)

gameid
url
league
year
split
playoffs
game
patch
participantid
side
position
playername
playerid
teamname
teamid
champion
ban1
ban2
ban3
ban4
ban5
gamelength
result
kills
deaths
assists
teamkills
teamdeaths
doublekills
triplekills
quadrakills
pentakills
firstblood
firstbloodkill
firstbloodassist
firstbloodvictim
team kpm
ckpm
inhibitors
opp_inhibitors
damagetochampions
dpm
damageshare
damagetakenperminute
damagemitigatedperminute
wardsplaced
wpm
wardskilled
wcpm
controlwardsbought
visionscore
vspm
totalgold
earnedgold
earned gpm
earnedgoldshare
goldspent
total cs
minionkills
monsterkills
monsterkillsownjungle
monsterkillsenemyjungle
cspm


In [45]:
threshold = len(df) * 0.9
df = df.dropna(thresh=threshold, axis=1)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 128605 entries, 1 to 871960
Data columns (total 55 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   gameid                    128605 non-null  object 
 1   league                    128605 non-null  object 
 2   year                      128605 non-null  int64  
 3   playoffs                  128605 non-null  int64  
 4   game                      128605 non-null  float64
 5   patch                     127558 non-null  float64
 6   participantid             128605 non-null  int64  
 7   side                      128605 non-null  object 
 8   position                  128605 non-null  object 
 9   playername                128605 non-null  object 
 10  playerid                  128508 non-null  object 
 11  teamname                  128605 non-null  object 
 12  teamid                    128499 non-null  object 
 13  champion                  128605 non-null  object

In [47]:
df = df.dropna()
df

Unnamed: 0,gameid,league,year,playoffs,game,patch,participantid,side,position,playername,...,vspm,totalgold,earnedgold,earned gpm,earnedgoldshare,goldspent,total cs,minionkills,monsterkills,cspm
1,TRLH3/33,EU LCS,2014,0,1.0,3.15,3,Blue,mid,xPeke,...,0.0000,14564.0,10604.0,330.6861,0.267565,8750.0,276.0,247.0,29.0,8.6071
3,TRLH3/33,EU LCS,2014,0,1.0,3.15,2,Blue,jng,Cyanide,...,0.0000,10113.0,6153.0,191.8815,0.155260,6780.0,115.0,24.0,91.0,3.5863
4,TRLH3/33,EU LCS,2014,0,1.0,3.15,5,Blue,sup,YellOwStaR,...,0.0000,9045.0,5085.0,158.5759,0.128312,6390.0,11.0,11.0,0.0,0.3430
5,TRLH3/33,EU LCS,2014,0,1.0,3.15,6,Red,top,Darien,...,0.0000,10050.0,6090.0,189.9168,0.239525,7920.0,232.0,213.0,19.0,7.2349
6,TRLH3/33,EU LCS,2014,0,1.0,3.15,4,Blue,bot,Rekkles,...,0.0000,14210.0,10250.0,319.6466,0.258633,9260.0,282.0,250.0,32.0,8.7942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871869,LOLTMNT03_161676,LAS,2024,0,2.0,14.21,10,Red,sup,Cloud,...,3.8026,8870.0,4667.0,145.4649,0.111322,7225.0,44.0,44.0,0.0,1.3714
871883,LOLTMNT03_162490,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,5.0318,11708.0,5667.0,120.3185,0.098267,10850.0,59.0,55.0,4.0,1.2527
871928,LOLTMNT03_162500,LAS,2024,0,1.0,14.21,10,Red,sup,Cloud,...,4.5082,11993.0,5744.0,117.7049,0.106231,11575.0,59.0,59.0,0.0,1.2090
871953,LOLTMNT03_161683,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,5.6210,7882.0,3796.0,121.9272,0.119937,6750.0,37.0,37.0,0.0,1.1884


In [48]:
df = df.merge(
    ranks[['Player', 'Total']], 
    left_on='playername', 
    right_on='Player', 
)
df



Unnamed: 0,gameid,league,year,playoffs,game,patch,participantid,side,position,playername,...,earnedgold,earned gpm,earnedgoldshare,goldspent,total cs,minionkills,monsterkills,cspm,Player,Total
0,TRLH3/33,EU LCS,2014,0,1.0,3.15,3,Blue,mid,xPeke,...,10604.0,330.6861,0.267565,8750.0,276.0,247.0,29.0,8.6071,xPeke,75
1,TRLH3/33,EU LCS,2014,0,1.0,3.15,2,Blue,jng,Cyanide,...,6153.0,191.8815,0.155260,6780.0,115.0,24.0,91.0,3.5863,Cyanide,73
2,TRLH3/33,EU LCS,2014,0,1.0,3.15,5,Blue,sup,YellOwStaR,...,5085.0,158.5759,0.128312,6390.0,11.0,11.0,0.0,0.3430,YellOwStaR,157
3,TRLH3/33,EU LCS,2014,0,1.0,3.15,6,Red,top,Darien,...,6090.0,189.9168,0.239525,7920.0,232.0,213.0,19.0,7.2349,Darien,6
4,TRLH3/33,EU LCS,2014,0,1.0,3.15,4,Blue,bot,Rekkles,...,10250.0,319.6466,0.258633,9260.0,282.0,250.0,32.0,8.7942,Rekkles,289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119728,LOLTMNT03_161676,LAS,2024,0,2.0,14.21,10,Red,sup,Cloud,...,4667.0,145.4649,0.111322,7225.0,44.0,44.0,0.0,1.3714,Cloud,18
119729,LOLTMNT03_162490,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,5667.0,120.3185,0.098267,10850.0,59.0,55.0,4.0,1.2527,Neo,1
119730,LOLTMNT03_162500,LAS,2024,0,1.0,14.21,10,Red,sup,Cloud,...,5744.0,117.7049,0.106231,11575.0,59.0,59.0,0.0,1.2090,Cloud,18
119731,LOLTMNT03_161683,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,3796.0,121.9272,0.119937,6750.0,37.0,37.0,0.0,1.1884,Neo,1


In [49]:
df = df.drop(columns=['Player'])
df

Unnamed: 0,gameid,league,year,playoffs,game,patch,participantid,side,position,playername,...,totalgold,earnedgold,earned gpm,earnedgoldshare,goldspent,total cs,minionkills,monsterkills,cspm,Total
0,TRLH3/33,EU LCS,2014,0,1.0,3.15,3,Blue,mid,xPeke,...,14564.0,10604.0,330.6861,0.267565,8750.0,276.0,247.0,29.0,8.6071,75
1,TRLH3/33,EU LCS,2014,0,1.0,3.15,2,Blue,jng,Cyanide,...,10113.0,6153.0,191.8815,0.155260,6780.0,115.0,24.0,91.0,3.5863,73
2,TRLH3/33,EU LCS,2014,0,1.0,3.15,5,Blue,sup,YellOwStaR,...,9045.0,5085.0,158.5759,0.128312,6390.0,11.0,11.0,0.0,0.3430,157
3,TRLH3/33,EU LCS,2014,0,1.0,3.15,6,Red,top,Darien,...,10050.0,6090.0,189.9168,0.239525,7920.0,232.0,213.0,19.0,7.2349,6
4,TRLH3/33,EU LCS,2014,0,1.0,3.15,4,Blue,bot,Rekkles,...,14210.0,10250.0,319.6466,0.258633,9260.0,282.0,250.0,32.0,8.7942,289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119728,LOLTMNT03_161676,LAS,2024,0,2.0,14.21,10,Red,sup,Cloud,...,8870.0,4667.0,145.4649,0.111322,7225.0,44.0,44.0,0.0,1.3714,18
119729,LOLTMNT03_162490,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,11708.0,5667.0,120.3185,0.098267,10850.0,59.0,55.0,4.0,1.2527,1
119730,LOLTMNT03_162500,LAS,2024,0,1.0,14.21,10,Red,sup,Cloud,...,11993.0,5744.0,117.7049,0.106231,11575.0,59.0,59.0,0.0,1.2090,18
119731,LOLTMNT03_161683,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,7882.0,3796.0,121.9272,0.119937,6750.0,37.0,37.0,0.0,1.1884,1


In [50]:
df['Total'] = df['Total'].astype(int)
df

Unnamed: 0,gameid,league,year,playoffs,game,patch,participantid,side,position,playername,...,totalgold,earnedgold,earned gpm,earnedgoldshare,goldspent,total cs,minionkills,monsterkills,cspm,Total
0,TRLH3/33,EU LCS,2014,0,1.0,3.15,3,Blue,mid,xPeke,...,14564.0,10604.0,330.6861,0.267565,8750.0,276.0,247.0,29.0,8.6071,75
1,TRLH3/33,EU LCS,2014,0,1.0,3.15,2,Blue,jng,Cyanide,...,10113.0,6153.0,191.8815,0.155260,6780.0,115.0,24.0,91.0,3.5863,73
2,TRLH3/33,EU LCS,2014,0,1.0,3.15,5,Blue,sup,YellOwStaR,...,9045.0,5085.0,158.5759,0.128312,6390.0,11.0,11.0,0.0,0.3430,157
3,TRLH3/33,EU LCS,2014,0,1.0,3.15,6,Red,top,Darien,...,10050.0,6090.0,189.9168,0.239525,7920.0,232.0,213.0,19.0,7.2349,6
4,TRLH3/33,EU LCS,2014,0,1.0,3.15,4,Blue,bot,Rekkles,...,14210.0,10250.0,319.6466,0.258633,9260.0,282.0,250.0,32.0,8.7942,289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119728,LOLTMNT03_161676,LAS,2024,0,2.0,14.21,10,Red,sup,Cloud,...,8870.0,4667.0,145.4649,0.111322,7225.0,44.0,44.0,0.0,1.3714,18
119729,LOLTMNT03_162490,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,11708.0,5667.0,120.3185,0.098267,10850.0,59.0,55.0,4.0,1.2527,1
119730,LOLTMNT03_162500,LAS,2024,0,1.0,14.21,10,Red,sup,Cloud,...,11993.0,5744.0,117.7049,0.106231,11575.0,59.0,59.0,0.0,1.2090,18
119731,LOLTMNT03_161683,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,7882.0,3796.0,121.9272,0.119937,6750.0,37.0,37.0,0.0,1.1884,1


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119733 entries, 0 to 119732
Data columns (total 56 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   gameid                    119733 non-null  object 
 1   league                    119733 non-null  object 
 2   year                      119733 non-null  int64  
 3   playoffs                  119733 non-null  int64  
 4   game                      119733 non-null  float64
 5   patch                     119733 non-null  float64
 6   participantid             119733 non-null  int64  
 7   side                      119733 non-null  object 
 8   position                  119733 non-null  object 
 9   playername                119733 non-null  object 
 10  playerid                  119733 non-null  object 
 11  teamname                  119733 non-null  object 
 12  teamid                    119733 non-null  object 
 13  champion                  119733 non-null  o

Most of the remaining columns are descriptive features of a player, but there are many that are highly noisy, and thus not worth keeping. Any features related to draft, so bans, any firstblood related metrics are too rare to be useful, and all multi-kills will already be factored by the kills metric.

In [52]:
noisy_cols = [
    'ban1',
    'ban2',
    'ban3',
    'doublekills',
    'triplekills',
    'quadrakills',
    'pentakills',
    'firstblood',
    'firstbloodkill',
    'firstbloodassist',
    'firstbloodvictim'
]

In [53]:
df_s = df.drop(columns=noisy_cols)
df_s

Unnamed: 0,gameid,league,year,playoffs,game,patch,participantid,side,position,playername,...,totalgold,earnedgold,earned gpm,earnedgoldshare,goldspent,total cs,minionkills,monsterkills,cspm,Total
0,TRLH3/33,EU LCS,2014,0,1.0,3.15,3,Blue,mid,xPeke,...,14564.0,10604.0,330.6861,0.267565,8750.0,276.0,247.0,29.0,8.6071,75
1,TRLH3/33,EU LCS,2014,0,1.0,3.15,2,Blue,jng,Cyanide,...,10113.0,6153.0,191.8815,0.155260,6780.0,115.0,24.0,91.0,3.5863,73
2,TRLH3/33,EU LCS,2014,0,1.0,3.15,5,Blue,sup,YellOwStaR,...,9045.0,5085.0,158.5759,0.128312,6390.0,11.0,11.0,0.0,0.3430,157
3,TRLH3/33,EU LCS,2014,0,1.0,3.15,6,Red,top,Darien,...,10050.0,6090.0,189.9168,0.239525,7920.0,232.0,213.0,19.0,7.2349,6
4,TRLH3/33,EU LCS,2014,0,1.0,3.15,4,Blue,bot,Rekkles,...,14210.0,10250.0,319.6466,0.258633,9260.0,282.0,250.0,32.0,8.7942,289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119728,LOLTMNT03_161676,LAS,2024,0,2.0,14.21,10,Red,sup,Cloud,...,8870.0,4667.0,145.4649,0.111322,7225.0,44.0,44.0,0.0,1.3714,18
119729,LOLTMNT03_162490,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,11708.0,5667.0,120.3185,0.098267,10850.0,59.0,55.0,4.0,1.2527,1
119730,LOLTMNT03_162500,LAS,2024,0,1.0,14.21,10,Red,sup,Cloud,...,11993.0,5744.0,117.7049,0.106231,11575.0,59.0,59.0,0.0,1.2090,18
119731,LOLTMNT03_161683,LAS,2024,0,2.0,14.21,10,Red,sup,Neo,...,7882.0,3796.0,121.9272,0.119937,6750.0,37.0,37.0,0.0,1.1884,1


In [54]:
df_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119733 entries, 0 to 119732
Data columns (total 45 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   gameid                    119733 non-null  object 
 1   league                    119733 non-null  object 
 2   year                      119733 non-null  int64  
 3   playoffs                  119733 non-null  int64  
 4   game                      119733 non-null  float64
 5   patch                     119733 non-null  float64
 6   participantid             119733 non-null  int64  
 7   side                      119733 non-null  object 
 8   position                  119733 non-null  object 
 9   playername                119733 non-null  object 
 10  playerid                  119733 non-null  object 
 11  teamname                  119733 non-null  object 
 12  teamid                    119733 non-null  object 
 13  champion                  119733 non-null  o

Let's create df X which will train on all of the useful features in our current dataframe with the target being the player rating

In [63]:
X = df_s.drop(columns=['playername', 'playerid', 'teamname', 'teamid', 'champion'])
X

Unnamed: 0,gameid,league,year,playoffs,game,patch,participantid,side,position,gamelength,...,totalgold,earnedgold,earned gpm,earnedgoldshare,goldspent,total cs,minionkills,monsterkills,cspm,Total
0,TRLH3/33,EU LCS,2014,0,1.0,3.15,3,Blue,mid,1924,...,14564.0,10604.0,330.6861,0.267565,8750.0,276.0,247.0,29.0,8.6071,75
1,TRLH3/33,EU LCS,2014,0,1.0,3.15,2,Blue,jng,1924,...,10113.0,6153.0,191.8815,0.155260,6780.0,115.0,24.0,91.0,3.5863,73
2,TRLH3/33,EU LCS,2014,0,1.0,3.15,5,Blue,sup,1924,...,9045.0,5085.0,158.5759,0.128312,6390.0,11.0,11.0,0.0,0.3430,157
3,TRLH3/33,EU LCS,2014,0,1.0,3.15,6,Red,top,1924,...,10050.0,6090.0,189.9168,0.239525,7920.0,232.0,213.0,19.0,7.2349,6
4,TRLH3/33,EU LCS,2014,0,1.0,3.15,4,Blue,bot,1924,...,14210.0,10250.0,319.6466,0.258633,9260.0,282.0,250.0,32.0,8.7942,289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119728,LOLTMNT03_161676,LAS,2024,0,2.0,14.21,10,Red,sup,1925,...,8870.0,4667.0,145.4649,0.111322,7225.0,44.0,44.0,0.0,1.3714,18
119729,LOLTMNT03_162490,LAS,2024,0,2.0,14.21,10,Red,sup,2826,...,11708.0,5667.0,120.3185,0.098267,10850.0,59.0,55.0,4.0,1.2527,1
119730,LOLTMNT03_162500,LAS,2024,0,1.0,14.21,10,Red,sup,2928,...,11993.0,5744.0,117.7049,0.106231,11575.0,59.0,59.0,0.0,1.2090,18
119731,LOLTMNT03_161683,LAS,2024,0,2.0,14.21,10,Red,sup,1868,...,7882.0,3796.0,121.9272,0.119937,6750.0,37.0,37.0,0.0,1.1884,1


In [64]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119733 entries, 0 to 119732
Data columns (total 40 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   gameid                    119733 non-null  object 
 1   league                    119733 non-null  object 
 2   year                      119733 non-null  int64  
 3   playoffs                  119733 non-null  int64  
 4   game                      119733 non-null  float64
 5   patch                     119733 non-null  float64
 6   participantid             119733 non-null  int64  
 7   side                      119733 non-null  object 
 8   position                  119733 non-null  object 
 9   gamelength                119733 non-null  int64  
 10  result                    119733 non-null  int64  
 11  kills                     119733 non-null  int64  
 12  deaths                    119733 non-null  int64  
 13  assists                   119733 non-null  i

In [65]:
X = X.drop(columns='league')
X = pd.get_dummies(X, columns=['side', 'position'], drop_first=True)
X

Unnamed: 0,gameid,year,playoffs,game,patch,participantid,gamelength,result,kills,deaths,...,total cs,minionkills,monsterkills,cspm,Total,side_Red,position_jng,position_mid,position_sup,position_top
0,TRLH3/33,2014,0,1.0,3.15,3,1924,1,10,1,...,276.0,247.0,29.0,8.6071,75,False,False,True,False,False
1,TRLH3/33,2014,0,1.0,3.15,2,1924,1,0,4,...,115.0,24.0,91.0,3.5863,73,False,True,False,False,False
2,TRLH3/33,2014,0,1.0,3.15,5,1924,1,0,0,...,11.0,11.0,0.0,0.3430,157,False,False,False,True,False
3,TRLH3/33,2014,0,1.0,3.15,6,1924,0,1,5,...,232.0,213.0,19.0,7.2349,6,True,False,False,False,True
4,TRLH3/33,2014,0,1.0,3.15,4,1924,1,8,0,...,282.0,250.0,32.0,8.7942,289,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119728,LOLTMNT03_161676,2024,0,2.0,14.21,10,1925,1,1,4,...,44.0,44.0,0.0,1.3714,18,True,False,False,True,False
119729,LOLTMNT03_162490,2024,0,2.0,14.21,10,2826,0,0,7,...,59.0,55.0,4.0,1.2527,1,True,False,False,True,False
119730,LOLTMNT03_162500,2024,0,1.0,14.21,10,2928,0,2,6,...,59.0,59.0,0.0,1.2090,18,True,False,False,True,False
119731,LOLTMNT03_161683,2024,0,2.0,14.21,10,1868,0,1,3,...,37.0,37.0,0.0,1.1884,1,True,False,False,True,False


In [66]:
# Move the column 'Total' to the back
X = X[[col for col in X.columns if col != 'Total'] + ['Total']]
X


Unnamed: 0,gameid,year,playoffs,game,patch,participantid,gamelength,result,kills,deaths,...,total cs,minionkills,monsterkills,cspm,side_Red,position_jng,position_mid,position_sup,position_top,Total
0,TRLH3/33,2014,0,1.0,3.15,3,1924,1,10,1,...,276.0,247.0,29.0,8.6071,False,False,True,False,False,75
1,TRLH3/33,2014,0,1.0,3.15,2,1924,1,0,4,...,115.0,24.0,91.0,3.5863,False,True,False,False,False,73
2,TRLH3/33,2014,0,1.0,3.15,5,1924,1,0,0,...,11.0,11.0,0.0,0.3430,False,False,False,True,False,157
3,TRLH3/33,2014,0,1.0,3.15,6,1924,0,1,5,...,232.0,213.0,19.0,7.2349,True,False,False,False,True,6
4,TRLH3/33,2014,0,1.0,3.15,4,1924,1,8,0,...,282.0,250.0,32.0,8.7942,False,False,False,False,False,289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119728,LOLTMNT03_161676,2024,0,2.0,14.21,10,1925,1,1,4,...,44.0,44.0,0.0,1.3714,True,False,False,True,False,18
119729,LOLTMNT03_162490,2024,0,2.0,14.21,10,2826,0,0,7,...,59.0,55.0,4.0,1.2527,True,False,False,True,False,1
119730,LOLTMNT03_162500,2024,0,1.0,14.21,10,2928,0,2,6,...,59.0,59.0,0.0,1.2090,True,False,False,True,False,18
119731,LOLTMNT03_161683,2024,0,2.0,14.21,10,1868,0,1,3,...,37.0,37.0,0.0,1.1884,True,False,False,True,False,1


In [67]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119733 entries, 0 to 119732
Data columns (total 42 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   gameid                    119733 non-null  object 
 1   year                      119733 non-null  int64  
 2   playoffs                  119733 non-null  int64  
 3   game                      119733 non-null  float64
 4   patch                     119733 non-null  float64
 5   participantid             119733 non-null  int64  
 6   gamelength                119733 non-null  int64  
 7   result                    119733 non-null  int64  
 8   kills                     119733 non-null  int64  
 9   deaths                    119733 non-null  int64  
 10  assists                   119733 non-null  int64  
 11  teamkills                 119733 non-null  int64  
 12  teamdeaths                119733 non-null  int64  
 13  team kpm                  119733 non-null  f

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Assuming 'X' is your DataFrame with features and 'Total' is the target
# Prepare features (X) and target (y)
y = df_s['Total']  # Set target as 'Total'
X = X.drop(columns=['Total', 'gameid'])  # Exclude target ('Total') and game identifier ('gameid')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
model = XGBRegressor(n_estimators=200, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("MSE:", mse)
print("R²:", r2)


KeyError: "['Total'] not in index"