In [256]:
import pandas as pd
import glob

# Directory containing the match-data CSV files, relative to the working directory.
path = "LoLesports_data"

# glob.glob returns matches in arbitrary, filesystem-dependent order, so the
# list is sorted to make the "last five files" selection below (and the row
# order of the concatenated frame) reproducible across machines and runs.
all_files = sorted(glob.glob(path + "/*.csv"))

df_list = []

# Only the last five files (by sorted path) are loaded.
for file in all_files[-5:]:
    temp_df = pd.read_csv(file, low_memory=False)
    df_list.append(temp_df)

df = pd.concat(df_list, ignore_index=True)

# Keep rows flagged as complete, then drop the flag column and the url column.
# A non-inplace drop is used because the filtered frame is a slice of the
# concatenated one; dropping in place on such a slice can emit
# SettingWithCopyWarning.
df = df[df["datacompleteness"] == "complete"].drop(
    columns=["datacompleteness", "url"]
)

print(df.head())
print(len(df))

                  gameid league  year split  playoffs                 date  \
0  ESPORTSTMNT03/1241318  KeSPA  2020   NaN         0  2020-01-03 07:33:26   
1  ESPORTSTMNT03/1241318  KeSPA  2020   NaN         0  2020-01-03 07:33:26   
2  ESPORTSTMNT03/1241318  KeSPA  2020   NaN         0  2020-01-03 07:33:26   
3  ESPORTSTMNT03/1241318  KeSPA  2020   NaN         0  2020-01-03 07:33:26   
4  ESPORTSTMNT03/1241318  KeSPA  2020   NaN         0  2020-01-03 07:33:26   

   game  patch  participantid  side  ... opp_csat25 golddiffat25 xpdiffat25  \
0   1.0   9.24              1  Blue  ...        NaN          NaN        NaN   
1   1.0   9.24              2  Blue  ...        NaN          NaN        NaN   
2   1.0   9.24              3  Blue  ...        NaN          NaN        NaN   
3   1.0   9.24              4  Blue  ...        NaN          NaN        NaN   
4   1.0   9.24              5  Blue  ...        NaN          NaN        NaN   

  csdiffat25 killsat25 assistsat25 deathsat25 opp_killsa

In [257]:
# Print a summary of the filtered frame: index range, column count, dtype
# breakdown and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 538944 entries, 0 to 658271
Columns: 159 entries, gameid to opp_deathsat25
dtypes: float64(127), int64(11), object(21)
memory usage: 657.9+ MB


In [258]:
# Split the frame on the "position" column: rows whose position is "team"
# go to `teams`, every other row goes to `players`.
is_team_row = df["position"].eq("team")
teams = df[is_team_row]
players = df[~is_team_row]

## 팀 데이터 전처리 (Team data preprocessing)

In [259]:
# Columns removed from the team rows before imputation.
drop_teams_cols = [
    # identity and individual-stat columns
    "playername", "playerid", "position", "champion",
    "firstbloodkill", "firstbloodassist", "firstbloodvictim",
    "damageshare", "earnedgoldshare", "total cs",
    # *at20 columns
    "goldat20", "xpat20", "csat20",
    "opp_goldat20", "opp_xpat20", "opp_csat20",
    "golddiffat20", "xpdiffat20", "csdiffat20",
    "killsat20", "assistsat20", "deathsat20",
    "opp_killsat20", "opp_assistsat20", "opp_deathsat20",
    # *at25 columns
    "goldat25", "xpat25", "csat25",
    "opp_goldat25", "opp_xpat25", "opp_csat25",
    "golddiffat25", "xpdiffat25", "csdiffat25",
    "killsat25", "assistsat25", "deathsat25",
    "opp_killsat25", "opp_assistsat25", "opp_deathsat25",
    # jungle monster kill columns
    "monsterkillsownjungle", "monsterkillsenemyjungle",
]

# Columns whose missing values are replaced with 0.
fill_zero_cols = [
    "elementaldrakes", "opp_elementaldrakes",
    "chemtechs", "hextechs", "dragons (type unknown)",
    "void_grubs", "opp_void_grubs",
    "turretplates", "opp_turretplates",
    "quadrakills", "pentakills",
    "heralds", "opp_heralds",
    "firstblood", "firstdragon",
    "dragons", "opp_dragons",
    "infernals", "mountains", "clouds", "oceans",
    "elders", "opp_elders",
    "firstherald", "firstbaron",
    "barons", "opp_barons",
    "firsttower", "firstmidtower",
    "inhibitors", "opp_inhibitors",
]

# Columns whose missing values are replaced with the string "unknown".
fill_unknown_cols = [
    "gameid", "game", "teamname", "teamid",
    "ban1", "ban2", "ban3", "ban4", "ban5",
    "pick1", "pick2", "pick3", "pick4", "pick5",
    "split",
]

# Work on a new frame so the `teams` slice itself is left unchanged, then
# apply both constant fills (zeros first, then "unknown").
tmp = teams.drop(drop_teams_cols, axis="columns")
for cols, filler in ((fill_zero_cols, 0), (fill_unknown_cols, "unknown")):
    tmp[cols] = tmp[cols].fillna(filler)

In [260]:
# Columns that still contain at least one NaN after the constant fills.
missing_columns = tmp.columns[tmp.isnull().sum() > 0]
# Per-team mean of each of those columns, computed on `teams` (the rows
# before any filling). groupby leaves out rows whose teamid is NaN, so the
# "unknown" placeholder used in `tmp` normally has no entry here.
means = teams.groupby("teamid")[missing_columns].mean()

for col in missing_columns:
    # Fallback value when no usable per-team mean exists.
    total_mean = teams[col].mean()

    # Vectorised replacement for the former row-wise DataFrame.apply(axis=1),
    # which invoked a Python lambda once per row for every column.
    # Series.map yields NaN for a teamid absent from `means` and passes through
    # a NaN team mean, so chaining the two fillna calls gives the same
    # priority as before: existing value -> team mean -> overall mean.
    team_mean = tmp["teamid"].map(means[col])
    tmp[col] = tmp[col].fillna(team_mean).fillna(total_mean)

In [261]:
# Total number of missing cells left in the team frame.
tmp.isna().sum().sum()

0

In [262]:
# Rebind `teams` to the cleaned frame (same object as `tmp`, no copy is made).
# From here on `teams` no longer has the columns in drop_teams_cols, so
# re-running the team preprocessing cell without first re-creating `teams`
# from `df` makes its DataFrame.drop call raise KeyError.
teams = tmp
teams.shape

(89824, 117)

## 선수 데이터 전처리 (Player data preprocessing)

In [263]:
# (rows, columns) of the player rows before any preprocessing.
players.shape

(449120, 159)

In [264]:
# Columns removed from the player rows before imputation.
drop_players_cols = [
    # pick columns
    "pick1", "pick2", "pick3", "pick4", "pick5",
    # dragon-related columns
    "firstdragon", "dragons", "opp_dragons",
    "elementaldrakes", "opp_elementaldrakes",
    "infernals", "mountains", "clouds", "oceans",
    "chemtechs", "hextechs",
    "elders", "opp_elders",
    # herald / void grub / baron columns
    "firstherald", "heralds", "opp_heralds",
    "void_grubs", "opp_void_grubs",
    "firstbaron",
    # tower-related columns
    "firsttower", "towers", "opp_towers",
    "firstmidtower", "firsttothreetowers",
    "turretplates", "opp_turretplates",
    "gspd", "gpr",
    # *at20 columns
    "goldat20", "xpat20", "csat20",
    "opp_goldat20", "opp_xpat20", "opp_csat20",
    "golddiffat20", "xpdiffat20", "csdiffat20",
    "killsat20", "assistsat20", "deathsat20",
    "opp_killsat20", "opp_assistsat20", "opp_deathsat20",
    # *at25 columns
    "goldat25", "xpat25", "csat25",
    "opp_goldat25", "opp_xpat25", "opp_csat25",
    "golddiffat25", "xpdiffat25", "csdiffat25",
    "killsat25", "assistsat25", "deathsat25",
    "opp_killsat25", "opp_assistsat25", "opp_deathsat25",
    # jungle monster kill columns
    "monsterkillsownjungle", "monsterkillsenemyjungle",
]

# Columns whose missing values are replaced with 0.
# (Rebinds the name used for the team-row list earlier in the notebook.)
fill_zero_cols = [
    "dragons (type unknown)",
    "barons", "opp_barons",
    "quadrakills", "pentakills",
    "firstblood", "firstbloodkill", "firstbloodassist", "firstbloodvictim",
    "inhibitors", "opp_inhibitors",
]

# Columns whose missing values are replaced with the string "unknown".
# (Also rebinds the name used for the team-row list.)
fill_unknown_cols = [
    "split",
    "ban1", "ban2", "ban3", "ban4", "ban5",
    "gameid", "game",
    "playername", "playerid",
    "teamname", "teamid",
]

# Work on a new frame so the `players` slice itself is left unchanged, then
# apply both constant fills (zeros first, then "unknown").
tmp = players.drop(drop_players_cols, axis="columns")
for cols, filler in ((fill_zero_cols, 0), (fill_unknown_cols, "unknown")):
    tmp[cols] = tmp[cols].fillna(filler)

In [265]:
# Columns that still contain at least one NaN after the constant fills.
missing_columns = tmp.columns[tmp.isnull().sum() > 0]
# Per-player mean of each of those columns, computed on `players` (the rows
# before any filling). groupby leaves out rows whose playerid is NaN, so the
# "unknown" placeholder used in `tmp` normally has no entry here.
means = players.groupby("playerid")[missing_columns].mean()

for col in missing_columns:
    # Fallback value when no usable per-player mean exists.
    total_mean = players[col].mean()

    # Vectorised replacement for the former row-wise DataFrame.apply(axis=1),
    # which invoked a Python lambda once per row for every column.
    # Series.map yields NaN for a playerid absent from `means` and passes
    # through a NaN player mean, so chaining the two fillna calls gives the
    # same priority as before: existing value -> player mean -> overall mean.
    player_mean = tmp["playerid"].map(means[col])
    tmp[col] = tmp[col].fillna(player_mean).fillna(total_mean)

In [266]:
# Total number of missing cells left in the player frame.
tmp.isna().sum().sum()

0

In [267]:
# Rebind `players` to the cleaned frame (same object as `tmp`, no copy is made).
# From here on `players` no longer has the columns in drop_players_cols, so
# re-running the player preprocessing cell without first re-creating `players`
# from `df` makes its DataFrame.drop call raise KeyError.
players = tmp
players.shape

(449120, 94)

In [268]:
# Opponent team id: within each gameid group the teamid values are written
# back in reversed row order, so row i receives the teamid of row n-1-i.
# NOTE(review): this yields the opponent only if each game's rows are ordered
# with one side entirely before the other and both sides have the same number
# of rows -- confirm. Rows whose gameid was filled with "unknown" all land in
# a single group, so their value may not be the true opponent.
temp_opp_teams = (
    teams.groupby("gameid")["teamid"]
    .transform(lambda x: x.iloc[::-1].values)
    .to_frame("opp_teamid")
)
# assign() replaces an existing opp_teamid column instead of appending a
# second one, so re-running this cell no longer creates duplicate columns
# (pd.concat(axis=1) did).
teams = teams.assign(opp_teamid=temp_opp_teams["opp_teamid"])

temp_opp_players = (
    players.groupby("gameid")["teamid"]
    .transform(lambda x: x.iloc[::-1].values)
    .to_frame("opp_teamid")
)
players = players.assign(opp_teamid=temp_opp_players["opp_teamid"])

In [270]:
# Preview the first team rows, including the newly added opp_teamid column.
teams.head()

Unnamed: 0,gameid,league,year,split,playoffs,date,game,patch,participantid,side,...,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15,opp_teamid
10,ESPORTSTMNT03/1241318,KeSPA,2020,unknown,0,2020-01-03 07:33:26,1.0,9.24,100,Blue,...,-825.0,-1665.0,-12.0,0.0,0.0,1.0,1.0,2.0,0.0,oe:team:ce499dea30cfce118f4fe85da0227e8
11,ESPORTSTMNT03/1241318,KeSPA,2020,unknown,0,2020-01-03 07:33:26,1.0,9.24,200,Red,...,825.0,1665.0,12.0,1.0,2.0,0.0,0.0,0.0,1.0,oe:team:c75f1f337fc5867914749d438a4871d
22,ESPORTSTMNT03/1241322,KeSPA,2020,unknown,0,2020-01-03 09:00:58,2.0,9.24,100,Blue,...,-5484.0,-4833.0,-56.0,0.0,0.0,6.0,6.0,7.0,0.0,oe:team:c75f1f337fc5867914749d438a4871d
23,ESPORTSTMNT03/1241322,KeSPA,2020,unknown,0,2020-01-03 09:00:58,2.0,9.24,200,Red,...,5484.0,4833.0,56.0,6.0,7.0,0.0,0.0,0.0,6.0,oe:team:ce499dea30cfce118f4fe85da0227e8
34,ESPORTSTMNT03/1241324,KeSPA,2020,unknown,0,2020-01-03 10:05:17,3.0,9.24,100,Blue,...,2566.0,2097.0,53.0,0.0,0.0,0.0,0.0,0.0,0.0,oe:team:c75f1f337fc5867914749d438a4871d
