In [1]:
import pandas as pd

DATA_PATH = "LoLesports_data/"

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single consistently inferred dtype.
df = pd.read_csv(f"{DATA_PATH}base_data_forreal.csv", low_memory=False)

# Time-based split on the numeric patch version:
#   train = patches 12.01 through 14.05 (both ends inclusive)
#   test  = patch 14.06 and later
patch = df["patch"]
in_train_window = patch.between(12.01, 14.05)
in_test_window = patch >= 14.06

train = df[in_train_window]
test = df[in_test_window]

train.shape, test.shape

((60000, 156), (14004, 156))

# 팀, 선수 데이터 분리

In [2]:
# Rows whose position is "team" hold team-level aggregates; every other row is
# an individual player. Build each mask once and reuse it for both halves.
is_team_row_train = train["position"] == "team"
is_team_row_test = test["position"] == "team"

teams_train = train[is_team_row_train].reset_index(drop=True)
players_train = train[~is_team_row_train].reset_index(drop=True)

teams_test = test[is_team_row_test].reset_index(drop=True)
players_test = test[~is_team_row_test].reset_index(drop=True)

teams_train.shape, players_train.shape, teams_test.shape, players_test.shape

((10000, 156), (50000, 156), (2334, 156), (11670, 156))

In [3]:
# Preview the first rows of the team-level training frame.
teams_train.head()

Unnamed: 0,gameid,league,split,playoffs,date,game,patch,participantid,side,position,...,opp_csat25,golddiffat25,xpdiffat25,csdiffat25,killsat25,assistsat25,deathsat25,opp_killsat25,opp_assistsat25,opp_deathsat25
0,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 6:20,1,12.01,100,Blue,team,...,,,,,,,,,,
1,ESPORTSTMNT01_2700815,LCK,Spring,0,2022-01-12 6:20,1,12.01,200,Red,team,...,,,,,,,,,,
2,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 9:02,2,12.01,100,Blue,team,...,,,,,,,,,,
3,ESPORTSTMNT01_2690695,LCK,Spring,0,2022-01-12 9:02,2,12.01,200,Red,team,...,,,,,,,,,,
4,ESPORTSTMNT01_2690705,LCK,Spring,0,2022-01-12 10:07,1,12.01,100,Blue,team,...,,,,,,,,,,


# 결측치 처리

## 팀 데이터 결측치 처리

In [4]:
# Stat prefixes of the "<stat>at20" / "<stat>at25" snapshot columns.
timeline_stats = [
    "gold", "xp", "cs",
    "opp_gold", "opp_xp", "opp_cs",
    "golddiff", "xpdiff", "csdiff",
    "kills", "assists", "deaths",
    "opp_kills", "opp_assists", "opp_deaths",
]

# Columns removed from the team-level rows before any imputation.
drop_teams_cols = [
    "participantid",
    "playername",
    "champion",
    "position",
    "teamkills",
    "teamdeaths",
    "firstbloodkill",
    "firstbloodassist",
    "firstbloodvictim",
    "dragons (type unknown)",
    "damageshare",
    "earnedgoldshare",
    "total cs",
    "monsterkillsownjungle",
    "monsterkillsenemyjungle",
    # All 20-minute snapshot columns, followed by all 25-minute ones.
    *[f"{stat}at{minute}" for minute in (20, 25) for stat in timeline_stats],
]

# Numeric columns whose missing values are replaced with 0.
fill_zero_cols = ["void_grubs", "opp_void_grubs"]

# Categorical columns whose missing values are replaced with "unknown".
fill_unknown_cols = ["split"] + [f"ban{slot}" for slot in range(1, 6)]

# Apply the same cleaning steps to the train and test frames. The steps on
# one frame do not depend on the other, so they are run frame by frame.
for frame in (teams_train, teams_test):
    frame.drop(columns=drop_teams_cols, inplace=True)
    frame[fill_zero_cols] = frame[fill_zero_cols].fillna(0)
    frame[fill_unknown_cols] = frame[fill_unknown_cols].fillna("unknown")
    # Rows without a first pick are discarded; the index keeps its gaps.
    frame.dropna(subset=["pick1"], inplace=True)

In [5]:
# Teams treated as LPL teams. Their missing values are filled from LCK
# averages rather than from their own history (presumably because these stats
# are missing for all of their games; confirm against the raw data).
lpl_team = [
    "Anyone's Legend",
    "Bilibili Gaming",
    "EDward Gaming",
    "FunPlus Phoenix",
    "Invictus Gaming",
    "JD Gaming",
    "LGD Gaming",
    "LNG Esports",
    "Oh My God",
    "Rare Atom",
    "Royal Never Give Up",
    "Team WE",
    "Top Esports",
    "ThunderTalk Gaming",
    "Ultra Prime",
    "Weibo Gaming",
    "Ninjas in Pyjamas"
]

# Columns that still contain nulls, and the rows that have a null in any of them.
missing_columns = teams_train.columns[teams_train.isnull().sum() > 0]
null_samples = teams_train[teams_train[missing_columns].isnull().any(axis=1)]

# "Previous games" must be chosen by match time. `gameid` is an opaque string
# and does not follow time (in this data ESPORTSTMNT01_2700815 was played
# before ESPORTSTMNT01_2690695), so the parsed `date` column is used instead.
# The timestamps live in a separate Series so the saved frame is unchanged.
game_dates = pd.to_datetime(teams_train["date"])

for idx, row in null_samples.iterrows():
    # Earlier games of the same team, most recent first.
    is_earlier_team_game = (teams_train["teamname"] == row["teamname"]) & (
        game_dates < game_dates.loc[idx]
    )
    history_index = game_dates[is_earlier_team_game].sort_values(ascending=False).index
    team_history = teams_train.loc[history_index]

    for col in missing_columns:
        if pd.isnull(row[col]):
            if row["teamname"] in lpl_team:
                # LPL team: fill with the LCK mean of this column.
                lck_mean = teams_train[teams_train["league"] == "LCK"][col].mean()
                teams_train.loc[idx, col] = lck_mean
            else:
                # Non-LPL team: fill with the mean of the team's previous 5 games.
                prev_5_mean = team_history[col].head(5).mean()
                if pd.notnull(prev_5_mean):
                    teams_train.loc[idx, col] = prev_5_mean
                else:
                    # No usable previous games: fall back to the team's overall mean.
                    team_mean = teams_train[teams_train["teamname"] == row["teamname"]][col].mean()
                    teams_train.loc[idx, col] = team_mean

In [6]:
# Shape of the team-level training frame at this point in the notebook.
teams_train.shape

(9913, 111)

In [7]:
# Total number of null cells left in the team-level training frame.
teams_train.isnull().sum().sum()

0

In [8]:
# Columns of the test frame that still contain nulls, and the affected rows.
missing_columns = teams_test.columns[teams_test.isnull().sum() > 0]
null_samples = teams_test[teams_test[missing_columns].isnull().any(axis=1)]

# All fill values come from the training frame only. "Previous games" are
# chosen by parsed match time, because `gameid` is an opaque string that does
# not follow time (ESPORTSTMNT01_2700815 was played before
# ESPORTSTMNT01_2690695), so comparing gameids picks the wrong history.
train_game_dates = pd.to_datetime(teams_train["date"])
test_game_dates = pd.to_datetime(teams_test["date"])

for idx, row in null_samples.iterrows():
    # Training games of the same team played before this test game, newest first.
    is_earlier_team_game = (teams_train["teamname"] == row["teamname"]) & (
        train_game_dates < test_game_dates.loc[idx]
    )
    history_index = train_game_dates[is_earlier_team_game].sort_values(ascending=False).index
    team_history = teams_train.loc[history_index]

    for col in missing_columns:
        if pd.isnull(row[col]):
            if row["teamname"] in lpl_team:
                # LPL team: fill with the LCK mean from the training data.
                lck_mean = teams_train[teams_train["league"] == "LCK"][col].mean()
                teams_test.loc[idx, col] = lck_mean
            else:
                # Non-LPL team: mean of the team's previous 5 training games.
                prev_5_mean = team_history[col].head(5).mean()
                if pd.notnull(prev_5_mean):
                    teams_test.loc[idx, col] = prev_5_mean
                else:
                    # No usable previous games: the team's overall training mean.
                    team_mean = teams_train[teams_train["teamname"] == row["teamname"]][col].mean()
                    teams_test.loc[idx, col] = team_mean

In [9]:
# Total number of null cells left in the team-level test frame.
teams_test.isnull().sum().sum()

0

## 선수 데이터 결측치 처리

In [10]:
# Stat prefixes of the "<stat>at20" / "<stat>at25" snapshot columns.
timeline_stats = [
    "gold", "xp", "cs",
    "opp_gold", "opp_xp", "opp_cs",
    "golddiff", "xpdiff", "csdiff",
    "kills", "assists", "deaths",
    "opp_kills", "opp_assists", "opp_deaths",
]

# Columns removed from the player-level rows before any imputation.
drop_players_cols = [
    "participantid",
    *[f"pick{slot}" for slot in range(1, 6)],
    "teamkills",
    "teamdeaths",
    # Dragon-related columns.
    "firstdragon",
    "dragons",
    "opp_dragons",
    "elementaldrakes",
    "opp_elementaldrakes",
    "infernals",
    "mountains",
    "clouds",
    "oceans",
    "chemtechs",
    "hextechs",
    "dragons (type unknown)",
    "elders",
    "opp_elders",
    # Herald, void grub and baron columns.
    "firstherald",
    "heralds",
    "opp_heralds",
    "void_grubs",
    "opp_void_grubs",
    "firstbaron",
    # Tower columns.
    "firsttower",
    "towers",
    "opp_towers",
    "firstmidtower",
    "firsttothreetowers",
    "turretplates",
    "opp_turretplates",
    "gspd",
    "gpr",
    "monsterkillsownjungle",
    "monsterkillsenemyjungle",
    # All 20-minute snapshot columns, followed by all 25-minute ones.
    *[f"{stat}at{minute}" for minute in (20, 25) for stat in timeline_stats],
]

# Categorical columns whose missing values are replaced with "unknown".
fill_unknown_cols = ["split"] + [f"ban{slot}" for slot in range(1, 6)]

# Apply the same cleaning steps to the train and test frames.
for frame in (players_train, players_test):
    frame.drop(columns=drop_players_cols, inplace=True)
    frame[fill_unknown_cols] = frame[fill_unknown_cols].fillna("unknown")

In [11]:
# Columns that still contain nulls, and the rows that have a null in any of them.
missing_columns = players_train.columns[players_train.isnull().sum() > 0]
null_samples = players_train[players_train[missing_columns].isnull().any(axis=1)]

# "Previous games" must be chosen by match time. `gameid` is an opaque string
# and does not follow time (ESPORTSTMNT01_2700815 was played before
# ESPORTSTMNT01_2690695), so the parsed `date` column is used instead.
# The timestamps live in a separate Series so the saved frame is unchanged.
game_dates = pd.to_datetime(players_train["date"])

for idx, row in null_samples.iterrows():
    # Earlier games of the same player, most recent first.
    is_earlier_player_game = (players_train["playername"] == row["playername"]) & (
        game_dates < game_dates.loc[idx]
    )
    history_index = game_dates[is_earlier_player_game].sort_values(ascending=False).index
    player_history = players_train.loc[history_index]

    for col in missing_columns:
        if pd.isnull(row[col]):
            # First choice: mean of the player's previous 5 games.
            prev_5_mean = player_history[col].head(5).mean()
            if pd.notnull(prev_5_mean):
                players_train.loc[idx, col] = prev_5_mean
            else:
                # Fallback: mean of this column over all players in the same position.
                position_mean = players_train[
                    (players_train["position"] == row["position"])
                ][col].mean()
                players_train.loc[idx, col] = position_mean

In [12]:
# Total number of null cells left in the player-level training frame.
players_train.isna().sum().sum()

0

In [13]:
# Columns of the test frame that still contain nulls, and the affected rows.
missing_columns = players_test.columns[players_test.isnull().sum() > 0]
null_samples = players_test[players_test[missing_columns].isnull().any(axis=1)]

# All fill values come from the training frame only. "Previous games" are
# chosen by parsed match time, because `gameid` is an opaque string that does
# not follow time (ESPORTSTMNT01_2700815 was played before
# ESPORTSTMNT01_2690695), so comparing gameids picks the wrong history.
train_game_dates = pd.to_datetime(players_train["date"])
test_game_dates = pd.to_datetime(players_test["date"])

for idx, row in null_samples.iterrows():
    # Training games of the same player played before this test game, newest first.
    is_earlier_player_game = (players_train["playername"] == row["playername"]) & (
        train_game_dates < test_game_dates.loc[idx]
    )
    history_index = train_game_dates[is_earlier_player_game].sort_values(ascending=False).index
    player_history = players_train.loc[history_index]

    for col in missing_columns:
        if pd.isnull(row[col]):
            # First choice: mean of the player's previous 5 training games.
            prev_5_mean = player_history[col].head(5).mean()
            if pd.notnull(prev_5_mean):
                players_test.loc[idx, col] = prev_5_mean
            else:
                # Fallback: training mean of this column for the same position.
                position_mean = players_train[
                    (players_train["position"] == row["position"])
                ][col].mean()
                players_test.loc[idx, col] = position_mean

In [14]:
# Total number of null cells left in the player-level test frame.
players_test.isna().sum().sum()

0

In [15]:
# Identifier columns plus the `result` label, per granularity.
team_target_cols = ["gameid", "teamname", "result"]
player_target_cols = ["gameid", "playername", "result"]

teams_train_target = teams_train.loc[:, team_target_cols]
teams_test_target = teams_test.loc[:, team_target_cols]

players_train_target = players_train.loc[:, player_target_cols]
players_test_target = players_test.loc[:, player_target_cols]

# File stem -> frame, in the order the files are written.
# Each frame is saved as "<DATA_PATH><stem>.csv" without the index column.
frames_to_save = {
    "teams_train": teams_train,
    "teams_test": teams_test,
    "players_train": players_train,
    "players_test": players_test,
    "teams_train_target": teams_train_target,
    "teams_test_target": teams_test_target,
    "players_train_target": players_train_target,
    "players_test_target": players_test_target,
}

for stem, frame in frames_to_save.items():
    frame.to_csv(f"{DATA_PATH}{stem}.csv", index=False)