# 데이터 불러오기

In [5]:
# Directory prefix for every input CSV. The trailing slash matters because the
# file names are appended to it directly inside f-strings when loading.
DATA_PATH = "LoLesports_data/"
# Seed constant; it is not referenced by any cell visible in this section.
SEED = 42

In [6]:
import os

import numpy as np
import pandas as pd

In [7]:
# Team-level tables, read in this order: train features, train target, test features.
team_train_tr, team_train_target, team_test_tr = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in (
        "teams_train(analyze).csv",
        "teams_train_target.csv",
        "teams_test(analyze).csv",
    )
)

# Player-level tables (train, then test) from the same directory.
player_train_tr, player_test_tr = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ("players_train(analyze).csv", "players_test(analyze).csv")
)

In [8]:
team_train_tr.shape, team_test_tr.shape, player_train_tr.shape, player_test_tr.shape

((10000, 156), (2334, 111), (50000, 87), (11670, 87))

In [9]:
team_train_target.shape

(10000, 3)

In [10]:
# Distinct team names found on the rows of the raw team test table whose league is 'LCK'.
is_lck_row = team_test_tr['league'] == 'LCK'
lck_teams = team_test_tr.loc[is_lck_row, 'teamname'].unique()
lck_teams

array(['Hanwha Life Esports', 'Kwangdong Freecs', 'KT Rolster',
       'Dplus KIA', 'Gen.G', 'T1', 'Nongshim RedForce', 'BNK FEARX',
       'OKSavingsBank BRION', 'DRX'], dtype=object)

# 데이터 전처리

##  team

In [11]:
# Columns to remove from the raw team training table (consumed by the
# `drop(columns=drop_list_team)` call that follows this list).

# Columns that are listed individually.
_base_drop_cols = [
    "participantid",
    "playername",
    'position',
    "champion",
    "teamkills",
    "teamdeaths",
    "firstbloodkill",
    "firstbloodassist",
    "firstbloodvictim",
    "dragons (type unknown)",
    "damageshare",
    "earnedgoldshare",
    "total cs",
    "monsterkillsownjungle",
    "monsterkillsenemyjungle",
]

# Name stems that each exist as a "<stem>at20" and a "<stem>at25" column.
_snapshot_stems = [
    "gold", "xp", "cs",
    "opp_gold", "opp_xp", "opp_cs",
    "golddiff", "xpdiff", "csdiff",
    "kills", "assists", "deaths",
    "opp_kills", "opp_assists", "opp_deaths",
]

# Final list: the individual columns, then all *at20 columns, then all *at25 columns.
drop_list_team = _base_drop_cols + [
    f"{stem}at{minute}" for minute in (20, 25) for stem in _snapshot_stems
]

# Remove the columns named in drop_list_team from the raw training table.
# drop() returns a new frame, so team_train_tr itself is left unchanged.
team_train = team_train_tr.drop(columns=drop_list_team)
# No columns are dropped from the test table here; it is only copied so that
# later modifications do not touch team_test_tr.
team_test = team_test_tr.copy()

In [12]:
# Columns whose missing values are replaced by 0.
fill_zero_cols = ["void_grubs", "opp_void_grubs"]

# Columns whose missing values are replaced by the literal string "unknown".
fill_unknown_cols = ["split"] + [f"ban{slot}" for slot in range(1, 6)]

# The same three cleaning steps are applied to each team table independently.
for team_frame in (team_train, team_test):
    team_frame[fill_zero_cols] = team_frame[fill_zero_cols].fillna(0)
    team_frame[fill_unknown_cols] = team_frame[fill_unknown_cols].fillna("unknown")
    # Rows that have no value in 'pick1' are removed in place.
    team_frame.dropna(subset=['pick1'], inplace=True)

In [None]:
# Team names that the imputation cells treat as LPL teams; membership is
# checked there with `row["teamname"] in lpl_team`.
# NOTE(review): this list is hard-coded — confirm it matches the team names
# actually present in the data.
lpl_team = [
    "Anyone's Legend",
    "Bilibili Gaming",
    "EDward Gaming",
    "FunPlus Phoenix",
    "Invictus Gaming",
    "JD Gaming",
    "LGD Gaming",
    "LNG Esports",
    "Oh My God",
    "Rare Atom",
    "Royal Never Give Up",
    "Team WE",
    "Top Esports",
    "ThunderTalk Gaming",
    "Ultra Prime",
    "Weibo Gaming",
    "Ninjas in Pyjamas"
]

In [None]:
# Impute the remaining missing values of team_train, one affected row at a time.
#   * team listed in lpl_team -> mean of the column over LCK rows of team_train
#   * any other team          -> mean over that team's 5 most recent earlier games,
#                                falling back to the team's mean over all its games
# Rows are filled in place, so values imputed earlier in the loop can take part
# in the means computed for later rows.
missing_columns = team_train.columns[team_train.isnull().sum() > 0]
null_samples = team_train[team_train[missing_columns].isnull().any(axis=1)]

# Game ids are opaque strings (e.g. "ESPORTSTMNT01_2700815", "LOLTMNT01_82652"),
# so comparing or sorting them lexicographically does not give chronological
# order. "Earlier games" are therefore resolved through the match date.
team_train_dates = pd.to_datetime(team_train["date"])

for idx, row in null_samples.iterrows():
    is_lpl_team = row["teamname"] in lpl_team
    is_same_team = team_train["teamname"] == row["teamname"]
    is_earlier = team_train_dates < pd.to_datetime(row["date"])
    # Index labels of this team's earlier games, most recent first.
    history_idx = (
        team_train_dates[is_same_team & is_earlier]
        .sort_values(ascending=False)
        .index
    )

    for col in missing_columns:
        if not pd.isnull(row[col]):
            continue

        if is_lpl_team:
            # LPL team: use the LCK-wide mean of the column.
            fill_value = team_train.loc[team_train["league"] == "LCK", col].mean()
        else:
            # Other teams: mean over the team's 5 most recent earlier games.
            fill_value = team_train.loc[history_idx[:5], col].mean()
            if pd.isnull(fill_value):
                # No usable earlier games: fall back to the team's overall mean.
                fill_value = team_train.loc[is_same_team, col].mean()

        team_train.loc[idx, col] = fill_value

In [None]:
team_train.isnull().sum().sum()

0

In [None]:
# Impute the remaining missing values of team_test using statistics taken from
# team_train only (team_train is read here, never modified).
#   * team listed in lpl_team -> mean of the column over LCK rows of team_train
#   * any other team          -> mean over that team's 5 most recent training games
#                                played before the test game, falling back to the
#                                team's mean over all its training games
missing_columns = team_test.columns[team_test.isnull().sum() > 0]
null_samples = team_test[team_test[missing_columns].isnull().any(axis=1)]

# Game ids are opaque strings (e.g. "LOLTMNT02_66852", "LOLTMNT05_90307"), so
# comparing or sorting them lexicographically does not give chronological
# order. "Earlier games" are therefore resolved through the match date.
team_train_dates = pd.to_datetime(team_train["date"])

for idx, row in null_samples.iterrows():
    is_lpl_team = row["teamname"] in lpl_team
    is_same_team = team_train["teamname"] == row["teamname"]
    is_earlier = team_train_dates < pd.to_datetime(row["date"])
    # Index labels of this team's earlier training games, most recent first.
    history_idx = (
        team_train_dates[is_same_team & is_earlier]
        .sort_values(ascending=False)
        .index
    )

    for col in missing_columns:
        if not pd.isnull(row[col]):
            continue

        if is_lpl_team:
            # LPL team: use the LCK-wide mean of the column in the training data.
            fill_value = team_train.loc[team_train["league"] == "LCK", col].mean()
        else:
            # Other teams: mean over the team's 5 most recent earlier training games.
            fill_value = team_train.loc[history_idx[:5], col].mean()
            if pd.isnull(fill_value):
                # No usable earlier games: fall back to the team's overall training mean.
                fill_value = team_train.loc[is_same_team, col].mean()

        team_test.loc[idx, col] = fill_value

In [None]:
team_test.isnull().sum().sum()

0

In [None]:
team_train.shape, team_test.shape

((9913, 111), (2324, 111))

## player

In [None]:
# Work on copies so the raw player tables (player_train_tr / player_test_tr) stay untouched.
player_train = player_train_tr.copy()
player_test = player_test_tr.copy()

In [None]:
# Columns whose missing entries are replaced by the literal string "unknown".
fill_unknown_cols = ["split"] + [f"ban{slot}" for slot in range(1, 6)]

# Same replacement on both player tables.
for player_frame in (player_train, player_test):
    player_frame[fill_unknown_cols] = player_frame[fill_unknown_cols].fillna("unknown")


In [None]:
player_train.shape, player_test.shape

((50000, 87), (11670, 87))

In [None]:
# Impute the remaining missing values of player_train, one affected row at a time:
# mean over the player's 5 most recent earlier games, falling back to the mean of
# the column over all rows with the same position.
# Rows are filled in place, so values imputed earlier in the loop can take part
# in the means computed for later rows.
missing_columns = player_train.columns[player_train.isnull().sum() > 0]
null_samples = player_train[player_train[missing_columns].isnull().any(axis=1)]

# Game ids are opaque strings (e.g. "ESPORTSTMNT01_2700815", "LOLTMNT01_82652"),
# so comparing or sorting them lexicographically does not give chronological
# order. "Earlier games" are therefore resolved through the match date.
player_train_dates = pd.to_datetime(player_train["date"])

for idx, row in null_samples.iterrows():
    is_same_player = player_train["playername"] == row["playername"]
    is_earlier = player_train_dates < pd.to_datetime(row["date"])
    # Index labels of this player's earlier games, most recent first.
    history_idx = (
        player_train_dates[is_same_player & is_earlier]
        .sort_values(ascending=False)
        .index
    )

    for col in missing_columns:
        if not pd.isnull(row[col]):
            continue

        fill_value = player_train.loc[history_idx[:5], col].mean()
        if pd.isnull(fill_value):
            # No usable earlier games: use the mean over the same position.
            fill_value = player_train.loc[
                player_train["position"] == row["position"], col
            ].mean()

        player_train.loc[idx, col] = fill_value

In [None]:
player_train.isnull().sum().sum()

0

In [None]:
# Impute the remaining missing values of player_test using statistics taken from
# player_train only (player_train is read here, never modified): mean over the
# player's 5 most recent training games played before the test game, falling back
# to the training mean of the column over all rows with the same position.
missing_columns = player_test.columns[player_test.isnull().sum() > 0]
null_samples = player_test[player_test[missing_columns].isnull().any(axis=1)]

# Game ids are opaque strings (e.g. "LOLTMNT02_66852", "LOLTMNT05_90307"), so
# comparing or sorting them lexicographically does not give chronological
# order. "Earlier games" are therefore resolved through the match date.
player_train_dates = pd.to_datetime(player_train["date"])

for idx, row in null_samples.iterrows():
    is_same_player = player_train["playername"] == row["playername"]
    is_earlier = player_train_dates < pd.to_datetime(row["date"])
    # Index labels of this player's earlier training games, most recent first.
    history_idx = (
        player_train_dates[is_same_player & is_earlier]
        .sort_values(ascending=False)
        .index
    )

    for col in missing_columns:
        if not pd.isnull(row[col]):
            continue

        fill_value = player_train.loc[history_idx[:5], col].mean()
        if pd.isnull(fill_value):
            # No usable earlier games: use the training mean over the same position.
            fill_value = player_train.loc[
                player_train["position"] == row["position"], col
            ].mean()

        player_test.loc[idx, col] = fill_value

In [None]:
player_test.isnull().sum().sum()

0

In [None]:
player_train.shape, player_test.shape

((50000, 87), (11670, 87))

# 특성추가

## team

In [None]:
# Parse 'date' into datetimes and derive a 'year' column on both team tables.
for team_frame in (team_train, team_test):
    team_frame["date"] = pd.to_datetime(team_frame["date"])
    team_frame["year"] = team_frame["date"].dt.year

In [None]:
team_train.shape, team_test.shape

((9913, 112), (2324, 112))

## player

In [None]:
def calculate_game_kda(df):
    """Add a per-row 'kda' column to ``df`` and return a four-column view of it.

    KDA is ``(kills + assists) / deaths``, where a death count of 0 is replaced
    by 1 before dividing. Any positive-infinity result is then replaced by 0.

    Note that ``df`` is modified in place: the 'kda' column is assigned on the
    frame that was passed in.

    Returns:
        ``df[['gameid', 'playername', 'champion', 'kda']]``.
    """
    takedowns = df['kills'] + df['assists']
    # Treat zero deaths as one death so the division is always defined.
    safe_deaths = df['deaths'].replace(0, 1)
    raw_kda = takedowns / safe_deaths
    # Replace +inf with 0.
    df['kda'] = raw_kda.replace(np.inf, 0)
    output_cols = ['gameid', 'playername', 'champion', 'kda']
    return df[output_cols]

# Besides returning the four-column KDA table, this call also adds a 'kda'
# column to player_train itself (calculate_game_kda assigns df['kda'] on its argument).
player_train_game_kda = calculate_game_kda(player_train)
player_train_game_kda

Unnamed: 0,gameid,playername,champion,kda
0,ESPORTSTMNT01_2700815,Kingen,Graves,0.200000
1,ESPORTSTMNT01_2700815,Pyosik,Viego,1.000000
2,ESPORTSTMNT01_2700815,Zeka,Ryze,0.000000
3,ESPORTSTMNT01_2700815,Deft,Aphelios,2.500000
4,ESPORTSTMNT01_2700815,BeryL,Sona,2.500000
...,...,...,...,...
49995,LOLTMNT01_82652,Azhi,Rumble,15.000000
49996,LOLTMNT01_82652,JunJia,Vi,4.333333
49997,LOLTMNT01_82652,Maple,Taliyah,14.000000
49998,LOLTMNT01_82652,Betty,Zeri,6.000000


In [None]:
# calculate_game_kda is already defined in the preceding cell, so the identical
# duplicate definition is not repeated here.
# The test-set result gets its own name; storing it in player_train_game_kda
# would overwrite the train-set KDA table with test-set rows.
# As a side effect the call adds the 'kda' column to player_test in place.
player_test_game_kda = calculate_game_kda(player_test)
player_test_game_kda

Unnamed: 0,gameid,playername,champion,kda
0,LOLTMNT02_66852,Doran,Aatrox,5.666667
1,LOLTMNT02_66852,Peanut,Maokai,19.000000
2,LOLTMNT02_66852,Zeka,Corki,21.000000
3,LOLTMNT02_66852,Viper,Zeri,9.500000
4,LOLTMNT02_66852,Delight,Alistar,7.333333
...,...,...,...,...
11665,LOLTMNT05_90307,Zeus,Gragas,8.000000
11666,LOLTMNT05_90307,Oner,Xin Zhao,9.000000
11667,LOLTMNT05_90307,Faker,Galio,10.000000
11668,LOLTMNT05_90307,Gumayusi,Xayah,6.000000


In [None]:
player_train.shape, player_test.shape

((50000, 88), (11670, 88))

In [None]:
# Parse 'date' into datetimes and derive a 'year' column on both player tables.
for player_frame in (player_train, player_test):
    player_frame["date"] = pd.to_datetime(player_frame["date"])
    player_frame["year"] = player_frame["date"].dt.year

In [None]:
player_train.shape, player_test.shape

((50000, 89), (11670, 89))

In [None]:
team_train.shape, team_test.shape, player_train.shape, player_test.shape

((9913, 112), (2324, 112), (50000, 89), (11670, 89))

# train+test

- 원래는 모델링 방향으로 진행하려 하였으나, 모델링이 필요하지 않다는 판단하에 train과 test로 나누었던 데이터를 합쳐 EDA를 진행

In [None]:
# Stack the preprocessed train and test team tables (train rows first) into one
# frame; ignore_index=True gives the result a fresh 0..n-1 index.
combined_team_df = pd.concat([team_train, team_test], ignore_index=True)

combined_team_df.shape

(12237, 112)

In [None]:
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
os.makedirs("output", exist_ok=True)
# Write the combined team table without the index column.
combined_team_df.to_csv("output/team.csv", index=False)

In [None]:
# Stack the preprocessed train and test player tables (train rows first) into one
# frame; ignore_index=True gives the result a fresh 0..n-1 index.
combined_player_df = pd.concat([player_train, player_test], ignore_index=True)

combined_player_df.shape

(61670, 89)

In [None]:
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it is already there).
os.makedirs("output", exist_ok=True)
# Write the combined player table without the index column.
combined_player_df.to_csv("output/player.csv", index=False)