In [144]:
# Mount Google Drive so the CSV files under DATA_PATH are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Base directory (with trailing slash) that the pd.read_csv calls in this notebook prepend to file names.
DATA_PATH = "/content/drive/MyDrive/datathon/data/"

# Seed value intended for reset_seeds(); NOTE(review): no call to reset_seeds(SEED) is visible in this part of the notebook.
SEED = 42

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [145]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os

def reset_seeds(seed):
    """Seed every random number generator used by this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global RNG and PyTorch (CPU and every CUDA device).

    Notes:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up only, so the
        assignment below is inherited by child processes but does not change
        string hashing in the already-running kernel.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current device; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner can pick different kernels from run to run; keep it off
    # so the deterministic flag above is not undermined.
    torch.backends.cudnn.benchmark = False

In [146]:
# Load the base dataset.
# NOTE(review): `df` is not referenced again in the visible cells, and the echoed source
# line in this cell's output looks like the tail of a pandas warning (possibly a
# DtypeWarning) — confirm, and consider passing dtype= or low_memory=False if so.
df = pd.read_csv(f"{DATA_PATH}base_data_forreal.csv")

  df = pd.read_csv(f"{DATA_PATH}base_data_forreal.csv")


In [147]:
train_tr = pd.read_csv(f"{DATA_PATH}teams_train(analyze).csv") # match-outcome prediction data for training
#train_target = pd.read_csv(f"{DATA_PATH}train_target_1226.csv") # ground-truth labels for training
test_tr = pd.read_csv(f"{DATA_PATH}teams_test(analyze).csv") # match-outcome prediction data for testing
#test_target = pd.read_csv(f"{DATA_PATH}test_target_1226.csv") # ground-truth labels for testing

In [148]:
# Compare (rows, columns) of the raw train and test frames.
train_tr.shape, test_tr.shape

((10000, 156), (2334, 111))

# 전처리

In [149]:
# Column names removed from the training frame by the drop call that follows.
# The first fifteen are listed by hand; the remaining thirty are the 20- and
# 25-minute snapshot columns, which share a single "<stat>at<minute>" pattern.
drop_list_team = [
    "participantid",
    "playername",
    "position",
    "champion",
    "teamkills",
    "teamdeaths",
    "firstbloodkill",
    "firstbloodassist",
    "firstbloodvictim",
    "dragons (type unknown)",
    "damageshare",
    "earnedgoldshare",
    "total cs",
    "monsterkillsownjungle",
    "monsterkillsenemyjungle",
    # All minute-20 names first, then all minute-25 names, in the same stat order.
    *(
        f"{stat}at{minute}"
        for minute in (20, 25)
        for stat in (
            "gold", "xp", "cs",
            "opp_gold", "opp_xp", "opp_cs",
            "golddiff", "xpdiff", "csdiff",
            "kills", "assists", "deaths",
            "opp_kills", "opp_assists", "opp_deaths",
        )
    ),
]

In [150]:
# Working copies for feature engineering: train loses the columns in drop_list_team,
# test is copied unchanged.
# NOTE(review): the shapes printed above (156 vs 111 columns) and the 45 names in
# drop_list_team suggest the test file already lacks these columns — confirm.
train_ft = train_tr.drop(columns=drop_list_team)
test_ft = test_tr.copy()

- teams

In [151]:
# Columns whose missing values are replaced with 0.
fill_zero_cols = ["void_grubs", "opp_void_grubs"]

# Columns whose missing values are replaced with the placeholder "unknown".
fill_unknown_cols = ["split", *(f"ban{slot}" for slot in range(1, 6))]

# Apply the same cleaning to both frames: fill the two column groups, then
# discard every row that has no value in 'pick1'.
for frame in (train_ft, test_ft):
    frame[fill_zero_cols] = frame[fill_zero_cols].fillna(0)
    frame[fill_unknown_cols] = frame[fill_unknown_cols].fillna("unknown")
    # In place, so the objects bound to train_ft / test_ft themselves lose the rows.
    frame.dropna(subset=['pick1'], inplace=True)

- lpl 결측치

In [152]:
# Team names that the imputation cells below treat as LPL teams: a row whose
# `teamname` is in this list has its missing values filled with the mean over
# league == "LCK" rows instead of a mean over the team's own games.
lpl_team = [
    "Anyone's Legend",
    "Bilibili Gaming",
    "EDward Gaming",
    "FunPlus Phoenix",
    "Invictus Gaming",
    "JD Gaming",
    "LGD Gaming",
    "LNG Esports",
    "Oh My God",
    "Rare Atom",
    "Royal Never Give Up",
    "Team WE",
    "Top Esports",
    "ThunderTalk Gaming",
    "Ultra Prime",
    "Weibo Gaming",
    "Ninjas in Pyjamas"
]

In [153]:
# Impute the remaining missing values of the training frame, one affected row at a time.
#
# * Rows whose team is in `lpl_team` take the mean of the same column over the
#   league == "LCK" rows.
# * Every other row takes the mean of that team's five most recent earlier games
#   (by `gameid`), falling back to the team's mean over all of its rows.
#
# train_ft is modified while looping, so values written for earlier rows take part
# in the means computed for later rows; each mean is therefore evaluated at fill
# time rather than precomputed.
# NOTE(review): "earlier" relies on `gameid` ordering matching chronological order — confirm.
# NOTE(review): the all-rows fallback includes the team's later games, and a team with
# no observed value at all would keep NaN.
missing_columns = train_ft.columns[train_ft.isnull().sum() > 0]
null_samples = train_ft[train_ft[missing_columns].isnull().any(axis=1)]

for idx, row in null_samples.iterrows():
    # `row` comes from the null_samples snapshot, so this list does not change
    # while train_ft is being filled.
    null_cols = [col for col in missing_columns if pd.isnull(row[col])]

    if row["teamname"] in lpl_team:
        # LPL team: fill with the LCK mean of each missing column.
        for col in null_cols:
            lck_mean = train_ft[train_ft["league"] == "LCK"][col].mean()
            train_ft.loc[idx, col] = lck_mean
        continue

    # The team history is only needed for non-LPL rows, so it is built here
    # instead of once per row regardless of branch.
    team_history = train_ft[
        (train_ft["teamname"] == row["teamname"])
        & (train_ft["gameid"] < row["gameid"])
    ].sort_values("gameid", ascending=False)

    for col in null_cols:
        # Non-LPL team: mean of the team's previous five games.
        prev_5_mean = team_history[col].head(5).mean()
        if pd.notnull(prev_5_mean):
            train_ft.loc[idx, col] = prev_5_mean
        else:
            # No usable value in the previous five games: use the team's overall mean.
            team_mean = train_ft[train_ft["teamname"] == row["teamname"]][col].mean()
            train_ft.loc[idx, col] = team_mean

In [154]:
# Total number of missing cells left in train_ft.
train_ft.isnull().sum().sum()

0

In [155]:
# Impute the missing values of the test frame using statistics taken from train_ft only.
#
# * Rows whose team is in `lpl_team` take the training mean of the same column
#   over the league == "LCK" rows.
# * Every other row takes the mean of that team's five most recent earlier
#   training games (by `gameid`), falling back to the team's mean over all of
#   its training rows.
#
# NOTE(review): "earlier" relies on `gameid` ordering matching chronological order — confirm.
# NOTE(review): a team with no observed training value for a column would keep NaN.
missing_columns = test_ft.columns[test_ft.isnull().sum() > 0]
null_samples = test_ft[test_ft[missing_columns].isnull().any(axis=1)]

# train_ft is only read in this cell, so the LCK mean of a column is the same for
# every row; compute it on first use and reuse it afterwards.
lck_mean_by_col = {}

for idx, row in null_samples.iterrows():
    # `row` comes from the null_samples snapshot, so this list does not change
    # while test_ft is being filled.
    null_cols = [col for col in missing_columns if pd.isnull(row[col])]

    if row["teamname"] in lpl_team:
        # LPL team: fill with the training-set LCK mean of each missing column.
        for col in null_cols:
            if col not in lck_mean_by_col:
                lck_mean_by_col[col] = train_ft[train_ft["league"] == "LCK"][col].mean()
            test_ft.loc[idx, col] = lck_mean_by_col[col]
        continue

    # The team history is only needed for non-LPL rows, so it is built here
    # instead of once per row regardless of branch.
    team_history = train_ft[
        (train_ft["teamname"] == row["teamname"])
        & (train_ft["gameid"] < row["gameid"])
    ].sort_values("gameid", ascending=False)

    for col in null_cols:
        # Non-LPL team: mean of the team's previous five training games.
        prev_5_mean = team_history[col].head(5).mean()
        if pd.notnull(prev_5_mean):
            test_ft.loc[idx, col] = prev_5_mean
        else:
            # No usable value in the previous five games: use the team's overall training mean.
            team_mean = train_ft[train_ft["teamname"] == row["teamname"]][col].mean()
            test_ft.loc[idx, col] = team_mean

In [156]:
# Total number of missing cells left in test_ft.
test_ft.isnull().sum().sum()

0

In [157]:
# (rows, columns) of both working frames at this point.
train_ft.shape, test_ft.shape

((9913, 111), (2324, 111))

# feature

- 연도별 팀순위

In [158]:
# Convert `date` to datetime and derive a calendar-year column, identically on both frames.
for frame in (train_ft, test_ft):
    frame["date"] = pd.to_datetime(frame["date"])
    frame["year"] = frame["date"].dt.year

In [159]:
# NOTE(review): this whole cell is commented out, so `team_rankings` is never created here.
# The disabled code ranks teams within each (year, league) by their summed `result`.
# def calculate_team_rankings(df):
#     rankings = []
#     for year in df['year'].unique():
#         for league in df['league'].unique():
#             year_league_df = df[(df['year'] == year) & (df['league'] == league)]
#             if year_league_df.empty:
#                 continue

#             # Sort by number of wins
#             team_wins = year_league_df.groupby('teamname')['result'].sum().reset_index()
#             team_wins = team_wins.sort_values(by='result', ascending=False)

#             # Assign ranks
#             team_wins['rank'] = range(1, len(team_wins) + 1)

#             # Attach year and league information
#             team_wins['year'] = year
#             team_wins['league'] = league

#             rankings.append(team_wins)

#     return pd.concat(rankings)
# # result: 1 = win, 0 = lose

# team_rankings = calculate_team_rankings(train_ft)
# team_rankings

In [160]:
# (rows, columns) of both frames after the `year` column was added.
train_ft.shape, test_ft.shape

((9913, 112), (2324, 112))

In [161]:
# Average KDA per team and year
def calculate_yearly_team_kda(df):
    """Add a per-row ``KDA`` column to ``df`` and return its mean per (year, team).

    KDA is ``(kills + assists) / deaths``, with a death count of 0 treated as 1
    so the division is always defined. Any infinite ratio is replaced by 0.

    Args:
        df: Frame with ``kills``, ``assists``, ``deaths``, ``year`` and
            ``teamname`` columns. It is modified in place: a ``KDA`` column is
            written to it.

    Returns:
        DataFrame with columns ``year``, ``teamname`` and ``KDA`` (the group mean).
    """
    takedowns = df['kills'] + df['assists']
    safe_deaths = df['deaths'].replace(0, 1)  # avoid dividing by zero

    # Guard against infinite ratios, then store the per-row value on the caller's frame.
    df['KDA'] = takedowns.div(safe_deaths).replace(np.inf, 0)

    kda_by_year_team = df.groupby(['year', 'teamname'])['KDA']
    return kda_by_year_team.mean().reset_index()

# Besides returning the yearly averages, each call writes a `KDA` column onto the
# frame it is given, so train_ft and test_ft each gain one column here.
# NOTE(review): test_year_kda is derived from the test frame's own kills/assists/deaths —
# confirm how it is meant to be used downstream.
train_year_kda = calculate_yearly_team_kda(train_ft)
test_year_kda = calculate_yearly_team_kda(test_ft)

train_year_kda

Unnamed: 0,year,teamname,KDA
0,2022,100 Thieves,7.816131
1,2022,AS Esports,2.209044
2,2022,Astralis,3.926283
3,2022,BNK FEARX,4.897943
4,2022,Beyond Gaming,8.721417
...,...,...,...
195,2024,Team Whales,5.226432
196,2024,V3 Esports,1.390224
197,2024,Vivo Keyd Stars,6.385573
198,2024,West Point Esports,2.028894


In [162]:
# (rows, columns) of both frames after the `KDA` column was added.
train_ft.shape, test_ft.shape

((9913, 113), (2324, 113))

In [163]:
# opp_teamname

def add_opponent_teamname(df):
    """Add an ``opp_teamname`` column holding the other team of each game.

    For every ``gameid`` that has exactly two distinct ``teamname`` values, each
    row receives the name of the team it is not. Rows of games with any other
    number of distinct teams receive an empty string.

    The pairing is collected in one grouped pass over the frame instead of
    re-filtering the whole frame several times per game id.

    Args:
        df: Frame with ``gameid`` and ``teamname`` columns. Modified in place.

    Returns:
        The same ``df`` object, now containing ``opp_teamname``.
    """
    opponent_of = {}
    for gameid, names in df.groupby('gameid', sort=False)['teamname']:
        teams = names.unique()
        if len(teams) != 2:
            continue
        first, second = teams
        for team, other in ((first, second), (second, first)):
            # A missing team name never matched an equality filter, so such rows
            # keep the empty default.
            if pd.notna(team):
                opponent_of[(gameid, team)] = other

    # Positional assignment: independent of the frame's index labels.
    df['opp_teamname'] = [
        opponent_of.get(key, '') for key in zip(df['gameid'], df['teamname'])
    ]
    return df

# The function writes the new column onto the frame it receives and returns that
# same frame, so these assignments rebind the names to the objects they already hold.
train_ft = add_opponent_teamname(train_ft)
test_ft = add_opponent_teamname(test_ft)
train_ft['opp_teamname']

Unnamed: 0,opp_teamname
0,BNK FEARX
1,DRX
2,BNK FEARX
3,DRX
4,Kwangdong Freecs
...,...
9995,PSG Talon
9996,Fukuoka SoftBank HAWKS gaming
9997,PSG Talon
9998,PSG Talon


In [164]:
# (rows, columns) of both frames after the `opp_teamname` column was added.
train_ft.shape, test_ft.shape

((9913, 114), (2324, 114))

In [166]:
# Average kills against each opponent

def calculate_avg_kills_against_opponent(df):
    """Return each team's mean ``kills`` broken down by opponent.

    Args:
        df: Frame with ``teamname``, ``opp_teamname`` and ``kills`` columns.

    Returns:
        Nested dict ``{teamname: {opp_teamname: mean kills}}``. Teams, and the
        opponents within each team, appear in order of first occurrence in ``df``.
    """
    def _mean_kills_by_opponent(team_rows):
        # One entry per distinct opponent found in this team's rows.
        return {
            opponent: team_rows.loc[team_rows['opp_teamname'] == opponent, 'kills'].mean()
            for opponent in team_rows['opp_teamname'].unique()
        }

    return {
        team: _mean_kills_by_opponent(df.loc[df['teamname'] == team])
        for team in df['teamname'].unique()
    }


# Compute each team's average kills against every opponent it faced.
# NOTE(review): test_avg_kills is computed from the test frame's own `kills` values —
# confirm how it is meant to be used downstream.
train_avg_kills = calculate_avg_kills_against_opponent(train_ft)
test_avg_kills = calculate_avg_kills_against_opponent(test_ft)

# Print the result for every team/opponent pair.
# NOTE(review): the output lists both "OKSavingsBank Brion" and "OKSavingsBank BRION";
# names are compared exactly, so differently spelled names count as separate teams.
for team, opponents in train_avg_kills.items():
    print(f"팀: {team}")
    for opponent, avg_kill in opponents.items():
        print(f"  상대 팀: {opponent}, 평균 킬: {avg_kill}")

팀: DRX
  상대 팀: BNK FEARX, 평균 킬: 10.885714285714286
  상대 팀: Gen.G, 평균 킬: 6.875
  상대 팀: T1, 평균 킬: 9.307692307692308
  상대 팀: Kwangdong Freecs, 평균 킬: 12.74074074074074
  상대 팀: OKSavingsBank Brion, 평균 킬: 11.625
  상대 팀: Nongshim RedForce, 평균 킬: 12.884615384615385
  상대 팀: KT Rolster, 평균 킬: 9.566666666666666
  상대 팀: Hanwha Life Esports, 평균 킬: 9.444444444444445
  상대 팀: Dplus KIA, 평균 킬: 7.071428571428571
  상대 팀: Royal Never Give Up, 평균 킬: 18.0
  상대 팀: MGN Vikings Esports, 평균 킬: 11.0
  상대 팀: İstanbul Wildcats, 평균 킬: 26.0
  상대 팀: Isurus, 평균 킬: 23.0
  상대 팀: MAD Lions KOI, 평균 킬: 14.0
  상대 팀: Rogue, 평균 킬: 15.333333333333334
  상대 팀: Top Esports, 평균 킬: 9.0
  상대 팀: GAM Esports, 평균 킬: 17.0
  상대 팀: EDward Gaming, 평균 킬: 13.0
  상대 팀: OKSavingsBank BRION, 평균 킬: 9.0
팀: BNK FEARX
  상대 팀: DRX, 평균 킬: 13.2
  상대 팀: Nongshim RedForce, 평균 킬: 11.84
  상대 팀: Kwangdong Freecs, 평균 킬: 11.75
  상대 팀: T1, 평균 킬: 8.91304347826087
  상대 팀: KT Rolster, 평균 킬: 8.925925925925926
  상대 팀: Hanwha Life Esports, 평균 킬: 12.04
  상대 팀: Gen.G

In [167]:
# (rows, columns) of both frames; the average-kill dictionaries above add no columns.
train_ft.shape,test_ft.shape

((9913, 114), (2324, 114))