# 데이터 전처리 및 Feature 선택

In [4]:
import json
import pandas as pd
import math
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Any

class FeatureExtractor:
    """Extract per-frame features for match participants from timeline data.

    Match metadata (position, team, puuid, win flag) is cached per match id by
    ``load_match_metadata``; the timeline frames are then turned into one
    record per participant per frame.
    """

    # Event counters tracked per participant, in the order they are reported.
    _EVENT_COUNTER_KEYS = (
        'kills', 'deaths', 'assists', 'soloKills',
        'turretPlates', 'wardsPlaced', 'objectiveParticipation',
    )

    def __init__(self):
        self.position_mapping = {}  # matchId -> {participantId: position}
        self.team_mapping = {}  # matchId -> {participantId: teamId}
        self.puuid_mapping = {}  # matchId -> {participantId: puuid}
        self.win_mapping = {}  # matchId -> {participantId: win}

    @classmethod
    def _empty_event_counts(cls) -> Dict[str, int]:
        """Return a fresh counter dict with every event counter set to zero."""
        return dict.fromkeys(cls._EVENT_COUNTER_KEYS, 0)

    @staticmethod
    def _accumulate_event(counts: Dict[str, int], event: Dict, participant_id: int) -> None:
        """Update ``counts`` in place with the contribution of a single event."""
        event_type = event.get('type')

        if event_type == 'CHAMPION_KILL':
            assisting = event.get('assistingParticipantIds', [])
            if event.get('killerId') == participant_id:
                counts['kills'] += 1
                # A kill without any assisting participant counts as a solo kill.
                if not assisting:
                    counts['soloKills'] += 1
            if event.get('victimId') == participant_id:
                counts['deaths'] += 1
            if participant_id in assisting:
                counts['assists'] += 1

        elif event_type == 'TURRET_PLATE_DESTROYED':
            if event.get('killerId') == participant_id:
                counts['turretPlates'] += 1

        elif event_type == 'WARD_PLACED':
            if event.get('creatorId') == participant_id:
                counts['wardsPlaced'] += 1

        elif event_type == 'ELITE_MONSTER_KILL':
            # Killing or assisting each count once; never both for one event.
            if event.get('killerId') == participant_id:
                counts['objectiveParticipation'] += 1
            elif participant_id in event.get('assistingParticipantIds', []):
                counts['objectiveParticipation'] += 1

    def load_match_metadata(self, match_data: List[Dict]) -> None:
        """Cache position, team, puuid and win flag for every participant.

        Args:
            match_data: Match documents; each must provide
                ``metadata.matchId`` and ``info.participants``.
        """
        for match in match_data:
            match_id = match['metadata']['matchId']
            positions = self.position_mapping[match_id] = {}
            teams = self.team_mapping[match_id] = {}
            puuids = self.puuid_mapping[match_id] = {}
            wins = self.win_mapping[match_id] = {}

            for p in match['info']['participants']:
                pid = p['participantId']
                positions[pid] = p.get('teamPosition', '')
                teams[pid] = p.get('teamId')
                puuids[pid] = p.get('puuid')
                wins[pid] = p.get('win')

    def get_opponent_id(self, match_id: str, my_participant_id: int, my_position: str) -> int:
        """Return the id of the enemy participant playing ``my_position``.

        The opponent is the first participant with the same position on a
        different team. Returns ``-1`` when no such participant exists.
        """
        teams = self.team_mapping[match_id]
        my_team = teams[my_participant_id]
        for pid, pos in self.position_mapping[match_id].items():
            if pos == my_position and teams[pid] != my_team:
                return pid
        return -1

    def get_team_participant_ids(self, match_id: str, participant_id: int) -> List[int]:
        """Return ids of all participants on the same team (including self)."""
        teams = self.team_mapping[match_id]
        my_team = teams[participant_id]
        return [pid for pid, team in teams.items() if team == my_team]

    def parse_events_cumulative(self, frames: List[Dict], until_frame: int, participant_id: int) -> Dict:
        """Count a participant's events over frames ``0..until_frame`` inclusive.

        Returns:
            Dict with the keys kills, deaths, assists, soloKills, turretPlates,
            wardsPlaced and objectiveParticipation.
        """
        result = self._empty_event_counts()
        for frame in frames[:until_frame + 1]:
            for event in frame.get('events', []):
                self._accumulate_event(result, event, participant_id)
        return result

    def get_team_stats_at_frame(self, frame: Dict, team_pids: List[int]) -> Dict:
        """Sum champion damage dealt and damage taken over a team in one frame.

        Participants missing from the frame contribute zero.
        """
        team_damage_to_champs = 0
        team_damage_taken = 0
        for pid in team_pids:
            # participantFrames is keyed by the participant id as a string.
            pf = frame['participantFrames'].get(str(pid), {})
            ds = pf.get('damageStats', {})
            team_damage_to_champs += ds.get('totalDamageDoneToChampions', 0)
            team_damage_taken += ds.get('totalDamageTaken', 0)
        return {'teamDamageToChamps': team_damage_to_champs, 'teamDamageTaken': team_damage_taken}

    def count_team_kills(self, frames: List[Dict], until_frame: int, team_pids: List[int]) -> int:
        """Count champion kills made by ``team_pids`` over frames ``0..until_frame``."""
        return sum(
            1
            for frame in frames[:until_frame + 1]
            for event in frame.get('events', [])
            if event.get('type') == 'CHAMPION_KILL' and event.get('killerId') in team_pids
        )

    def extract_features_for_participant(self, match_id: str, frames: List[Dict], participant_id: int, position: str) -> \
    List[Dict]:
        """Build one feature record per frame for a single participant.

        Event counters are cumulative up to and including the frame. They are
        maintained incrementally in one forward pass instead of re-scanning
        all earlier frames for every frame, which yields the same values at
        linear rather than quadratic cost.

        Frames without an entry for the participant produce no record, but
        their events still contribute to the counters of later frames.

        Args:
            match_id: Match id previously passed through ``load_match_metadata``.
            frames: Timeline frames of that match, in order.
            participant_id: Participant to extract features for.
            position: Position of the participant; used to find the opponent.

        Returns:
            List of record dicts; ``minute`` holds the frame index.
        """
        opponent_id = self.get_opponent_id(match_id, participant_id, position)
        team_pids = self.get_team_participant_ids(match_id, participant_id)
        has_opponent = opponent_id > 0

        event_features = self._empty_event_counts()
        # Stays all-zero when there is no lane opponent.
        opp_event_features = self._empty_event_counts()
        team_kills = 0

        records = []
        for frame_idx, frame in enumerate(frames):
            participant_frames = frame['participantFrames']

            # Fold this frame's events into the running totals before deciding
            # whether to emit a record, so skipped frames are still counted.
            for event in frame.get('events', []):
                self._accumulate_event(event_features, event, participant_id)
                if has_opponent:
                    self._accumulate_event(opp_event_features, event, opponent_id)
                if event.get('type') == 'CHAMPION_KILL' and event.get('killerId') in team_pids:
                    team_kills += 1

            pf = participant_frames.get(str(participant_id))
            if not pf:
                continue

            # Use the opponent's frame when available; otherwise fall back to
            # empty data so the defaults below apply.
            opp_pf = participant_frames.get(str(opponent_id), {}) if has_opponent else {}
            opp_ds = opp_pf.get('damageStats', {})
            team_stats = self.get_team_stats_at_frame(frame, team_pids)
            ds = pf.get('damageStats', {})

            xp = pf.get('xp', 0)
            level = pf.get('level', 1)
            minions_killed = pf.get('minionsKilled', 0)
            jungle_minions_killed = pf.get('jungleMinionsKilled', 0)
            total_damage_to_champs = ds.get('totalDamageDoneToChampions', 0)
            total_damage_taken = ds.get('totalDamageTaken', 0)
            time_enemy_controlled = pf.get('timeEnemySpentControlled', 0)

            opp_xp = opp_pf.get('xp', 0)
            opp_level = opp_pf.get('level', 1)
            opp_minions = opp_pf.get('minionsKilled', 0) + opp_pf.get('jungleMinionsKilled', 0)
            opp_damage_to_champs = opp_ds.get('totalDamageDoneToChampions', 0)

            total_cs = minions_killed + jungle_minions_killed
            kills = event_features['kills']
            deaths = event_features['deaths']
            assists = event_features['assists']

            # Denominators are clamped to 1 to avoid division by zero.
            kda = (kills + assists) / max(deaths, 1)
            kill_participation = (kills + assists) / max(team_kills, 1)
            damage_share = total_damage_to_champs / max(team_stats['teamDamageToChamps'], 1)

            records.append({
                'matchId': match_id,
                'participantId': participant_id,
                'puuid': self.puuid_mapping.get(match_id, {}).get(participant_id, ''),
                'position': position,
                'minute': frame_idx,
                'win': self.win_mapping.get(match_id, {}).get(participant_id),
                'xp': xp, 'level': level, 'minionsKilled': minions_killed,
                'jungleMinionsKilled': jungle_minions_killed,
                'totalDamageDoneToChampions': total_damage_to_champs,
                'totalDamageTaken': total_damage_taken,
                'timeEnemySpentControlled': time_enemy_controlled,
                'kills': kills, 'deaths': deaths, 'assists': assists,
                'soloKills': event_features['soloKills'],
                'turretPlates': event_features['turretPlates'],
                'wardsPlaced': event_features['wardsPlaced'],
                'objectiveParticipation': event_features['objectiveParticipation'],
                'xpDiff': xp - opp_xp,
                'csDiff': total_cs - opp_minions,
                'damageDiff': total_damage_to_champs - opp_damage_to_champs,
                'killsDiff': kills - opp_event_features['kills'],
                'levelDiff': level - opp_level,
                'KDA': kda, 'killParticipation': kill_participation, 'damageShare': damage_share,
                'totalGold': pf.get('totalGold', 0)
            })
        return records

    def extract_all_top_features(self, timeline_data: List[Dict], match_data: List[Dict]) -> pd.DataFrame:
        """Extract feature records for every target position of every match.

        The name is kept for backward compatibility; despite "top" it covers
        TOP, JUNGLE, MIDDLE, BOTTOM and UTILITY. Matches without a timeline
        carrying the same match id are skipped.

        Args:
            timeline_data: Timeline documents (``metadata.matchId``,
                ``info.frames``).
            match_data: Match documents for the same batch.

        Returns:
            DataFrame with one row per participant per frame.
        """
        self.load_match_metadata(match_data)
        timeline_by_match = {t['metadata']['matchId']: t for t in timeline_data}

        target_positions = {'TOP', 'JUNGLE', 'MIDDLE', 'BOTTOM', 'UTILITY'}

        all_records = []
        for match in match_data:
            match_id = match['metadata']['matchId']
            if match_id not in timeline_by_match:
                continue

            frames = timeline_by_match[match_id]['info']['frames']

            for pid, pos in self.position_mapping[match_id].items():
                if pos in target_positions:
                    all_records.extend(
                        self.extract_features_for_participant(match_id, frames, pid, pos)
                    )
        return pd.DataFrame(all_records)


# --- [유틸리티 함수] ---

def load_json_file_single(path: str) -> Dict:
    """Read the UTF-8 encoded file at ``path`` and return its parsed JSON content."""
    with open(path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def discover_data_files(data_dir: str = 'match_data') -> Tuple[List[str], List[str]]:
    """Find match and timeline JSON files directly inside ``data_dir``.

    Returns:
        Two lists of path strings, each sorted: files matching
        ``match_*.json`` first, then files matching ``timeline_*.json``.
    """
    root = Path(data_dir)
    found = []
    for pattern in ('match_*.json', 'timeline_*.json'):
        found.append([str(entry) for entry in sorted(root.glob(pattern))])
    match_paths, timeline_paths = found
    return match_paths, timeline_paths


# Model input columns, grouped by how the extractor produces them.
FEATURE_COLUMNS = [
    # Values read from the participant's frame.
    'xp',
    'level',
    'minionsKilled',
    'jungleMinionsKilled',
    'totalDamageDoneToChampions',
    'totalDamageTaken',
    'timeEnemySpentControlled',
    # Cumulative event counters.
    'kills',
    'deaths',
    'assists',
    'soloKills',
    'turretPlates',
    'wardsPlaced',
    'objectiveParticipation',
    # Differences against the same-position opponent.
    'xpDiff',
    'csDiff',
    'damageDiff',
    'killsDiff',
    'levelDiff',
    # Ratios.
    'KDA',
    'killParticipation',
    'damageShare',
]
# Column used as the prediction target.
TARGET_COLUMN = 'totalGold'
# Identifying columns that are not model inputs.
META_COLUMNS = [
    'matchId',
    'participantId',
    'puuid',
    'position',
    'minute',
    'win',
]

# --- [Main 실행 부분] ---

if __name__ == '__main__':
    # Script entry point: load match/timeline files in batches, extract
    # features for every target position and write them to a single CSV.
    print("Discovering data files in match_data directory...")
    match_files, timeline_files = discover_data_files('data/match_data')

    total_files = len(match_files)
    print(f"Found {total_files} match/timeline file pairs.")

    if total_files == 0:
        print("Error: No data files found.")
        # SystemExit works without the `site`-provided `exit` helper.
        raise SystemExit(1)

    # Batches pair the two sorted lists by index, and a match is only used
    # when its timeline is in the same batch. A count mismatch shifts that
    # alignment, so surface it instead of silently losing matches.
    if len(timeline_files) != total_files:
        print(f"Warning: found {total_files} match files but {len(timeline_files)} timeline files; "
              "matches whose timeline falls into a different batch will be skipped.")

    extractor = FeatureExtractor()
    all_dfs = []  # per-batch result frames

    # Number of files loaded per batch.
    BATCH_SIZE = 10

    print("\nStarting batch processing...")

    for i in range(0, total_files, BATCH_SIZE):
        batch_match_paths = match_files[i: i + BATCH_SIZE]
        batch_timeline_paths = timeline_files[i: i + BATCH_SIZE]

        batch_match_data = [load_json_file_single(p) for p in batch_match_paths]
        batch_timeline_data = [load_json_file_single(p) for p in batch_timeline_paths]

        # A file may hold a single document or a list of documents; flatten
        # both shapes into one list of documents.
        flat_match_data = [
            doc
            for d in batch_match_data
            for doc in (d if isinstance(d, list) else [d])
        ]
        flat_timeline_data = [
            doc
            for d in batch_timeline_data
            for doc in (d if isinstance(d, list) else [d])
        ]

        batch_df = extractor.extract_all_top_features(flat_timeline_data, flat_match_data)
        all_dfs.append(batch_df)

        current_count = min(i + BATCH_SIZE, total_files)
        print(f"✅ Processed {current_count}/{total_files} files ({(current_count / total_files) * 100:.1f}%)")

    print("\nConcatenating all batches...")
    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
    else:
        final_df = pd.DataFrame()

    print(f"\nExtracted {len(final_df)} rows total.")

    if not final_df.empty:
        # Round the ratio features to four decimals before writing.
        ratio_columns = ['KDA', 'killParticipation', 'damageShare']
        for col in ratio_columns:
            if col in final_df.columns:
                final_df[col] = final_df[col].round(4)

        # The output covers all positions, not only TOP.
        output_path = 'data/all_positions_features.csv'
        final_df.to_csv(output_path, index=False)
        print(f"\nSaved to {output_path}")
    else:
        print("No features extracted.")

Discovering data files in match_data directory...
Found 1075 match/timeline file pairs.

Starting batch processing...
✅ Processed 10/1075 files (0.9%)
✅ Processed 20/1075 files (1.9%)
✅ Processed 30/1075 files (2.8%)
✅ Processed 40/1075 files (3.7%)
✅ Processed 50/1075 files (4.7%)
✅ Processed 60/1075 files (5.6%)
✅ Processed 70/1075 files (6.5%)
✅ Processed 80/1075 files (7.4%)
✅ Processed 90/1075 files (8.4%)
✅ Processed 100/1075 files (9.3%)
✅ Processed 110/1075 files (10.2%)
✅ Processed 120/1075 files (11.2%)
✅ Processed 130/1075 files (12.1%)
✅ Processed 140/1075 files (13.0%)
✅ Processed 150/1075 files (14.0%)
✅ Processed 160/1075 files (14.9%)
✅ Processed 170/1075 files (15.8%)
✅ Processed 180/1075 files (16.7%)
✅ Processed 190/1075 files (17.7%)
✅ Processed 200/1075 files (18.6%)
✅ Processed 210/1075 files (19.5%)
✅ Processed 220/1075 files (20.5%)
✅ Processed 230/1075 files (21.4%)
✅ Processed 240/1075 files (22.3%)
✅ Processed 250/1075 files (23.3%)
✅ Processed 260/1075 files

### 봇 데이터 병합 및 최종 저장

In [5]:
import pandas as pd
import numpy as np

def merge_bot_duo_and_save(input_path, output_path):
    """Collapse BOTTOM and UTILITY rows into single BOT_DUO rows and save.

    Rows whose position is BOTTOM or UTILITY are grouped by
    (matchId, minute, win) and aggregated into one row with position
    ``'BOT_DUO'``; all other rows are kept unchanged. The combined result is
    sorted by matchId, minute and position and written as CSV.

    Args:
        input_path: CSV holding the per-participant feature rows.
        output_path: Destination CSV path.
    """
    print(f"Loading raw data from {input_path}...")
    df = pd.read_csv(input_path)

    # 1. Split the bottom duo (BOTTOM, UTILITY) from the remaining positions.
    is_duo = df['position'].isin(['BOTTOM', 'UTILITY'])
    duo_df = df[is_duo].copy()
    solo_df = df[~is_duo].copy()

    print("Merging BOTTOM and UTILITY...")

    # 2. Aggregate the duo rows.
    # 2-1. Columns that are summed. 'soloKills' belongs here as well:
    # leaving it out made every BOT_DUO row NaN in that feature column.
    sum_cols = [
        'kills', 'deaths', 'assists', 'soloKills', 'minionsKilled', 'jungleMinionsKilled',
        'totalDamageDoneToChampions', 'totalDamageTaken', 'timeEnemySpentControlled',
        'wardsPlaced', 'turretPlates', 'objectiveParticipation', 'totalGold',
        # Summing the per-player gaps gives the duo-vs-duo gap.
        'xpDiff', 'csDiff', 'damageDiff', 'killsDiff', 'levelDiff'
    ]

    # 2-2. Columns that are averaged instead.
    mean_cols = ['xp', 'level']

    # Grouping keys. NOTE(review): there is no team id column in this key, so
    # 'win' is presumably what separates the two duos of a match - confirm.
    group_keys = ['matchId', 'minute', 'win']

    grouped = duo_df.groupby(group_keys)
    duo_sum = grouped[sum_cols].sum().reset_index()
    duo_mean = grouped[mean_cols].mean().reset_index()

    merged_duo = pd.merge(duo_sum, duo_mean, on=group_keys)

    # 3. Recompute derived values; ratios cannot simply be summed.
    # KDA from the summed counters, with zero deaths treated as one.
    merged_duo['KDA'] = (merged_duo['kills'] + merged_duo['assists']) / merged_duo['deaths'].replace(0, 1)
    merged_duo['position'] = 'BOT_DUO'
    # Two players share the row, so a fixed placeholder replaces the puuid.
    merged_duo['puuid'] = 'DUO_PLAYER'

    # These ratios need team-wide data that is not available here; they are
    # filled with 0 as placeholders to keep the column layout.
    merged_duo['killParticipation'] = 0
    merged_duo['damageShare'] = 0

    # 4. Recombine with the untouched rows.
    final_df = pd.concat([solo_df, merged_duo], ignore_index=True)

    # 5. Sort by match, then minute, then position.
    final_df = final_df.sort_values(['matchId', 'minute', 'position'])

    final_df.to_csv(output_path, index=False)
    print(f"✅ Merged {len(df)} rows into {len(final_df)} rows.")
    print(f"✅ Saved to {output_path}")
    print("Now you have positions: ", final_df['position'].unique())

# 실행
if __name__ == "__main__":
    # Input: the CSV written by the feature-extraction step (notebook 01).
    INPUT_FILE = 'data/all_positions_features.csv'
    # Output: the CSV read by the later notebooks (02 to 06).
    OUTPUT_FILE = 'data/final_features.csv'

    merge_bot_duo_and_save(input_path=INPUT_FILE, output_path=OUTPUT_FILE)

Loading raw data from data/all_positions_features.csv...
Merging BOTTOM and UTILITY...
✅ Merged 11270680 rows into 9016544 rows.
✅ Saved to data/final_features_ready.csv
Now you have positions:  ['BOT_DUO' 'JUNGLE' 'MIDDLE' 'TOP']
