In [61]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re

# Load the raw tables.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
venue_data = pd.read_csv('../data/venue_information.csv')
holidays_in_japan_data = pd.read_csv('../data/holidays_in_japan.csv')
match_reports_data = pd.read_csv('../data/match_reports.csv')

# Stack train on top of test so both go through identical feature engineering.
# ignore_index=True rebuilds a 0..n-1 index with the train rows first.
all_data = pd.concat([train_data, test_data], ignore_index=True)

# Enrich every match row with the lookup tables via left joins.
# validate='many_to_one' makes pandas raise MergeError when a lookup table has
# duplicate keys; without it such duplicates would silently add rows and shift
# the positional train/test boundary that the later split relies on.
all_data = pd.merge(all_data, venue_data, on='venue', how='left',
                    validate='many_to_one')
# Rows whose match_date is not a holiday keep NaN in the holiday columns.
all_data = pd.merge(all_data, holidays_in_japan_data,
                    left_on='match_date', right_on='holiday_date', how='left',
                    validate='many_to_one')
# The join key used to be listed twice (['id', 'id']); a single 'id' is the key.
all_data = pd.merge(all_data, match_reports_data, on='id', how='left',
                    validate='many_to_one')

# Persist the merged (not yet feature-engineered) table.
all_data.to_csv('../data_processed/all.csv', index=False)

In [62]:

# Calendar features derived from the match date.
match_dt = pd.to_datetime(all_data['match_date'])
all_data = all_data.assign(
    match_date=match_dt,
    year=match_dt.dt.year,
    month=match_dt.dt.month,
    day=match_dt.dt.day,
    # Weekday as an integer, Monday=0 ... Sunday=6.
    dayofweek=match_dt.dt.dayofweek,
    # Same weekday values again under a second column name.
    day_of_week=match_dt.dt.dayofweek,
    # Holiday flag: 1 when the holiday join produced a non-null holiday_date.
    is_holiday=all_data['holiday_date'].notna().astype(int),
)

def get_season(month):
    """Return the season name for a calendar month number.

    Months 3-5 map to 'spring', 6-8 to 'summer', 9-11 to 'autumn'; every
    other value (including 12, 1 and 2) maps to 'winter'.
    """
    season_months = (
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
        ('autumn', (9, 10, 11)),
    )
    for season_name, months in season_months:
        if month in months:
            return season_name
    # Anything not listed above falls through to winter.
    return 'winter'

# Season label derived from the month number.
all_data['season'] = all_data['month'].apply(get_season)

# Parse the kick-off time once and reuse it for both derived columns.
kickoff = pd.to_datetime(all_data['kick_off_time'])
kickoff_hour = kickoff.dt.hour
all_data['kick_off_time'] = kickoff

# Bucket the kick-off hour into a coarse time of day:
# [0, 11] -> morning, (11, 17] -> afternoon, (17, 24] -> night.
all_data['time_of_day'] = pd.cut(
    kickoff_hour,
    bins=[0, 11, 17, 24],
    labels=['morning', 'afternoon', 'night'],
    include_lowest=True,
)

# Kick-off time as fractional hours (e.g. 19:30 -> 19.5).
all_data['kick_off_hour'] = kickoff_hour + kickoff.dt.minute / 60

# Prefecture names: the two special cases 東京都 and 北海道, the two 府
# (京都府, 大阪府), and otherwise two or three letters followed by 県.
# A greedy `\w+都|\w+道|\w+府|\w+県` over-matches: it extends to the LAST
# 都/道/府/県 in the address, so 山梨県甲府市 yields 山梨県甲府 and
# 京都府京都市 yields 京都府京都.
_PREFECTURE_RE = re.compile(r'東京都|北海道|(?:京都|大阪)府|[^\W\d_]{2,3}県')


def extract_prefecture(address):
    """Extract the prefecture name from a Japanese address string.

    Parameters
    ----------
    address : str or any
        Address text. Non-string values (e.g. None, or NaN produced by a
        left join with no match) are treated as having no prefecture.

    Returns
    -------
    str or None
        The first prefecture name found in ``address`` (e.g. '東京都',
        '北海道', '京都府', '山梨県'), or None when there is none.
    """
    # Missing addresses arrive as float NaN / None; re.search would raise on them.
    if not isinstance(address, str):
        return None
    match = _PREFECTURE_RE.search(address)
    return match.group() if match else None

# Derive the prefecture from each venue address.
all_data['prefecture'] = all_data['address'].apply(extract_prefecture)


# Categorical columns to integer-encode.
categorical_cols = ['section','round','home_team', 'away_team', 'venue', 'weather','broadcasters','time_of_day','season', 'prefecture']

# One encoder instance, re-fitted for every column.
le = LabelEncoder()

# Encode on the combined train+test frame so both splits share one code mapping
# per column. Values are cast to str first, so missing values are encoded as
# their string form (e.g. 'nan') rather than raising.
for col in categorical_cols:
    all_data[col] = le.fit_transform(all_data[col].astype(str))


# Persist the fully engineered table (all columns) before narrowing it down.
all_data.to_csv('../data_processed/all_processed.csv', index=False)

# Keep only the model columns.
all_data = all_data[['section', 'round', 'home_team',
       'away_team', 'venue', 'weather', 'temperature', 'humidity',
       'broadcasters', 'attendance', 'capacity',
       'home_team_score', 'away_team_score', 'year', 'month', 'day', 'dayofweek',
       'day_of_week', 'is_holiday', 'season', 'time_of_day', 'kick_off_hour',
       'prefecture']]


# all_data was built as train rows followed by test rows, so the boundary is
# the number of training rows. Take it from the loaded training frame instead
# of a hard-coded count, and read it BEFORE train_data is rebound below.
n_train = len(train_data)
train_data = all_data.iloc[:n_train]
# The test split is written without the 'attendance' column.
test_data = all_data.iloc[n_train:].drop(['attendance'], axis=1)

train_data.to_csv('../data_processed/train_data_processed.csv', index=False)
test_data.to_csv('../data_processed/test_data_processed.csv', index=False)