In [121]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import re
import locale

# Load the raw input tables.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
venue_data = pd.read_csv('../data/venue_information.csv')
holidays_in_japan_data = pd.read_csv('../data/holidays_in_japan.csv')
match_reports_data = pd.read_csv('../data/match_reports.csv')

# Stack train and test so every later transformation is applied to both, then
# attach venue, holiday and match-report columns. All joins are left joins, so
# a match without a counterpart row in the right-hand table is still kept.
all_data = pd.concat([train_data, test_data], ignore_index=True)
all_data = pd.merge(all_data, venue_data, on='venue', how='left')
all_data = pd.merge(all_data, holidays_in_japan_data, left_on='match_date', right_on='holiday_date', how='left')
# Join on the single key 'id' (it was previously listed twice, which is redundant).
all_data = pd.merge(all_data, match_reports_data, on='id', how='left')

# Persist the merged table, then preview it. The preview must be the last
# expression of the cell, otherwise the notebook does not display it.
all_data.to_csv('../data_processed/all.csv', index=False)
all_data.head()

In [122]:
# Display the column labels of all_data.
all_data.columns


Index(['id', 'match_date', 'kick_off_time', 'section', 'round', 'home_team',
       'away_team', 'venue', 'weather', 'temperature', 'humidity',
       'broadcasters', 'attendance', 'capacity', 'address', 'holiday_date',
       'description', 'home_team_player11', 'home_team_player10',
       'home_team_player9', 'home_team_player8', 'home_team_player7',
       'home_team_player6', 'home_team_player5', 'home_team_player4',
       'home_team_player3', 'home_team_player2', 'home_team_player1',
       'home_team_score', 'away_team_score', 'away_team_player1',
       'away_team_player2', 'away_team_player3', 'away_team_player4',
       'away_team_player5', 'away_team_player6', 'away_team_player7',
       'away_team_player8', 'away_team_player9', 'away_team_player10',
       'away_team_player11'],
      dtype='object')

In [123]:
# Features to keep.
keep_cols = ['match_date','kick_off_time', 'round', 'home_team', 'away_team', 'venue', 'weather', 'temperature', 'humidity', 'broadcasters', 'attendance', 'capacity', 'address', 'home_team_score', 'away_team_score']

# Keep only the listed features. The explicit .copy() makes the result an
# independent frame, so assigning new columns to it afterwards does not raise
# SettingWithCopyWarning.
all_data = all_data[keep_cols].copy()

# Preview the result.
all_data.head()


Unnamed: 0,match_date,kick_off_time,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,attendance,capacity,address,home_team_score,away_team_score
0,2006-03-04,16:04,第1日,G大阪,浦和,万博記念競技場,晴,8.3,40.0,NHK総合/J SPORTS(録),20916.0,21000,大阪府吹田市千里万博公園5-2,1,1
1,2006-03-05,13:00,第2日,甲府,清水,山梨県小瀬スポーツ公園陸上競技場,晴,12.9,28.0,山梨放送/テレビ静岡(録)/J SPORTS(録),14277.0,15859,山梨県甲府市小瀬町840,0,2
2,2006-03-05,13:35,第2日,FC東京,大分,味の素スタジアム,晴,12.1,35.0,BS-i/MXテレビ(録)/J SPORTS(録),22531.0,48999,東京都調布市西町376-3,2,0
3,2006-03-05,14:04,第2日,磐田,福岡,静岡スタジアムエコパ,晴,11.6,42.0,J SPORTS,28564.0,51697,静岡県袋井市愛野2300−1,1,1
4,2006-03-05,14:04,第2日,名古屋,C大阪,名古屋市瑞穂陸上競技場,晴,13.1,32.0,スカイパーフェクTV!/NHK名古屋(録)/NHK大阪(録)/J SPORTS(録),17199.0,20223,愛知県名古屋市瑞穂区山下通5-1,3,2


In [124]:
# Parse the 'match_date' column into datetime values so that the .dt accessor
# can be used on it.
parsed_match_dates = pd.to_datetime(all_data['match_date'])
all_data['match_date'] = parsed_match_dates

def get_season(date):
    """Map a date to a season code based on its calendar month.

    Args:
        date: Any object exposing a ``month`` attribute.

    Returns:
        int: 0 for spring (March-May), 1 for summer (June-August),
        2 for autumn (September-November), 3 for winter (any other month value).
    """
    season_by_month = {
        3: 0, 4: 0, 5: 0,      # Spring
        6: 1, 7: 1, 8: 1,      # Summer
        9: 2, 10: 2, 11: 2,    # Autumn
    }
    # Every month value not listed above is treated as winter.
    return season_by_month.get(date.month, 3)

# Calendar features derived from the match date: year, month and season code.
match_dates = all_data['match_date']
all_data['year'] = match_dates.dt.year
all_data['month'] = match_dates.dt.month
all_data['season'] = match_dates.apply(get_season)

# Replace the kick-off time with the hour of day in which the match started.
kick_off_parsed = pd.to_datetime(all_data['kick_off_time'])
all_data['kick_off_time'] = kick_off_parsed.dt.hour

# Helper that extracts only the numeric part of a label.
def extract_number(text):
    """Return the first run of digits found in ``text`` as an int.

    Args:
        text: The string to search, e.g. '第1日'. Missing values (``None`` or a
            float NaN, which is what pandas stores for empty cells) are accepted
            and treated as "no number" instead of raising ``TypeError``.

    Returns:
        int | None: The first digit run converted to int, or ``None`` when the
        value is missing or contains no digits.

    Raises:
        TypeError: If ``text`` is neither a string nor a missing value.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return None
    match = re.search(r'\d+', text)
    return int(match.group()) if match else None

# Replace the 'round' labels with the number extracted from each of them.
round_numbers = all_data['round'].apply(extract_number)
all_data['round'] = round_numbers


# Helper that extracts the prefecture name from an address.
def extract_prefecture(address):
    """Return the prefecture name contained in a Japanese address.

    The pattern matches exactly one prefecture name: '東京都', '北海道',
    '京都府', '大阪府', or two to three non-space characters followed by '県'.
    A greedy pattern such as ``\\S+府`` must not be used here, because it runs
    past the prefecture into the city name (e.g. '山梨県甲府市...' would yield
    '山梨県甲府' and '京都府京都市...' would yield '京都府京都').

    Args:
        address: The address string. Missing values (``None`` or a float NaN)
            are accepted and yield ``None``.

    Returns:
        str | None: The prefecture name, or ``None`` when the value is missing
        or no prefecture is found.
    """
    # NaN is the only float that is not equal to itself.
    if address is None or (isinstance(address, float) and address != address):
        return None
    match = re.search(r'東京都|北海道|(?:京都|大阪)府|\S{2,3}?県', address)
    return match.group() if match else None
    

# Replace each value of the 'address' column with the result of extract_prefecture.
prefectures = all_data['address'].apply(extract_prefecture)
all_data['address'] = prefectures



# Categorical columns to label-encode.
categorical_cols = ['home_team', 'away_team', 'venue', 'address', 'broadcasters', 'weather']

# One fitted encoder per column, so each integer code can later be mapped back
# to its original label (label_encoders[col].inverse_transform / .classes_).
# Refitting a single shared encoder would discard every mapping but the last.
label_encoders = {}

# Encode each categorical column.
for col in categorical_cols:
    le = LabelEncoder()
    # Train and test rows are encoded together so they share one mapping.
    # astype(str) turns every value, including missing ones, into a string label.
    all_data[col] = le.fit_transform(all_data[col].astype(str))
    label_encoders[col] = le


# Check the data after encoding.
all_data.head(10)

Unnamed: 0,match_date,kick_off_time,round,home_team,away_team,venue,weather,temperature,humidity,broadcasters,attendance,capacity,address,home_team_score,away_team_score,year,month,season
0,2006-03-04,16,1,2,20,23,1,8.3,40.0,117,20916.0,21000,7,1,1,2006,3,0
1,2006-03-05,13,2,23,21,38,1,12.9,28.0,659,14277.0,15859,11,0,2,2006,3,0
2,2006-03-05,13,2,1,7,30,1,12.1,35.0,4,22531.0,48999,16,2,0,2006,3,0
3,2006-03-05,14,2,24,26,60,1,11.6,42.0,89,28564.0,51697,24,1,1,2006,3,0
4,2006-03-05,14,2,6,0,29,1,13.1,32.0,129,17199.0,20223,14,3,2,2006,3,0
5,2006-03-05,15,2,8,5,32,1,12.6,33.0,640,13085.0,62010,5,4,2,2006,3,0
6,2006-03-05,15,2,10,13,58,1,13.3,34.0,90,17444.0,26827,19,6,0,2006,3,0
7,2006-03-05,15,2,11,29,41,12,12.4,58.0,10,17564.0,36894,12,3,4,2006,3,0
8,2006-03-05,15,2,19,3,44,1,12.6,34.0,3,23607.0,72081,19,4,1,2006,3,0
9,2006-03-11,14,1,20,24,32,1,16.0,48.0,126,56512.0,62010,5,3,1,2006,3,0


In [126]:
# The raw date column is dropped before the table is written out.
all_data = all_data.drop(columns=['match_date'])

# Save the processed table (train and test rows together).
all_data.to_csv('../data_processed/all_processed2.csv', index=False)

In [127]:
# Position at which the combined table is split back into train and test.
# NOTE(review): presumably the number of rows in train.csv; this only holds if
# the earlier left merges did not duplicate any rows. TODO confirm.
N_TRAIN_ROWS = 3366

# The first N_TRAIN_ROWS rows become the training set; the remaining rows
# become the test set, with the 'attendance' column removed.
train_data = all_data.iloc[:N_TRAIN_ROWS]
test_data = all_data.iloc[N_TRAIN_ROWS:].drop(columns=['attendance'])

train_data.to_csv('../data_processed/train_data_processed2.csv', index=False)
test_data.to_csv('../data_processed/test_data_processed2.csv', index=False)