In [19]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb


In [20]:
def calculate_age(born, year):
    """year年4月1日時点での年齢を返す"""
    date = datetime(year,4,1)
    return date.year - born.year - ((date.month, date.day) < (born.month, born.day))

def is_same_team(df, col):
    return (df["team"] in df[col])*1

In [21]:
# データ読み込み
train_2018_df = pd.read_csv('/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2018.csv')
train_2017_df = pd.read_csv('/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2017.csv')
train_2016_df = pd.read_csv('/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2016.csv')

train_2018_df['year'] = 2018
train_2017_df['year'] = 2017
train_2016_df['year'] = 2016

test_df = pd.read_csv("/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/test.csv")
test_df["year"] = 2019

t_df_dic = {2018: test_df, 2017: train_2018_df, 2016: train_2017_df, 2015: train_2016_df}

for year in [2018,2017,2016,2015]:
    t_df = t_df_dic[year]
    t_df["name-team"] = t_df["name"] + "-" + t_df["prev1_team"].apply(lambda x: x.split("・")[-1])
    
    eve_pla_df = pd.read_csv(f"/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/output/event_play_{year}.csv")
    
    for df in [eve_pla_df]:
        df["name-team"] = df["name"] + "-" + df["team"]
        df.drop(["name", "team"], axis=1, inplace=True)
        t_df = pd.merge(t_df, df, on="name-team", how="left")
        
    t_df.drop(["name-team"], axis=1, inplace=True)
    t_df_dic[year] = t_df
    
test_df = t_df_dic[2018]
train_2018_df = t_df_dic[2017]
train_2017_df = t_df_dic[2016]
train_2016_df = t_df_dic[2015] 

In [22]:
train_2018_df

Unnamed: 0,id,team,No,name,time_played,position,birthdate,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second
0,2,C大阪,2.0,松田　陸,2524.0,DF,1991-07-24,171.0,69.0,2000,3.0,,,47.0,3.0,42.0,2.0,,,,,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.000000,0.352941,0.0,0.088235,0.0,0.264706,0.000000,0.205882,0.0,0.029412,0.0,0.294118,0.000000,0.147059,0.0,0.058824,0.0
1,5,C大阪,5.0,田中　裕介,279.0,DF,1986-04-14,181.0,77.0,2700,4.0,,,223.0,10.0,42.0,1.0,,,,,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.000000,0.0,0.470588,0.0,0.029412,0.205882,0.000000,0.0,0.264706,0.0,0.088235,0.205882,0.000000,0.0,0.205882,0.0
2,6,C大阪,6.0,山口　蛍,2970.0,MF,1990-10-06,173.0,72.0,6700,10.0,1.0,,134.0,12.0,58.0,2.0,,,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.000000,0.000000,0.0,0.000000,0.0,0.500000,0.000000,0.000000,0.0,0.000000,0.0,0.441176,0.000000,0.000000,0.0,0.000000,0.0
3,7,C大阪,7.0,水沼　宏太,1671.0,MF,1990-02-22,176.0,72.0,3000,2.0,,,194.0,24.0,50.0,7.0,9.0,3.0,,,鳥栖,FC東京,C大阪,1,1,1,32,17,24,7,1,3,2353,570,1901,2018,0.264706,0.088235,0.352941,0.0,0.088235,0.0,0.029412,0.088235,0.088235,0.0,0.088235,0.0,0.235294,0.000000,0.264706,0.0,0.000000,0.0
4,8,C大阪,8.0,柿谷　曜一朗,1424.0,FW,1990-01-03,177.0,68.0,7200,10.0,1.0,,113.0,39.0,168.0,23.0,,,18.0,5.0,SUI,C大阪,C大阪,9,2,1,4,20,34,1,5,6,-,1635,2918,2018,0.558824,0.000000,0.441176,0.0,0.000000,0.0,0.352941,0.000000,0.147059,0.0,0.000000,0.0,0.205882,0.000000,0.294118,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,739,名古屋,29.0,和泉　竜司,2139.0,MF,1993-11-06,173.0,72.0,1400,3.0,,,14.0,1.0,39.0,1.0,,,,,名古屋,名古屋,名古屋,1,1,2,0,14,39,0,1,1,0,816,3270,2018,0.785714,0.071429,0.071429,0.0,0.000000,0.0,0.404762,0.023810,0.023810,0.0,0.000000,0.0,0.380952,0.047619,0.047619,0.0,0.000000,0.0
460,740,名古屋,30.0,松本　孝平,0.0,FW,1994-07-31,186.0,85.0,480,2.0,,,,,0.0,0.0,,,,,-,-,名古屋,-,-,2,-,-,0,-,-,0,-,-,0,2018,,,,,,,,,,,,,,,,,,
461,742,名古屋,32.0,深堀　隼平,106.0,FW,1998-06-29,178.0,72.0,400,2.0,1.0,,0.0,0.0,2.0,0.0,,,,,-,名古屋,名古屋,-,1,2,-,0,2,-,0,0,-,0,60,2018,0.000000,0.023810,0.023810,0.0,0.071429,0.0,0.000000,0.023810,0.000000,0.0,0.023810,0.0,0.000000,0.000000,0.023810,0.0,0.047619,0.0
462,743,名古屋,33.0,梶山　幹太,0.0,MF,1998-04-24,167.0,63.0,380,2.0,1.0,,0.0,0.0,0.0,0.0,,,,,-,名古屋,名古屋,-,1,1,-,0,0,-,0,0,-,0,0,2018,,,,,,,,,,,,,,,,,,


In [23]:
#欠損値補完・特徴量生成
for df, year in [[test_df, 2019], [train_2018_df, 2018], [train_2017_df, 2017], [train_2016_df, 2016]]:
    df["birthdate"] = pd.to_datetime(df["birthdate"])
    df["age"] = df["birthdate"].apply(lambda x: calculate_age(x, year=year))
    df.drop("birthdate", axis=1, inplace=True)
    
train_df = pd.concat([train_2018_df, train_2017_df, train_2016_df], axis=0).reset_index(drop=True)

for df in [train_df, test_df]:
    df["is_j1_play"] = ~df["j1_total_num_played"].isnull()*1
    df['is_j2_play'] = ~df['j2_total_num_played'].isnull()*1
    df['is_j3_play'] = ~df['j3_total_num_played'].isnull()*1
    df['is_na_play'] = ~df['na_total_num_played'].isnull()*1
    
    df["is_prev3_same_team"] = df.apply(is_same_team, col="prev3_team", axis=1)
    df['is_prev2_same_team'] = df.apply(is_same_team, col='prev2_team', axis=1)
    df['is_prev1_same_team'] = df.apply(is_same_team, col='prev1_team', axis=1)
    
    for colname in ["is_youth", "j1_total_num_played", "j1_total_scores", "j2_total_num_played", "j2_total_scores", "j3_total_num_played", "j3_total_scores", "na_total_num_played", "na_total_scores"]:
        df[colname] = df[colname].fillna(0)
        
    for colname in ["nationality"]:
        df[colname] = df[colname].fillna("japan")
        
    for colname in ['prev3_div', 'prev2_div', 'prev1_div', 
                    'prev3_num_played', 'prev2_num_played', 'prev1_num_played', 
                    'prev3_scores', 'prev2_scores', 'prev1_scores', 
                    'prev3_time_played', 'prev2_time_played', 'prev1_time_played']:
        df[colname] = df[colname].replace('-',0)
        df[colname] = df[colname].fillna(0)
        df[colname] = df[colname].map(int)
        
    for colname in ['rat_full_play', 'rat_out_play',
       'rat_in_play', 'rat_inout_play', 'rat_bench_play', 'rat_susp_play',
       'rat_full_play_first', 'rat_out_play_first', 'rat_in_play_first',
       'rat_inout_play_first', 'rat_bench_play_first', 'rat_susp_play_first',
       'rat_full_play_second', 'rat_out_play_second', 'rat_in_play_second',
       'rat_inout_play_second', 'rat_bench_play_second',
       'rat_susp_play_second']:
        
       # 前年の所属リーグがJ1ないしJ2のチームで、値がnullの選手は、0で補完
        ext_rows = (df["prev1_div"].isin(["1","2"])) & (df[colname].isnull())
        df.loc[ext_rows, colname] = df.loc[ext_rows, colname].fillna(0)
        
        ext_rows = (~df["prev1_div"].isin(["1", "2"])) & (df[colname].isnull())
        df.loc[ext_rows, colname] = df.loc[ext_rows, colname].fillna(-999)
        df[colname] = df[colname].map(float)
        
        

In [24]:
test_df.head(2)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,age,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,0,C大阪,1.0,圍　謙太朗,GK,190.0,90.0,750,2.0,0.0,japan,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,FC東京,C大阪,福岡,1,1,2,0,0,26,0,0,0,0,0,2340,2019,0.619048,0.0,0.0,0.0,0.333333,0.0,0.285714,0.0,0.0,0.0,0.190476,0.0,0.333333,0.0,0.0,0.0,0.142857,0.0,27,0,0,1,0,0,1,0
1,1,C大阪,2.0,松田　陸,DF,171.0,69.0,2700,4.0,0.0,japan,76.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,1,1,42,31,29,2,2,0,3733,2662,2524,2019,0.735294,0.0,0.117647,0.0,0.058824,0.0,0.411765,0.0,0.029412,0.0,0.058824,0.0,0.323529,0.0,0.088235,0.0,0.0,0.0,27,1,1,0,0,1,1,1


In [25]:
train_df.head(3)

Unnamed: 0,id,team,No,name,time_played,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,...,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,age,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,2,C大阪,2.0,松田　陸,2524.0,DF,171.0,69.0,2000,3.0,0.0,japan,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,...,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.264706,0.0,0.205882,0.0,0.029412,0.0,0.294118,0.0,0.147059,0.0,0.058824,0.0,26,1,1,0,0,0,1,1
1,5,C大阪,5.0,田中　裕介,279.0,DF,181.0,77.0,2700,4.0,0.0,japan,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,...,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.029412,0.205882,0.0,0.0,0.264706,0.0,0.088235,0.205882,0.0,0.0,0.205882,0.0,31,1,1,0,0,1,1,1
2,6,C大阪,6.0,山口　蛍,2970.0,MF,173.0,72.0,6700,10.0,1.0,japan,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,...,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.441176,0.0,0.0,0.0,0.0,0.0,27,1,1,0,1,1,1,1


In [26]:
ID = 'id'
TARGET = 'time_played'

train_x = train_df.drop([TARGET], axis=1)
train_y = train_df[TARGET]
test_x = test_df.copy()

all_x = pd.concat([train_x, test_x])

In [27]:
all_x.head(3)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,age,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,2,C大阪,2.0,松田　陸,DF,171.0,69.0,2000,3.0,0.0,japan,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.264706,0.0,0.205882,0.0,0.029412,0.0,0.294118,0.0,0.147059,0.0,0.058824,0.0,26,1,1,0,0,0,1,1
1,5,C大阪,5.0,田中　裕介,DF,181.0,77.0,2700,4.0,0.0,japan,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.029412,0.205882,0.0,0.0,0.264706,0.0,0.088235,0.205882,0.0,0.0,0.205882,0.0,31,1,1,0,0,1,1,1
2,6,C大阪,6.0,山口　蛍,MF,173.0,72.0,6700,10.0,1.0,japan,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.441176,0.0,0.0,0.0,0.0,0.0,27,1,1,0,1,1,1,1


In [28]:
# 年俸が不明な場合中央値で補完
med_salary = all_x[all_x["salary"] != "-"]["salary"].median(skipna=True)
all_x["salary"] = all_x["salary"].mask(all_x["salary"] == "-", med_salary)
all_x["salary"] = all_x["salary"].map(int)

In [29]:
# カテゴリ変数はラベルエンコーディング
cat_columns = ["position", "nationality"]
for c in cat_columns:
    le = LabelEncoder()
    all_x[c].fillna("", inplace = True)
    le.fit(all_x[c])
    all_x[c] = le.transform(all_x[c])

In [30]:
# チーム名は、複数カラムで同じ値でラベルされるように、辞書作成して変換。ただしここでは出現回数が1回のチーム名は同じ値でラベル
team_dic = {}

tmp_dic = pd.concat([all_x['team'], all_x['prev1_team'], all_x['prev2_team'], all_x['prev3_team']]).value_counts().to_dict()
tmp_list = sorted(tmp_dic.items(), key=lambda x:x[-1])
tmp_list

[('QAT', 1),
 ('清水・千葉', 1),
 ('GER・GER', 1),
 ('UAE', 1),
 ('ソニー仙台FC', 1),
 ('SCO', 1),
 ('G大阪・QAT', 1),
 ('CHN・BRA', 1),
 ('徳島・神戸', 1),
 ('KOR・AUS', 1),
 ('川崎F・名古屋', 1),
 ('FC東京・熊本', 1),
 ('清水・愛媛', 1),
 ('鹿島・山形', 1),
 ('C大阪・SUI', 1),
 ('東京V・広島', 1),
 ('SUI・新潟', 1),
 ('Honda FC', 1),
 ('THA・THA', 1),
 ('湘南・熊本', 1),
 ('湘南・鳥栖', 1),
 ('岐阜・神戸', 1),
 ('徳島・湘南', 1),
 ('BRA・THA', 1),
 ('福岡・神戸', 1),
 ('柏・POR', 1),
 ('CHN・新潟', 1),
 ('名古屋・神戸', 1),
 ('C大阪・鳥栖', 1),
 ('UAE・TUR', 1),
 ('長野・長崎', 1),
 ('札幌・THA', 1),
 ('長崎・新潟', 1),
 ('柏・仙台', 1),
 ('浦和・長崎', 1),
 ('神戸・長崎', 1),
 ('山形・浦和', 1),
 ('FC東京・鳥栖', 1),
 ('FC東京・名古屋', 1),
 ('松本・鳥栖', 1),
 ('水戸・浦和', 1),
 ('FC東京・富山', 1),
 ('鹿島・POR', 1),
 ('山口・G大阪', 1),
 ('FC東京・ENG', 1),
 ('ISR', 1),
 ('横浜FM・千葉', 1),
 ('THA・札幌', 1),
 ('新潟・長崎', 1),
 ('BRA・CYP・BRA', 1),
 ('川崎F・磐田', 1),
 ('FC東京・松本', 1),
 ('湘南・UAE', 1),
 ('神戸・SPA', 1),
 ('千葉・G大阪', 1),
 ('LUX', 1),
 ('POR・RUS', 1),
 ('ND', 1),
 ('FC東京・山口', 1),
 ('名古屋・千葉', 1),
 ('清水・鳥栖', 1),
 ('愛媛・C大阪', 1),
 ('MDA', 1),
 ('広島・A

In [31]:
label = 0
for k,v in tmp_list:
    if v == 1:
        team_dic[k] = label
    else:
        team_dic[k] = label
        label += 1
        
team_dic

{'QAT': 0,
 '清水・千葉': 0,
 'GER・GER': 0,
 'UAE': 0,
 'ソニー仙台FC': 0,
 'SCO': 0,
 'G大阪・QAT': 0,
 'CHN・BRA': 0,
 '徳島・神戸': 0,
 'KOR・AUS': 0,
 '川崎F・名古屋': 0,
 'FC東京・熊本': 0,
 '清水・愛媛': 0,
 '鹿島・山形': 0,
 'C大阪・SUI': 0,
 '東京V・広島': 0,
 'SUI・新潟': 0,
 'Honda FC': 0,
 'THA・THA': 0,
 '湘南・熊本': 0,
 '湘南・鳥栖': 0,
 '岐阜・神戸': 0,
 '徳島・湘南': 0,
 'BRA・THA': 0,
 '福岡・神戸': 0,
 '柏・POR': 0,
 'CHN・新潟': 0,
 '名古屋・神戸': 0,
 'C大阪・鳥栖': 0,
 'UAE・TUR': 0,
 '長野・長崎': 0,
 '札幌・THA': 0,
 '長崎・新潟': 0,
 '柏・仙台': 0,
 '浦和・長崎': 0,
 '神戸・長崎': 0,
 '山形・浦和': 0,
 'FC東京・鳥栖': 0,
 'FC東京・名古屋': 0,
 '松本・鳥栖': 0,
 '水戸・浦和': 0,
 'FC東京・富山': 0,
 '鹿島・POR': 0,
 '山口・G大阪': 0,
 'FC東京・ENG': 0,
 'ISR': 0,
 '横浜FM・千葉': 0,
 'THA・札幌': 0,
 '新潟・長崎': 0,
 'BRA・CYP・BRA': 0,
 '川崎F・磐田': 0,
 'FC東京・松本': 0,
 '湘南・UAE': 0,
 '神戸・SPA': 0,
 '千葉・G大阪': 0,
 'LUX': 0,
 'POR・RUS': 0,
 'ND': 0,
 'FC東京・山口': 0,
 '名古屋・千葉': 0,
 '清水・鳥栖': 0,
 '愛媛・C大阪': 0,
 'MDA': 0,
 '広島・AUS': 0,
 'FC東京・横浜FM': 0,
 '新潟・山口': 0,
 'CRO': 0,
 'BRA・甲府': 0,
 '相模原': 0,
 'JAPANサッカーカレッジ': 0,
 'FC東京・AUT': 0,
 '広島・岡山': 0,
 '川

In [32]:
colnames = ['team', 'prev1_team', 'prev2_team', 'prev3_team']
for colname in colnames:
    all_x[colname] = all_x[colname].map(team_dic)

pd.set_option("display.max_columns",61)
all_x.head()

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,age,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,2,130,2.0,松田　陸,0,171.0,69.0,2000,3.0,0.0,14,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,140,130,130,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.264706,0.0,0.205882,0.0,0.029412,0.0,0.294118,0.0,0.147059,0.0,0.058824,0.0,26,1,1,0,0,0,1,1
1,5,130,5.0,田中　裕介,0,181.0,77.0,2700,4.0,0.0,14,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,130,130,130,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.029412,0.205882,0.0,0.0,0.264706,0.0,0.088235,0.205882,0.0,0.0,0.205882,0.0,31,1,1,0,0,1,1,1
2,6,130,6.0,山口　蛍,3,173.0,72.0,6700,10.0,1.0,14,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,14,130,130,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.441176,0.0,0.0,0.0,0.0,0.0,27,1,1,0,1,1,1,1
3,7,130,7.0,水沼　宏太,3,176.0,72.0,3000,2.0,0.0,14,194.0,24.0,50.0,7.0,9.0,3.0,0.0,0.0,136,140,130,1,1,1,32,17,24,7,1,3,2353,570,1901,2018,0.264706,0.088235,0.352941,0.0,0.088235,0.0,0.029412,0.088235,0.088235,0.0,0.088235,0.0,0.235294,0.0,0.264706,0.0,0.0,0.0,28,1,1,1,0,0,0,1
4,8,130,8.0,柿谷　曜一朗,1,177.0,68.0,7200,10.0,1.0,14,113.0,39.0,168.0,23.0,0.0,0.0,18.0,5.0,48,130,130,9,2,1,4,20,34,1,5,6,0,1635,2918,2018,0.558824,0.0,0.441176,0.0,0.0,0.0,0.352941,0.0,0.147059,0.0,0.0,0.0,0.205882,0.0,0.294118,0.0,0.0,0.0,28,1,1,0,1,0,1,1


In [33]:
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)

In [34]:
# 学習
remove_cols = [ID, TARGET, 'name', 'year']
feature_cols = [col for col in list(train_x) if col not in remove_cols]
cat_cols = ['team', 'prev1_team', 'prev2_team', 'prev3_team', 'position', 'nationality']
SEED = 0

# train, validationのsplitはyearで層化抽出する
tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y, test_size=0.2, random_state=SEED, stratify=train_x['year'])

tr_x = tr_x[feature_cols]
va_x = va_x[feature_cols]
tt_x = test_x[feature_cols]

tr_x.reset_index(drop=True, inplace=True)
va_x.reset_index(drop=True, inplace=True)
tr_y.reset_index(drop=True, inplace=True)
va_y.reset_index(drop=True, inplace=True)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 14,
    'max_depth': 6,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.7,
    'min_data_in_leaf': 10,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.4,
    "lambda_l2": 0.4,
    "verbosity": -1,
    "random_state": 42,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100
}

tr_data = lgb.Dataset(tr_x, label=tr_y)
va_data = lgb.Dataset(va_x, label=va_y)

model = lgb.train(
    params, 
    tr_data, 
    categorical_feature = cat_cols,
    valid_names = ['train', 'valid'],
    valid_sets =[tr_data, va_data], 
    verbose_eval = 100,
)

va_pred = model.predict(va_x, num_iteration=model.best_iteration)
score = np.sqrt(mean_squared_error(va_y, va_pred))

pred_df = pd.DataFrame(sorted(zip(va_x.index, va_pred, va_y)), columns=['index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), tr_x.columns)), columns=['importance', 'feature'])

print(f'rmse: {score:.4f}')

New categorical_feature is ['nationality', 'position', 'prev1_team', 'prev2_team', 'prev3_team', 'team']


Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 397.574	valid's rmse: 802.574
Early stopping, best iteration is:
[30]	train's rmse: 613.303	valid's rmse: 773.598
rmse: 773.5982


In [17]:
tt_pred = model.predict(tt_x, num_iteration=model.best_iteration)
test_df[TARGET] = tt_pred

sub_df = pd.read_csv('/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/sample_submission.csv')

sub_df = pd.merge(sub_df[[ID]], test_df[[ID, TARGET]], on=ID)
sub_df.to_csv('/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/output/tutorial_submission.csv', index=False)

NameError: name 'model' is not defined