In [101]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb


In [102]:
def calculate_age(born, year):
    """year年4月1日時点での年齢を返す"""
    date = datetime(year,4,1)
    return date.year - born.year - ((date.month, date.day) < (born.month, born.day))

def is_same_team(df, col):
    return (df["team"] in df[col])*1

In [103]:
# データ読み込み
train_2018_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2018.csv')
train_2017_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2017.csv')
train_2016_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2016.csv')

train_2018_df['year'] = 2018
train_2017_df['year'] = 2017
train_2016_df['year'] = 2016

test_df = pd.read_csv("/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/test.csv")
test_df["year"] = 2019

t_df_dic = {2018: test_df, 2017: train_2018_df, 2016: train_2017_df, 2015: train_2016_df}

for year in [2018,2017,2016,2015]:
    t_df = t_df_dic[year]
    t_df["name-team"] = t_df["name"] + "-" + t_df["prev1_team"].apply(lambda x: x.split("・")[-1])
    
    eve_pla_df = pd.read_csv(f"/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/output/event_play_{year}_4div改.csv")
    
    for df in [eve_pla_df]:
        df["name-team"] = df["name"] + "-" + df["team"]
        df.drop(["name", "team"], axis=1, inplace=True)
        t_df = pd.merge(t_df, df, on="name-team", how="left")
        
    t_df.drop(["name-team"], axis=1, inplace=True)
    t_df_dic[year] = t_df
    
test_df = t_df_dic[2018]
train_2018_df = t_df_dic[2017]
train_2017_df = t_df_dic[2016]
train_2016_df = t_df_dic[2015] 

In [104]:
pd.set_option("display.max_columns",1000)
train_2018_df

Unnamed: 0,id,team,No,name,time_played,position,birthdate,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth
0,2,C大阪,2.0,松田　陸,2524.0,DF,1991-07-24,171.0,69.0,2000,3.0,,,47.0,3.0,42.0,2.0,,,,,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.000000,0.352941,0.0,0.088235,0.0,0.147059,0.000000,0.088235,0.0,0.000000,0.0,0.088235,0.000000,0.117647,0.0,0.029412,0.0,0.147059,0.000000,0.029412,0.0,0.058824,0.0,0.176471,0.000000,0.117647,0.0,0.000000,0.0
1,5,C大阪,5.0,田中　裕介,279.0,DF,1986-04-14,181.0,77.0,2700,4.0,,,223.0,10.0,42.0,1.0,,,,,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.000000,0.0,0.470588,0.0,0.000000,0.088235,0.000000,0.0,0.147059,0.0,0.029412,0.117647,0.000000,0.0,0.088235,0.0,0.058824,0.058824,0.000000,0.0,0.117647,0.0,0.029412,0.147059,0.000000,0.0,0.117647,0.0
2,6,C大阪,6.0,山口　蛍,2970.0,MF,1990-10-06,173.0,72.0,6700,10.0,1.0,,134.0,12.0,58.0,2.0,,,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.000000,0.000000,0.0,0.000000,0.0,0.235294,0.000000,0.000000,0.0,0.000000,0.0,0.235294,0.000000,0.000000,0.0,0.000000,0.0,0.235294,0.000000,0.000000,0.0,0.000000,0.0,0.235294,0.000000,0.000000,0.0,0.000000,0.0
3,7,C大阪,7.0,水沼　宏太,1671.0,MF,1990-02-22,176.0,72.0,3000,2.0,,,194.0,24.0,50.0,7.0,9.0,3.0,,,鳥栖,FC東京,C大阪,1,1,1,32,17,24,7,1,3,2353,570,1901,2018,0.264706,0.088235,0.352941,0.0,0.088235,0.0,0.000000,0.000000,0.029412,0.0,0.088235,0.0,0.029412,0.088235,0.029412,0.0,0.000000,0.0,0.088235,0.000000,0.147059,0.0,0.000000,0.0,0.147059,0.000000,0.147059,0.0,0.000000,0.0
4,8,C大阪,8.0,柿谷　曜一朗,1424.0,FW,1990-01-03,177.0,68.0,7200,10.0,1.0,,113.0,39.0,168.0,23.0,,,18.0,5.0,SUI,C大阪,C大阪,9,2,1,4,20,34,1,5,6,-,1635,2918,2018,0.558824,0.000000,0.441176,0.0,0.000000,0.0,0.176471,0.000000,0.058824,0.0,0.000000,0.0,0.176471,0.000000,0.058824,0.0,0.000000,0.0,0.029412,0.000000,0.205882,0.0,0.000000,0.0,0.176471,0.000000,0.117647,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
459,739,名古屋,29.0,和泉　竜司,2139.0,MF,1993-11-06,173.0,72.0,1400,3.0,,,14.0,1.0,39.0,1.0,,,,,名古屋,名古屋,名古屋,1,1,2,0,14,39,0,1,1,0,816,3270,2018,0.785714,0.071429,0.071429,0.0,0.000000,0.0,0.166667,0.023810,0.000000,0.0,0.000000,0.0,0.214286,0.000000,0.023810,0.0,0.000000,0.0,0.142857,0.047619,0.023810,0.0,0.000000,0.0,0.261905,0.000000,0.023810,0.0,0.000000,0.0
460,740,名古屋,30.0,松本　孝平,0.0,FW,1994-07-31,186.0,85.0,480,2.0,,,,,0.0,0.0,,,,,-,-,名古屋,-,-,2,-,-,0,-,-,0,-,-,0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
461,742,名古屋,32.0,深堀　隼平,106.0,FW,1998-06-29,178.0,72.0,400,2.0,1.0,,0.0,0.0,2.0,0.0,,,,,-,名古屋,名古屋,-,1,2,-,0,2,-,0,0,-,0,60,2018,0.000000,0.023810,0.023810,0.0,0.071429,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.023810,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.023810,0.0,0.047619,0.0,0.000000,0.000000,0.000000,0.0,0.023810,0.0
462,743,名古屋,33.0,梶山　幹太,0.0,MF,1998-04-24,167.0,63.0,380,2.0,1.0,,0.0,0.0,0.0,0.0,,,,,-,名古屋,名古屋,-,1,1,-,0,0,-,0,0,-,0,0,2018,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [105]:
# チームIDをチーム名に変換する辞書作成
team_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/team.csv')
team_dic = dict(zip(team_df['team_id'], team_df['team_name']))
team_id_list = [v for v in team_dic.values()]

#欠損値補完・特徴量生成
for df, year in [[test_df, 2019], [train_2018_df, 2018], [train_2017_df, 2017], [train_2016_df, 2016]]:
    df["birthdate"] = pd.to_datetime(df["birthdate"])
    df["age"] = df["birthdate"].apply(lambda x: calculate_age(x, year=year))
    df.drop("birthdate", axis=1, inplace=True)
    
    # 年俸が不明な場合中央値で補完
    med_salary = df[df["salary"] != "-"]["salary"].median(skipna=True)
    df["salary"] = df["salary"].mask(df["salary"] == "-", med_salary)
    df["salary"] = df["salary"].map(int)
    
    #チームIDを一時的に追加
    df["team_id"] = df["team"]
    df["team_id"] = df["team_id"].replace(team_dic)
    
    #年俸をチーム毎に標準化
    df["salary_std"] = df["salary"].copy()
    for i in team_id_list:
        s = df.loc[df["team_id"] == i, "salary"].sum()
        df.loc[df["team_id"] == i, 'salary_std'] /= s
        df.loc[df["team_id"] == i, 'salary_std'] *=100
    df["salary_std"] = df["salary_std"].map(float)
    
train_df = pd.concat([train_2018_df, train_2017_df, train_2016_df], axis=0).reset_index(drop=True)
train_df.drop(["team_id"], axis=1, inplace=True)
test_df.drop(["team_id"], axis=1, inplace=True)

for df in [train_df, test_df]:
    df["is_j1_play"] = ~df["j1_total_num_played"].isnull()*1
    df['is_j2_play'] = ~df['j2_total_num_played'].isnull()*1
    df['is_j3_play'] = ~df['j3_total_num_played'].isnull()*1
    df['is_na_play'] = ~df['na_total_num_played'].isnull()*1
    
    df["is_prev3_same_team"] = df.apply(is_same_team, col="prev3_team", axis=1)
    df['is_prev2_same_team'] = df.apply(is_same_team, col='prev2_team', axis=1)
    df['is_prev1_same_team'] = df.apply(is_same_team, col='prev1_team', axis=1)
    
    for colname in ["is_youth", "j1_total_num_played", "j1_total_scores", "j2_total_num_played", "j2_total_scores", "j3_total_num_played", "j3_total_scores", "na_total_num_played", "na_total_scores"]:
        df[colname] = df[colname].fillna(0)
        
    for colname in ["nationality"]:
        df[colname] = df[colname].fillna("japan")
        
    for colname in ['prev3_div', 'prev2_div', 'prev1_div', 
                    'prev3_num_played', 'prev2_num_played', 'prev1_num_played', 
                    'prev3_scores', 'prev2_scores', 'prev1_scores', 
                    'prev3_time_played', 'prev2_time_played', 'prev1_time_played']:
        df[colname] = df[colname].replace('-',0)
        df[colname] = df[colname].fillna(0)
        df[colname] = df[colname].map(int)
        
    for colname in ['rat_full_play', 'rat_out_play',
       'rat_in_play', 'rat_inout_play', 'rat_bench_play', 'rat_susp_play',
       'rat_full_play_first', 'rat_out_play_first', 'rat_in_play_first',
       'rat_inout_play_first', 'rat_bench_play_first', 'rat_susp_play_first',
       'rat_full_play_second', 'rat_out_play_second', 'rat_in_play_second',
       'rat_inout_play_second', 'rat_bench_play_second',
       'rat_susp_play_second',  'rat_full_play_third', 'rat_out_play_third', 'rat_in_play_third',
       'rat_inout_play_third', 'rat_bench_play_third',
       'rat_susp_play_third', 'rat_full_play_forth', 'rat_out_play_forth', 'rat_in_play_forth',
       'rat_inout_play_forth', 'rat_bench_play_forth',
       'rat_susp_play_forth']:
        
       # 前年の所属リーグがJ1ないしJ2のチームで、値がnullの選手は、0で補完
        ext_rows = (df["prev1_div"].isin(["1","2"])) & (df[colname].isnull())
        df.loc[ext_rows, colname] = df.loc[ext_rows, colname].fillna(0)
        
        ext_rows = (~df["prev1_div"].isin(["1", "2"])) & (df[colname].isnull())
        df.loc[ext_rows, colname] = df.loc[ext_rows, colname].fillna(-999)
        df[colname] = df[colname].map(float)
        
        

In [106]:
train_df["period enrollment"] = train_df["is_prev3_same_team"] + train_df["is_prev2_same_team"] + train_df["is_prev1_same_team"]
test_df["period enrollment"] = test_df["is_prev3_same_team"] + test_df["is_prev2_same_team"] + test_df["is_prev1_same_team"]

In [107]:
new_train_df = train_df[["time_played", "period enrollment"]]
new_train_df

Unnamed: 0,time_played,period enrollment
0,2524.0,2
1,279.0,3
2,2970.0,3
3,1671.0,1
4,1424.0,2
...,...,...
1409,816.0,1
1410,545.0,2
1411,1930.0,0
1412,9.0,2


In [108]:
new_train_df_plot = new_train_df.groupby(["period enrollment"]).mean()
print(new_train_df_plot)
new_train_df_plot = new_train_df.groupby(["period enrollment"]).median()
print(new_train_df_plot)

                   time_played
period enrollment             
0                  1159.217544
1                   992.011396
2                  1160.451852
3                  1357.352362
                   time_played
period enrollment             
0                       1039.0
1                        600.0
2                        897.0
3                       1289.0


In [109]:
#在籍年数ごとにデータを分ける
train_df_p3 = train_df.loc[train_df["period enrollment"]==3]
train_df_p2 = train_df.loc[train_df["period enrollment"]==2]
train_df_p1 = train_df.loc[train_df["period enrollment"]==1]
train_df_p0 = train_df.loc[train_df["period enrollment"]==0]
train_df_p0.head(50)

Unnamed: 0,id,team,No,name,time_played,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team,period enrollment
8,13,C大阪,13.0,高木　俊幸,1868.0,FW,170.0,64.0,2000,1.0,0.0,japan,167.0,24.0,30.0,6.0,0.0,0.0,0.0,0.0,浦和,浦和,浦和,1,1,1,21,14,12,2,2,0,881,829,591,2018,0.0,0.176471,0.176471,0.0,0.176471,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.029412,0.0,0.029412,0.0,0.0,0.029412,0.0,0.0,0.088235,0.0,0.0,0.088235,0.147059,0.0,0.058824,0.0,26,2.354326,1,1,0,0,0,0,0,0
11,18,C大阪,18.0,ヤン　ドンヒョン,849.0,FW,186.0,80.0,6500,1.0,0.0,korea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,KOR,KOR,KOR,9,9,9,30,32,36,8,13,19,0,0,0,2018,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,32,7.65156,0,0,0,0,0,0,0,0
17,32,C大阪,32.0,田中　亜土夢,228.0,MF,167.0,68.0,4000,1.0,0.0,japan,200.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,FIN,FIN,FIN,9,9,9,31,17,33,8,5,7,0,0,0,2018,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,30,4.708652,1,0,0,0,0,0,0,0
26,70,FC東京,9.0,ディエゴ　オリヴェイラ,2616.0,FW,179.0,78.0,5300,1.0,0.0,brazil,57.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,BRA・BRA,柏,柏,9,1,1,46,30,27,8,12,5,0,2482,1331,2018,0.264706,0.411765,0.117647,0.0,0.029412,0.029412,0.176471,0.029412,0.0,0.0,0.0,0.0,0.0,0.205882,0.0,0.0,0.029412,0.0,0.0,0.088235,0.058824,0.0,0.0,0.029412,0.088235,0.088235,0.058824,0.0,0.0,0.0,27,6.5904,1,0,0,0,0,0,0,0
30,75,FC東京,17.0,富樫　敬真,573.0,FW,178.0,75.0,1400,1.0,0.0,japan,38.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,横浜FM,横浜FM,横浜FM,1,1,1,4,18,16,1,5,2,59,868,563,2018,0.0,0.294118,0.176471,0.0,0.205882,0.0,0.0,0.0,0.058824,0.0,0.058824,0.0,0.0,0.088235,0.0,0.0,0.0,0.0,0.0,0.117647,0.088235,0.0,0.029412,0.0,0.0,0.088235,0.029412,0.0,0.117647,0.0,24,1.74086,1,0,0,0,0,0,0,0
38,93,FC東京,39.0,大森　晃太郎,2388.0,MF,170.0,70.0,4000,1.0,0.0,japan,110.0,15.0,16.0,1.0,2.0,0.0,0.0,0.0,G大阪,G大阪,神戸,1,1,1,30,25,26,3,3,4,1640,1377,1704,2018,0.235294,0.147059,0.382353,0.0,0.058824,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.0,0.0,0.0,0.029412,0.088235,0.0,0.029412,0.0,0.0,0.117647,0.088235,0.0,0.029412,0.0,25,4.973887,1,1,1,0,0,0,0,0
52,135,G大阪,13.0,菅沼　駿哉,843.0,DF,182.0,78.0,2200,4.0,1.0,japan,37.0,2.0,147.0,2.0,0.0,0.0,0.0,0.0,京都,京都,山形,2,2,2,34,39,40,1,1,0,3060,3501,3577,2018,0.928571,0.0,0.02381,0.0,0.0,0.02381,0.238095,0.0,0.0,0.0,0.0,0.0,0.238095,0.0,0.0,0.0,0.0,0.0,0.190476,0.0,0.02381,0.0,0.0,0.0,0.261905,0.0,0.0,0.0,0.0,0.02381,27,2.466368,1,1,0,0,0,0,0,0
58,143,G大阪,21.0,矢島　慎也,75.0,MF,171.0,67.0,2600,1.0,0.0,japan,23.0,2.0,72.0,13.0,3.0,1.0,0.0,0.0,岡山,岡山,浦和,2,2,1,37,35,11,8,5,1,2709,3101,591,2018,0.058824,0.117647,0.147059,0.0,0.264706,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.029412,0.0,0.0,0.088235,0.0,0.0,0.058824,0.029412,0.0,0.029412,0.0,0.058824,0.029412,0.117647,0.0,0.088235,0.0,24,2.914798,1,1,1,0,0,0,0,0
69,175,浦和,7.0,武富　孝介,302.0,MF,173.0,69.0,2900,1.0,0.0,japan,101.0,17.0,110.0,25.0,0.0,0.0,0.0,0.0,柏,柏,柏,1,1,1,28,26,26,2,3,9,2019,1804,1522,2018,0.147059,0.235294,0.352941,0.029412,0.058824,0.0,0.029412,0.058824,0.058824,0.0,0.0,0.0,0.058824,0.0,0.147059,0.0,0.0,0.0,0.058824,0.058824,0.058824,0.0,0.058824,0.0,0.0,0.117647,0.088235,0.029412,0.0,0.0,27,2.467245,1,1,0,0,0,0,0,0
72,178,浦和,11.0,マルティノス,446.0,MF,183.0,70.0,7000,1.0,0.0,curacao,53.0,9.0,0.0,0.0,0.0,0.0,6.0,1.0,ROU,横浜FM,横浜FM,9,1,1,21,24,29,4,4,5,0,1691,2375,2018,0.5,0.058824,0.294118,0.0,0.0,0.088235,0.088235,0.0,0.117647,0.0,0.0,0.029412,0.205882,0.0,0.029412,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.0,0.0,0.117647,0.058824,0.029412,0.0,0.0,0.058824,27,5.955419,1,0,0,1,0,0,0,0


In [110]:
#train_df_p0を更に分割
train_df_p0d1 = train_df_p0.loc[train_df_p0["prev1_div"]==1]
train_df_p0d2 = train_df_p0.loc[train_df_p0["prev1_div"]==2]
train_df_p0d3 = train_df_p0.loc[train_df_p0["prev1_div"]==3]
train_df_p0d4 = train_df_p0.loc[train_df_p0["prev1_div"]==4]
train_df_p0d9 = train_df_p0.loc[train_df_p0["prev1_div"]==9]
train_df_p0d0 = train_df_p0.loc[train_df_p0["prev1_div"]==0]

In [126]:
print(train_df_p3.shape)
print(train_df_p2.shape)
print(train_df_p1.shape)
print(train_df_p0.shape)
print(train_df_p0d1.shape)
print(train_df_p0d2.shape)
print(train_df_p0d3.shape)
print(train_df_p0d4.shape)
print(train_df_p0d9.shape)
print(train_df_p0d0.shape)

(508, 76)
(270, 76)
(351, 76)
(285, 76)
(130, 76)
(64, 76)
(4, 76)
(1, 76)
(53, 76)
(33, 76)


In [127]:
test_df_p3 = test_df.loc[test_df["period enrollment"]==3]
test_df_p2 = test_df.loc[test_df["period enrollment"]==2]
test_df_p1 = test_df.loc[test_df["period enrollment"]==1]
test_df_p0 = test_df.loc[test_df["period enrollment"]==0]

print(test_df_p3.shape)
print(test_df_p2.shape)
print(test_df_p1.shape)
print(test_df_p0.shape)

(155, 75)
(109, 75)
(113, 75)
(111, 75)


In [112]:
train_2018_df.head()

Unnamed: 0,id,team,No,name,time_played,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std
0,2,C大阪,2.0,松田　陸,2524.0,DF,171.0,69.0,2000,3.0,,,47.0,3.0,42.0,2.0,,,,,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,C大阪,2.354326
1,5,C大阪,5.0,田中　裕介,279.0,DF,181.0,77.0,2700,4.0,,,223.0,10.0,42.0,1.0,,,,,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.0,0.088235,0.0,0.0,0.147059,0.0,0.029412,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,C大阪,3.17834
2,6,C大阪,6.0,山口　蛍,2970.0,MF,173.0,72.0,6700,10.0,1.0,,134.0,12.0,58.0,2.0,,,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,C大阪,7.886992
3,7,C大阪,7.0,水沼　宏太,1671.0,MF,176.0,72.0,3000,2.0,,,194.0,24.0,50.0,7.0,9.0,3.0,,,鳥栖,FC東京,C大阪,1,1,1,32,17,24,7,1,3,2353,570,1901,2018,0.264706,0.088235,0.352941,0.0,0.088235,0.0,0.0,0.0,0.029412,0.0,0.088235,0.0,0.029412,0.088235,0.029412,0.0,0.0,0.0,0.088235,0.0,0.147059,0.0,0.0,0.0,0.147059,0.0,0.147059,0.0,0.0,0.0,28,C大阪,3.531489
4,8,C大阪,8.0,柿谷　曜一朗,1424.0,FW,177.0,68.0,7200,10.0,1.0,,113.0,39.0,168.0,23.0,,,18.0,5.0,SUI,C大阪,C大阪,9,2,1,4,20,34,1,5,6,-,1635,2918,2018,0.558824,0.0,0.441176,0.0,0.0,0.0,0.176471,0.0,0.058824,0.0,0.0,0.0,0.176471,0.0,0.058824,0.0,0.0,0.0,0.029412,0.0,0.205882,0.0,0.0,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,28,C大阪,8.475574


In [113]:
test_df.head(2)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team,period enrollment
0,0,C大阪,1.0,圍　謙太朗,GK,190.0,90.0,750,2.0,0.0,japan,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,FC東京,C大阪,福岡,1,1,2,0,0,26,0,0,0,0,0,2340,2019,0.619048,0.0,0.0,0.0,0.333333,0.0,0.02381,0.0,0.0,0.0,0.190476,0.0,0.238095,0.0,0.0,0.0,0.0,0.0,0.214286,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.0,27,0.668568,0,0,1,0,0,1,0,1
1,1,C大阪,2.0,松田　陸,DF,171.0,69.0,2700,4.0,0.0,japan,76.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,1,1,42,31,29,2,2,0,3733,2662,2524,2019,0.735294,0.0,0.117647,0.0,0.058824,0.0,0.205882,0.0,0.0,0.0,0.029412,0.0,0.205882,0.0,0.029412,0.0,0.0,0.0,0.176471,0.0,0.029412,0.0,0.029412,0.0,0.147059,0.0,0.058824,0.0,0.0,0.0,27,2.406846,1,1,0,0,1,1,1,3


In [114]:
pd.set_option("display.max_columns",1000)
train_df.head(3)

Unnamed: 0,id,team,No,name,time_played,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team,period enrollment
0,2,C大阪,2.0,松田　陸,2524.0,DF,171.0,69.0,2000,3.0,0.0,japan,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,2.354326,1,1,0,0,0,1,1,2
1,5,C大阪,5.0,田中　裕介,279.0,DF,181.0,77.0,2700,4.0,0.0,japan,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.0,0.088235,0.0,0.0,0.147059,0.0,0.029412,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,3.17834,1,1,0,0,1,1,1,3
2,6,C大阪,6.0,山口　蛍,2970.0,MF,173.0,72.0,6700,10.0,1.0,japan,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,7.886992,1,1,0,1,1,1,1,3


In [133]:
ID = 'id'
TARGET = 'time_played'

train_x = train_df.drop([TARGET], axis=1)
train_y = train_df[TARGET]
test_x = test_df.copy()

all_x = pd.concat([train_x, test_x])

# カテゴリ変数はラベルエンコーディング
cat_columns = ["position", "nationality"]
for c in cat_columns:
    le = LabelEncoder()
    all_x[c].fillna("", inplace = True)
    le.fit(all_x[c])
    all_x[c] = le.transform(all_x[c])
    
# チーム名は、複数カラムで同じ値でラベルされるように、辞書作成して変換。ただしここでは出現回数が1回のチーム名は同じ値でラベル
team_dic = {}

tmp_dic = pd.concat([all_x['team'], all_x['prev1_team'], all_x['prev2_team'], all_x['prev3_team']]).value_counts().to_dict()
tmp_list = sorted(tmp_dic.items(), key=lambda x:x[-1])

label = 0
for k,v in tmp_list:
    if v == 1:
        team_dic[k] = label
    else:
        team_dic[k] = label
        label += 1
        

colnames = ['team', 'prev1_team', 'prev2_team', 'prev3_team']
for colname in colnames:
    all_x[colname] = all_x[colname].map(team_dic)

pd.set_option("display.max_columns",61)

train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)

#学習用にデータを四種類に分ける
train_x_p3 = train_x.loc[train_x["period enrollment"]==3]
train_x_p2 = train_x.loc[train_x["period enrollment"]==2]
train_x_p1 = train_x.loc[train_x["period enrollment"]==1]
train_x_p0 = train_x.loc[train_x["period enrollment"]==0]

train_y_p3 = train_df_p3[TARGET]
train_y_p2 = train_df_p2[TARGET]
train_y_p1 = train_df_p1[TARGET]
train_y_p0 = train_df_p0[TARGET]

# 学習の前準備
remove_cols = [ID, TARGET, 'name', 'year']
feature_cols = [col for col in list(train_x) if col not in remove_cols]
cat_cols = ['team', 'prev1_team', 'prev2_team', 'prev3_team', 'position', 'nationality']
SEED = 0

#学習スタート
# train, validationのsplitはyearで層化抽出する
#model3
tr_x, va_x, tr_y, va_y = train_test_split(train_x_p3, train_y_p3, test_size=0.2, random_state=SEED, stratify=train_x_p3['year'])

tr_x = tr_x[feature_cols]
va_x = va_x[feature_cols]
tt_x3 = test_x[feature_cols]

tr_x.reset_index(drop=True, inplace=True)
va_x.reset_index(drop=True, inplace=True)
tr_y.reset_index(drop=True, inplace=True)
va_y.reset_index(drop=True, inplace=True)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 14,
    'max_depth': 6,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.7,
    'min_data_in_leaf': 10,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.4,
    "lambda_l2": 0.4,
    "verbosity": -1,
    "random_state": 42,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100
}

tr_data = lgb.Dataset(tr_x, label=tr_y)
va_data = lgb.Dataset(va_x, label=va_y)

model3 = lgb.train(
    params, 
    tr_data, 
    categorical_feature = cat_cols,
    valid_names = ['train', 'valid'],
    valid_sets =[tr_data, va_data], 
    verbose_eval = 100,
)

va_pred = model.predict(va_x, num_iteration=model.best_iteration)
score = np.sqrt(mean_squared_error(va_y, va_pred))

pred_df = pd.DataFrame(sorted(zip(va_x.index, va_pred, va_y)), columns=['index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), tr_x.columns)), columns=['importance', 'feature'])

print(f'rmse: {score:.4f}')

#model2
tr_x, va_x, tr_y, va_y = train_test_split(train_x_p2, train_y_p2, test_size=0.2, random_state=SEED, stratify=train_x_p2['year'])

tr_x = tr_x[feature_cols]
va_x = va_x[feature_cols]
tt_x2 = test_x[feature_cols]

tr_x.reset_index(drop=True, inplace=True)
va_x.reset_index(drop=True, inplace=True)
tr_y.reset_index(drop=True, inplace=True)
va_y.reset_index(drop=True, inplace=True)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 14,
    'max_depth': 6,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.7,
    'min_data_in_leaf': 10,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.4,
    "lambda_l2": 0.4,
    "verbosity": -1,
    "random_state": 42,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100
}

tr_data = lgb.Dataset(tr_x, label=tr_y)
va_data = lgb.Dataset(va_x, label=va_y)

model2 = lgb.train(
    params, 
    tr_data, 
    categorical_feature = cat_cols,
    valid_names = ['train', 'valid'],
    valid_sets =[tr_data, va_data], 
    verbose_eval = 100,
)

va_pred = model.predict(va_x, num_iteration=model.best_iteration)
score = np.sqrt(mean_squared_error(va_y, va_pred))

pred_df = pd.DataFrame(sorted(zip(va_x.index, va_pred, va_y)), columns=['index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), tr_x.columns)), columns=['importance', 'feature'])

print(f'rmse: {score:.4f}')

#model1
tr_x, va_x, tr_y, va_y = train_test_split(train_x_p1, train_y_p1, test_size=0.2, random_state=SEED, stratify=train_x_p1['year'])

tr_x = tr_x[feature_cols]
va_x = va_x[feature_cols]
tt_x1 = test_x[feature_cols]

tr_x.reset_index(drop=True, inplace=True)
va_x.reset_index(drop=True, inplace=True)
tr_y.reset_index(drop=True, inplace=True)
va_y.reset_index(drop=True, inplace=True)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 14,
    'max_depth': 6,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.7,
    'min_data_in_leaf': 10,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.4,
    "lambda_l2": 0.4,
    "verbosity": -1,
    "random_state": 42,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100
}

tr_data = lgb.Dataset(tr_x, label=tr_y)
va_data = lgb.Dataset(va_x, label=va_y)

model1 = lgb.train(
    params, 
    tr_data, 
    categorical_feature = cat_cols,
    valid_names = ['train', 'valid'],
    valid_sets =[tr_data, va_data], 
    verbose_eval = 100,
)

va_pred = model.predict(va_x, num_iteration=model.best_iteration)
score = np.sqrt(mean_squared_error(va_y, va_pred))

pred_df = pd.DataFrame(sorted(zip(va_x.index, va_pred, va_y)), columns=['index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), tr_x.columns)), columns=['importance', 'feature'])

print(f'rmse: {score:.4f}')

#model0
tr_x, va_x, tr_y, va_y = train_test_split(train_x_p0, train_y_p0, test_size=0.2, random_state=SEED, stratify=train_x_p0['year'])

tr_x = tr_x[feature_cols]
va_x = va_x[feature_cols]
tt_x0 = test_x[feature_cols]

tr_x.reset_index(drop=True, inplace=True)
va_x.reset_index(drop=True, inplace=True)
tr_y.reset_index(drop=True, inplace=True)
va_y.reset_index(drop=True, inplace=True)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 14,
    'max_depth': 6,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.7,
    'min_data_in_leaf': 10,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.4,
    "lambda_l2": 0.4,
    "verbosity": -1,
    "random_state": 42,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100
}

tr_data = lgb.Dataset(tr_x, label=tr_y)
va_data = lgb.Dataset(va_x, label=va_y)

model0 = lgb.train(
    params, 
    tr_data, 
    categorical_feature = cat_cols,
    valid_names = ['train', 'valid'],
    valid_sets =[tr_data, va_data], 
    verbose_eval = 100,
)

va_pred = model.predict(va_x, num_iteration=model.best_iteration)
score = np.sqrt(mean_squared_error(va_y, va_pred))

pred_df = pd.DataFrame(sorted(zip(va_x.index, va_pred, va_y)), columns=['index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), tr_x.columns)), columns=['importance', 'feature'])

print(f'rmse: {score:.4f}')

New categorical_feature is ['nationality', 'position', 'prev1_team', 'prev2_team', 'prev3_team', 'team']


Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 189.731	valid's rmse: 869.073
Early stopping, best iteration is:
[17]	train's rmse: 552.521	valid's rmse: 805.919
rmse: 694.6452


New categorical_feature is ['nationality', 'position', 'prev1_team', 'prev2_team', 'prev3_team', 'team']
New categorical_feature is ['nationality', 'position', 'prev1_team', 'prev2_team', 'prev3_team', 'team']


Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 146.624	valid's rmse: 876.411
Early stopping, best iteration is:
[21]	train's rmse: 510.344	valid's rmse: 826.368
rmse: 606.4086
Training until validation scores don't improve for 100 rounds


New categorical_feature is ['nationality', 'position', 'prev1_team', 'prev2_team', 'prev3_team', 'team']


[100]	train's rmse: 192.619	valid's rmse: 703.941
Early stopping, best iteration is:
[33]	train's rmse: 414.896	valid's rmse: 684.655
rmse: 563.6259
Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 223.649	valid's rmse: 882.906
Early stopping, best iteration is:
[18]	train's rmse: 635.88	valid's rmse: 843.06
rmse: 658.5035


In [134]:
sub_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/sample_submission.csv')

tt_pred3 = model3.predict(tt_x3, num_iteration=model3.best_iteration)
test_df[TARGET] = tt_pred3
sub_df = pd.merge(sub_df[[ID]], test_df[[ID, TARGET]], on=ID)

tt_pred2 = model2.predict(tt_x2, num_iteration=model2.best_iteration)
test_df[TARGET] = tt_pred2
sub_df = pd.merge(sub_df[[ID]], test_df[[ID, TARGET]], on=ID)

tt_pred1 = model1.predict(tt_x1, num_iteration=model1.best_iteration)
test_df[TARGET] = tt_pred1
sub_df = pd.merge(sub_df[[ID]], test_df[[ID, TARGET]], on=ID)

tt_pred0 = model0.predict(tt_x0, num_iteration=model0.best_iteration)
test_df[TARGET] = tt_pred0
sub_df = pd.merge(sub_df[[ID]], test_df[[ID, TARGET]], on=ID)

sub_df.to_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/output/モデル４つ_submission.csv', index=False)