In [54]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, GroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb


In [55]:
def calculate_age(born, year):
    """year年4月1日時点での年齢を返す"""
    date = datetime(year,4,1)
    return date.year - born.year - ((date.month, date.day) < (born.month, born.day))

def is_same_team(df, col):
    return (df["team"] in df[col])*1

In [56]:
# データ読み込み
train_2018_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2018.csv')
train_2017_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2017.csv')
train_2016_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/train_2016.csv')

train_2018_df['year'] = 2018
train_2017_df['year'] = 2017
train_2016_df['year'] = 2016

test_df = pd.read_csv("/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/test.csv")
test_df["year"] = 2019

t_df_dic = {2018: test_df, 2017: train_2018_df, 2016: train_2017_df, 2015: train_2016_df}

for year in [2018,2017,2016,2015]:
    t_df = t_df_dic[year]
    t_df["name-team"] = t_df["name"] + "-" + t_df["prev1_team"].apply(lambda x: x.split("・")[-1])
    
    eve_pla_df = pd.read_csv(f"/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/output/event_play_{year}_4div改.csv")
    
    for df in [eve_pla_df]:
        df["name-team"] = df["name"] + "-" + df["team"]
        df.drop(["name", "team"], axis=1, inplace=True)
        t_df = pd.merge(t_df, df, on="name-team", how="left")
        
    t_df.drop(["name-team"], axis=1, inplace=True)
    t_df_dic[year] = t_df
    
test_df = t_df_dic[2018]
train_2018_df = t_df_dic[2017]
train_2017_df = t_df_dic[2016]
train_2016_df = t_df_dic[2015] 

In [57]:
# チームIDをチーム名に変換する辞書作成
team_df = pd.read_csv('/Users/onehe/Desktop/プログラミング/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/team.csv')
team_dic = dict(zip(team_df['team_id'], team_df['team_name']))
team_id_list = [v for v in team_dic.values()]
team_id_list_swap = [k for k in team_dic.keys()]

#欠損値補完・特徴量生成
test_df["time_played"] = 0
for df, year in [[test_df, 2019], [train_2018_df, 2018], [train_2017_df, 2017], [train_2016_df, 2016]]:
    df["birthdate"] = pd.to_datetime(df["birthdate"])
    df["age"] = df["birthdate"].apply(lambda x: calculate_age(x, year=year))
    df.drop("birthdate", axis=1, inplace=True)
    
    # 年俸が不明な場合中央値で補完
    med_salary = df[df["salary"] != "-"]["salary"].median(skipna=True)
    df["salary"] = df["salary"].mask(df["salary"] == "-", med_salary)
    df["salary"] = df["salary"].map(int)
    
    #チームIDを一時的に追加
    df["team_id"] = df["team"]
    df["team_id"] = df["team_id"].replace(team_dic)
    
    #年俸をチーム毎に標準化
    df["salary_std"] = df["salary"].copy()
    for i in team_id_list:
        s = df.loc[df["team_id"] == i, "salary"].sum()
        df.loc[df["team_id"] == i, 'salary_std'] /= s
        df.loc[df["team_id"] == i, 'salary_std'] *=100
    df["salary_std"] = df["salary_std"].map(float)
      
    
train_df = pd.concat([train_2018_df, train_2017_df, train_2016_df], axis=0).reset_index(drop=True)
#train_df.drop(["team_id"], axis=1, inplace=True)
#test_df.drop(["team_id"], axis=1, inplace=True)
test_df.drop(["time_played"], axis=1, inplace=True)

for df in [train_df, test_df]:
    df["is_j1_play"] = ~df["j1_total_num_played"].isnull()*1
    df['is_j2_play'] = ~df['j2_total_num_played'].isnull()*1
    df['is_j3_play'] = ~df['j3_total_num_played'].isnull()*1
    df['is_na_play'] = ~df['na_total_num_played'].isnull()*1
    
    df["is_prev3_same_team"] = df.apply(is_same_team, col="prev3_team", axis=1)
    df['is_prev2_same_team'] = df.apply(is_same_team, col='prev2_team', axis=1)
    df['is_prev1_same_team'] = df.apply(is_same_team, col='prev1_team', axis=1)
    
    for colname in ["is_youth", "j1_total_num_played", "j1_total_scores", "j2_total_num_played", "j2_total_scores", "j3_total_num_played", "j3_total_scores", "na_total_num_played", "na_total_scores"]:
        df[colname] = df[colname].fillna(0)
        
    for colname in ["nationality"]:
        df[colname] = df[colname].fillna("japan")
        
    for colname in ['prev3_div', 'prev2_div', 'prev1_div', 
                    'prev3_num_played', 'prev2_num_played', 'prev1_num_played', 
                    'prev3_scores', 'prev2_scores', 'prev1_scores', 
                    'prev3_time_played', 'prev2_time_played', 'prev1_time_played']:
        df[colname] = df[colname].replace('-',0)
        df[colname] = df[colname].fillna(0)
        df[colname] = df[colname].map(int)
        
    for colname in ['rat_full_play', 'rat_out_play',
       'rat_in_play', 'rat_inout_play', 'rat_bench_play', 'rat_susp_play',
       'rat_full_play_first', 'rat_out_play_first', 'rat_in_play_first',
       'rat_inout_play_first', 'rat_bench_play_first', 'rat_susp_play_first',
       'rat_full_play_second', 'rat_out_play_second', 'rat_in_play_second',
       'rat_inout_play_second', 'rat_bench_play_second',
       'rat_susp_play_second']:
        
       # 前年の所属リーグがJ1ないしJ2のチームで、値がnullの選手は、0で補完
        ext_rows = (df["prev1_div"].isin(["1","2"])) & (df[colname].isnull())
        df.loc[ext_rows, colname] = df.loc[ext_rows, colname].fillna(0)
        
        ext_rows = (~df["prev1_div"].isin(["1", "2"])) & (df[colname].isnull())
        df.loc[ext_rows, colname] = df.loc[ext_rows, colname].fillna(-999)
        df[colname] = df[colname].map(float)
        
        

In [68]:
type(team_id_list_swap[1])

int

In [58]:
train_2018_df.head()

Unnamed: 0,id,team,No,name,time_played,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std
0,2,C大阪,2.0,松田　陸,2524.0,DF,171.0,69.0,2000,3.0,,,47.0,3.0,42.0,2.0,,,,,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,C大阪,2.354326
1,5,C大阪,5.0,田中　裕介,279.0,DF,181.0,77.0,2700,4.0,,,223.0,10.0,42.0,1.0,,,,,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.0,0.088235,0.0,0.0,0.147059,0.0,0.029412,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,C大阪,3.17834
2,6,C大阪,6.0,山口　蛍,2970.0,MF,173.0,72.0,6700,10.0,1.0,,134.0,12.0,58.0,2.0,,,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,C大阪,7.886992
3,7,C大阪,7.0,水沼　宏太,1671.0,MF,176.0,72.0,3000,2.0,,,194.0,24.0,50.0,7.0,9.0,3.0,,,鳥栖,FC東京,C大阪,1,1,1,32,17,24,7,1,3,2353,570,1901,2018,0.264706,0.088235,0.352941,0.0,0.088235,0.0,0.0,0.0,0.029412,0.0,0.088235,0.0,0.029412,0.088235,0.029412,0.0,0.0,0.0,0.088235,0.0,0.147059,0.0,0.0,0.0,0.147059,0.0,0.147059,0.0,0.0,0.0,28,C大阪,3.531489
4,8,C大阪,8.0,柿谷　曜一朗,1424.0,FW,177.0,68.0,7200,10.0,1.0,,113.0,39.0,168.0,23.0,,,18.0,5.0,SUI,C大阪,C大阪,9,2,1,4,20,34,1,5,6,-,1635,2918,2018,0.558824,0.0,0.441176,0.0,0.0,0.0,0.176471,0.0,0.058824,0.0,0.0,0.0,0.176471,0.0,0.058824,0.0,0.0,0.0,0.029412,0.0,0.205882,0.0,0.0,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,28,C大阪,8.475574


In [59]:
test_df.head(2)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,0,C大阪,1.0,圍　謙太朗,GK,190.0,90.0,750,2.0,0.0,japan,0.0,0.0,0.0,0.0,30.0,0.0,0.0,0.0,FC東京,C大阪,福岡,1,1,2,0,0,26,0,0,0,0,0,2340,2019,0.619048,0.0,0.0,0.0,0.333333,0.0,0.02381,0.0,0.0,0.0,0.190476,0.0,0.238095,0.0,0.0,0.0,0.0,0.0,0.214286,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.0,27,C大阪,0.668568,0,0,1,0,0,1,0
1,1,C大阪,2.0,松田　陸,DF,171.0,69.0,2700,4.0,0.0,japan,76.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,1,1,42,31,29,2,2,0,3733,2662,2524,2019,0.735294,0.0,0.117647,0.0,0.058824,0.0,0.205882,0.0,0.0,0.0,0.029412,0.0,0.205882,0.0,0.029412,0.0,0.0,0.0,0.176471,0.0,0.029412,0.0,0.029412,0.0,0.147059,0.0,0.058824,0.0,0.0,0.0,27,C大阪,2.406846,1,1,0,0,1,1,1


In [60]:
train_df.head(3)

Unnamed: 0,id,team,No,name,time_played,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,2,C大阪,2.0,松田　陸,2524.0,DF,171.0,69.0,2000,3.0,0.0,japan,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,C大阪,2.354326,1,1,0,0,0,1,1
1,5,C大阪,5.0,田中　裕介,279.0,DF,181.0,77.0,2700,4.0,0.0,japan,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.0,0.088235,0.0,0.0,0.147059,0.0,0.029412,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,C大阪,3.17834,1,1,0,0,1,1,1
2,6,C大阪,6.0,山口　蛍,2970.0,MF,173.0,72.0,6700,10.0,1.0,japan,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,C大阪,7.886992,1,1,0,1,1,1,1


In [61]:
ID = 'id'
TARGET = 'time_played'

train_x = train_df.drop([TARGET], axis=1)
train_y = train_df[TARGET]
test_x = test_df.copy()

all_x = pd.concat([train_x, test_x])

In [62]:
all_x.head(3)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,2,C大阪,2.0,松田　陸,DF,171.0,69.0,2000,3.0,0.0,japan,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,C大阪,2.354326,1,1,0,0,0,1,1
1,5,C大阪,5.0,田中　裕介,DF,181.0,77.0,2700,4.0,0.0,japan,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.0,0.088235,0.0,0.0,0.147059,0.0,0.029412,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,C大阪,3.17834,1,1,0,0,1,1,1
2,6,C大阪,6.0,山口　蛍,MF,173.0,72.0,6700,10.0,1.0,japan,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,C大阪,7.886992,1,1,0,1,1,1,1


In [63]:
team_dic_swap = {v: k for k,v in team_dic.items()}
all_x["team_id"] = all_x["team_id"].map(team_dic_swap)
pd.set_option("display.max_columns", 100)
all_x.head(3)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team
0,2,C大阪,2.0,松田　陸,DF,171.0,69.0,2000,3.0,0.0,japan,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,20,2.354326,1,1,0,0,0,1,1
1,5,C大阪,5.0,田中　裕介,DF,181.0,77.0,2700,4.0,0.0,japan,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.0,0.088235,0.0,0.0,0.147059,0.0,0.029412,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,20,3.17834,1,1,0,0,1,1,1
2,6,C大阪,6.0,山口　蛍,MF,173.0,72.0,6700,10.0,1.0,japan,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,20,7.886992,1,1,0,1,1,1,1


In [64]:
team_dic

{1: '鹿島',
 10: '広島',
 12: '湘南',
 13: '磐田',
 14: '札幌',
 18: '神戸',
 20: 'C大阪',
 21: '川崎F',
 22: 'FC東京',
 3: '浦和',
 31: '大分',
 33: '鳥栖',
 46: '松本',
 5: '横浜FM',
 54: '仙台',
 7: '清水',
 8: '名古屋',
 9: 'G大阪',
 11: '柏',
 2: '千葉',
 23: '福岡',
 24: '京都',
 27: '大宮',
 275: '金沢',
 277: '琉球',
 28: '甲府',
 29: '山形',
 330: '山口',
 338: '鹿児島',
 34: '横浜FC',
 36: '徳島',
 37: '愛媛',
 39: '岐阜',
 4: '東京V',
 40: '栃木',
 42: '岡山',
 45: '町田',
 47: '長崎',
 78: '新潟',
 94: '水戸',
 38: '熊本',
 48: '讃岐',
 35: '群馬',
 43: '北九州'}

In [70]:
#出場時間をチーム毎に標準化

all_x["time_std_1"] = all_x["prev1_time_played"].copy()
all_x["time_std_2"] = all_x["prev2_time_played"].copy()
all_x["time_std_3"] = all_x["prev3_time_played"].copy()
all_x.head(3)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,prev1_scores,prev3_time_played,prev2_time_played,prev1_time_played,year,rat_full_play,rat_out_play,rat_in_play,rat_inout_play,rat_bench_play,rat_susp_play,rat_full_play_first,rat_out_play_first,rat_in_play_first,rat_inout_play_first,rat_bench_play_first,rat_susp_play_first,rat_full_play_second,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team,time_std_1,time_std_2,time_std_3
0,2,C大阪,2.0,松田　陸,DF,171.0,69.0,2000,3.0,0.0,japan,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,FC東京,C大阪,C大阪,1,2,1,9,42,31,0,2,2,473,3733,2662,2018,0.558824,0.0,0.352941,0.0,0.088235,0.0,0.147059,0.0,0.088235,0.0,0.0,0.0,0.088235,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,20,2.354326,1,1,0,0,0,1,1,2662,3733,473
1,5,C大阪,5.0,田中　裕介,DF,181.0,77.0,2700,4.0,0.0,japan,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,C大阪,C大阪,C大阪,2,2,1,9,33,18,1,0,0,664,2725,492,2018,0.117647,0.411765,0.0,0.0,0.470588,0.0,0.0,0.088235,0.0,0.0,0.147059,0.0,0.029412,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,20,3.17834,1,1,0,0,1,1,1,492,2725,664
2,6,C大阪,6.0,山口　蛍,MF,173.0,72.0,6700,10.0,1.0,japan,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,C大阪・GER,C大阪,C大阪,2,2,1,35,19,32,1,1,2,3150,1800,2880,2018,0.941176,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,20,7.886992,1,1,0,1,1,1,1,2880,1800,3150


In [73]:
#出場時間をチーム毎に標準化

all_x["time_std_1"] = all_x["prev1_time_played"].copy()
all_x["time_std_2"] = all_x["prev2_time_played"].copy()
all_x["time_std_3"] = all_x["prev3_time_played"].copy()

for i in team_id_list_swap:
    s = all_x.loc[all_x["team_id"] == i, "prev1_time_played"].sum()
    all_x.loc[all_x["team_id"] == i, 'time_std_1'] /= s
    s = all_x.loc[all_x["team_id"] == i, "prev2_time_played"].sum()
    all_x.loc[all_x["team_id"] == i, 'time_std_2'] /= s
    s = all_x.loc[all_x["team_id"] == i, "prev3_time_played"].sum()
    all_x.loc[all_x["team_id"] == i, 'time_std_3'] /= s        
all_x["time_std_1"] = all_x["time_std_1"].map(float)
all_x["time_std_2"] = all_x["time_std_2"].map(float)
all_x["time_std_3"] = all_x["time_std_3"].map(float)  

In [80]:
all_x.head(3)

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,...,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team,time_std_1,time_std_2,time_std_3
0,2,130,2.0,松田　陸,0,171.0,69.0,2000,3.0,0.0,14,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,141,130,130,1,2,1,9,42,31,0,2,...,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,20,2.354326,1,1,0,0,0,1,1,0.024729,0.039568,0.00583
1,5,130,5.0,田中　裕介,0,181.0,77.0,2700,4.0,0.0,14,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,130,130,130,2,2,1,9,33,18,1,0,...,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,20,3.17834,1,1,0,0,1,1,1,0.00457,0.028883,0.008185
2,6,130,6.0,山口　蛍,3,173.0,72.0,6700,10.0,1.0,14,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,3,130,130,2,2,1,35,19,32,1,1,...,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,20,7.886992,1,1,0,1,1,1,1,0.026754,0.019079,0.038828


In [74]:
# カテゴリ変数はラベルエンコーディング
cat_columns = ["position", "nationality"]
for c in cat_columns:
    le = LabelEncoder()
    all_x[c].fillna("", inplace = True)
    le.fit(all_x[c])
    all_x[c] = le.transform(all_x[c])

In [75]:
# チーム名は、複数カラムで同じ値でラベルされるように、辞書作成して変換。ただしここでは出現回数が1回のチーム名は同じ値でラベル
team_dic = {}

tmp_dic = pd.concat([all_x['team'], all_x['prev1_team'], all_x['prev2_team'], all_x['prev3_team']]).value_counts().to_dict()
tmp_list = sorted(tmp_dic.items(), key=lambda x:x[-1])
tmp_list

[('BRA・CYP・BRA', 1),
 ('JAPANサッカーカレッジ', 1),
 ('広島・FC東京', 1),
 ('SUI・新潟', 1),
 ('長崎・清水', 1),
 ('FC東京・山口', 1),
 ('仙台・大宮', 1),
 ('神戸・SPA', 1),
 ('FC東京・松本', 1),
 ('BRA・甲府', 1),
 ('G大阪・QAT', 1),
 ('AUT・UAE', 1),
 ('広島・岡山', 1),
 ('浦和・長崎', 1),
 ('C大阪・鳥栖', 1),
 ('大分・BEL', 1),
 ('MDA', 1),
 ('川崎F・磐田', 1),
 ('BRA・THA', 1),
 ('千葉・大分', 1),
 ('名古屋・神戸', 1),
 ('松本・鳥栖', 1),
 ('鹿島・POR', 1),
 ('ITA・ENG・COL', 1),
 ('柏・仙台', 1),
 ('新潟・山口', 1),
 ('山口・長崎', 1),
 ('THA・THA', 1),
 ('清水・今治', 1),
 ('FC東京・鳥栖', 1),
 ('愛媛・C大阪', 1),
 ('ISR', 1),
 ('横浜FC・鳥栖', 1),
 ('UAE・TUR', 1),
 ('THI', 1),
 ('柏・名古屋', 1),
 ('鹿島・松本', 1),
 ('FC東京・富山', 1),
 ('CRO', 1),
 ('V大分', 1),
 ('徳島・神戸', 1),
 ('相模原', 1),
 ('横浜FM・甲府', 1),
 ('清水・大分', 1),
 ('甲府・岐阜', 1),
 ('Honda FC', 1),
 ('BEL・BRA', 1),
 ('CYP', 1),
 ('G大阪・SPA', 1),
 ('C大阪・SUI', 1),
 ('BUL・UKR', 1),
 ('FC東京・熊本', 1),
 ('SC相模原・札幌', 1),
 ('清水・鳥栖', 1),
 ('松本・名古屋', 1),
 ('横浜FM・千葉', 1),
 ('ISL', 1),
 ('湘南・愛媛', 1),
 ('ARG・ARG', 1),
 ('POR・RUS', 1),
 ('浦和・水戸', 1),
 ('山口・G大阪', 1),
 ('新潟・岡山',

In [76]:
label = 0
for k,v in tmp_list:
    if v == 1:
        team_dic[k] = label
    else:
        team_dic[k] = label
        label += 1
        
team_dic

{'BRA・CYP・BRA': 0,
 'JAPANサッカーカレッジ': 0,
 '広島・FC東京': 0,
 'SUI・新潟': 0,
 '長崎・清水': 0,
 'FC東京・山口': 0,
 '仙台・大宮': 0,
 '神戸・SPA': 0,
 'FC東京・松本': 0,
 'BRA・甲府': 0,
 'G大阪・QAT': 0,
 'AUT・UAE': 0,
 '広島・岡山': 0,
 '浦和・長崎': 0,
 'C大阪・鳥栖': 0,
 '大分・BEL': 0,
 'MDA': 0,
 '川崎F・磐田': 0,
 'BRA・THA': 0,
 '千葉・大分': 0,
 '名古屋・神戸': 0,
 '松本・鳥栖': 0,
 '鹿島・POR': 0,
 'ITA・ENG・COL': 0,
 '柏・仙台': 0,
 '新潟・山口': 0,
 '山口・長崎': 0,
 'THA・THA': 0,
 '清水・今治': 0,
 'FC東京・鳥栖': 0,
 '愛媛・C大阪': 0,
 'ISR': 0,
 '横浜FC・鳥栖': 0,
 'UAE・TUR': 0,
 'THI': 0,
 '柏・名古屋': 0,
 '鹿島・松本': 0,
 'FC東京・富山': 0,
 'CRO': 0,
 'V大分': 0,
 '徳島・神戸': 0,
 '相模原': 0,
 '横浜FM・甲府': 0,
 '清水・大分': 0,
 '甲府・岐阜': 0,
 'Honda FC': 0,
 'BEL・BRA': 0,
 'CYP': 0,
 'G大阪・SPA': 0,
 'C大阪・SUI': 0,
 'BUL・UKR': 0,
 'FC東京・熊本': 0,
 'SC相模原・札幌': 0,
 '清水・鳥栖': 0,
 '松本・名古屋': 0,
 '横浜FM・千葉': 0,
 'ISL': 0,
 '湘南・愛媛': 0,
 'ARG・ARG': 0,
 'POR・RUS': 0,
 '浦和・水戸': 0,
 '山口・G大阪': 0,
 '新潟・岡山': 0,
 'FC東京・名古屋': 0,
 'LUX': 0,
 '柏・神戸': 0,
 'FC東京・AUT': 0,
 'FC東京・湘南': 0,
 'BRA・BRA・BRA・BRA': 0,
 '柏・富山': 0,
 'UAE': 0,
 '山形・

In [77]:
colnames = ['team', 'prev1_team', 'prev2_team', 'prev3_team']
for colname in colnames:
    all_x[colname] = all_x[colname].map(team_dic)

pd.set_option("display.max_columns",61)
all_x.head()

Unnamed: 0,id,team,No,name,position,height,weight,salary,nth_year,is_youth,nationality,j1_total_num_played,j1_total_scores,j2_total_num_played,j2_total_scores,j3_total_num_played,j3_total_scores,na_total_num_played,na_total_scores,prev3_team,prev2_team,prev1_team,prev3_div,prev2_div,prev1_div,prev3_num_played,prev2_num_played,prev1_num_played,prev3_scores,prev2_scores,...,rat_out_play_second,rat_in_play_second,rat_inout_play_second,rat_bench_play_second,rat_susp_play_second,rat_full_play_third,rat_out_play_third,rat_in_play_third,rat_inout_play_third,rat_bench_play_third,rat_susp_play_third,rat_full_play_forth,rat_out_play_forth,rat_in_play_forth,rat_inout_play_forth,rat_bench_play_forth,rat_susp_play_forth,age,team_id,salary_std,is_j1_play,is_j2_play,is_j3_play,is_na_play,is_prev3_same_team,is_prev2_same_team,is_prev1_same_team,time_std_1,time_std_2,time_std_3
0,2,130,2.0,松田　陸,0,171.0,69.0,2000,3.0,0.0,14,47.0,3.0,42.0,2.0,0.0,0.0,0.0,0.0,141,130,130,1,2,1,9,42,31,0,2,...,0.0,0.117647,0.0,0.029412,0.0,0.147059,0.0,0.029412,0.0,0.058824,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,26,20,2.354326,1,1,0,0,0,1,1,0.024729,0.039568,0.00583
1,5,130,5.0,田中　裕介,0,181.0,77.0,2700,4.0,0.0,14,223.0,10.0,42.0,1.0,0.0,0.0,0.0,0.0,130,130,130,2,2,1,9,33,18,1,0,...,0.117647,0.0,0.0,0.088235,0.0,0.058824,0.058824,0.0,0.0,0.117647,0.0,0.029412,0.147059,0.0,0.0,0.117647,0.0,31,20,3.17834,1,1,0,0,1,1,1,0.00457,0.028883,0.008185
2,6,130,6.0,山口　蛍,3,173.0,72.0,6700,10.0,1.0,14,134.0,12.0,58.0,2.0,0.0,0.0,38.0,2.0,3,130,130,2,2,1,35,19,32,1,1,...,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,27,20,7.886992,1,1,0,1,1,1,1,0.026754,0.019079,0.038828
3,7,130,7.0,水沼　宏太,3,176.0,72.0,3000,2.0,0.0,14,194.0,24.0,50.0,7.0,9.0,3.0,0.0,0.0,136,141,130,1,1,1,32,17,24,7,1,...,0.088235,0.029412,0.0,0.0,0.0,0.088235,0.0,0.147059,0.0,0.0,0.0,0.147059,0.0,0.147059,0.0,0.0,0.0,28,20,3.531489,1,1,1,0,0,0,1,0.017659,0.006042,0.029004
4,8,130,8.0,柿谷　曜一朗,1,177.0,68.0,7200,10.0,1.0,14,113.0,39.0,168.0,23.0,0.0,0.0,18.0,5.0,15,130,130,9,2,1,4,20,34,1,5,...,0.0,0.058824,0.0,0.0,0.0,0.029412,0.0,0.205882,0.0,0.0,0.0,0.176471,0.0,0.117647,0.0,0.0,0.0,28,20,8.475574,1,1,0,1,0,1,1,0.027107,0.01733,0.0


In [78]:
train_x = all_x.iloc[:train_x.shape[0], :].reset_index(drop=True)
test_x = all_x.iloc[train_x.shape[0]:, :].reset_index(drop=True)

In [79]:
# 学習
remove_cols = [ID, TARGET, 'name', 'year']
feature_cols = [col for col in list(train_x) if col not in remove_cols]
cat_cols = ['team', 'prev1_team', 'prev2_team', 'prev3_team', 'position', 'nationality']
SEED = 0

# train, validationのsplitはyearで層化抽出する
tr_x, va_x, tr_y, va_y = train_test_split(train_x, train_y, test_size=0.2, random_state=SEED, stratify=train_x['year'])

tr_x = tr_x[feature_cols]
va_x = va_x[feature_cols]
tt_x = test_x[feature_cols]

tr_x.reset_index(drop=True, inplace=True)
va_x.reset_index(drop=True, inplace=True)
tr_y.reset_index(drop=True, inplace=True)
va_y.reset_index(drop=True, inplace=True)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 14,
    'max_depth': 6,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.7,
    'min_data_in_leaf': 10,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.4,
    "lambda_l2": 0.4,
    "verbosity": -1,
    "random_state": 42,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100
}

tr_data = lgb.Dataset(tr_x, label=tr_y)
va_data = lgb.Dataset(va_x, label=va_y)

model = lgb.train(
    params, 
    tr_data, 
    categorical_feature = cat_cols,
    valid_names = ['train', 'valid'],
    valid_sets =[tr_data, va_data], 
    verbose_eval = 100,
)

va_pred = model.predict(va_x, num_iteration=model.best_iteration)
score = np.sqrt(mean_squared_error(va_y, va_pred))

pred_df = pd.DataFrame(sorted(zip(va_x.index, va_pred, va_y)), columns=['index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(), tr_x.columns)), columns=['importance', 'feature'])

print(f'rmse: {score:.4f}')

New categorical_feature is ['nationality', 'position', 'prev1_team', 'prev2_team', 'prev3_team', 'team']


Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 371.485	valid's rmse: 813.899
Early stopping, best iteration is:
[42]	train's rmse: 540.582	valid's rmse: 799.842
rmse: 799.8422


In [88]:
tt_pred = model.predict(tt_x, num_iteration=model.best_iteration)
test_df[TARGET] = tt_pred

sub_df = pd.read_csv('/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/input/sample_submission.csv')

sub_df = pd.merge(sub_df[[ID]], test_df[[ID, TARGET]], on=ID)
sub_df.to_csv('/Users/onehe/Desktop/コンペティション/Jリーグ選手出場時間予測_nishika/data/output/4div改年俸・出場時間_submission.csv', index=False)