# Homework for JLeague competition

## Setting up

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

import seaborn as sns
import matplotlib.pyplot as plt

# Directory that holds the competition input files
input_path = "../input_data/"

# Notebook display settings: default figure size plus pandas column/row limits
plt.rcParams["figure.figsize"] = (20.0, 10.0)
for _option, _limit in (("display.max_columns", 50), ("display.max_rows", 200)):
    pd.set_option(_option, _limit)


def _read_jleague(file_name):
    """Read one comma-separated, UTF-8 encoded file from the ``jleague`` folder under ``input_path``."""
    return pd.read_csv(
        input_path + "jleague/" + file_name,
        sep=",",
        header=0,
        quotechar="\"",
        encoding='utf-8',
    )


# Load datasets
train = _read_jleague("train.csv")
test = _read_jleague("test.csv")
condition = _read_jleague("condition.csv")
stadium = _read_jleague("stadium.csv")

## Merging datasets for feature engineering

In [2]:
# Stack the train and test rows into a single frame
dat = pd.concat([train, test], axis=0)
# Label each row by origin: rows whose y is NaN are tagged "Test", all others "Train"
dat["data"] = np.where(dat["y"].isna(), "Test", "Train")

# Attach the match conditions (joined on id) and the stadium attributes
# (match column "stadium" joined to stadium column "name")
dat = (
    dat
    .merge(condition, how="inner", on="id")
    .merge(stadium, how="inner", left_on="stadium", right_on="name")
)
dat.shape

(2034, 45)

## Feature engineering

In [3]:
# Original feature engineering (instructor's baseline)
# J1 flag: 1 when the stage is "Ｊ１", else 0
dat["J1_flg"] = (dat["stage"] == "Ｊ１").astype("int64")

# December flag: the month is the part of gameday before the first "/"
dat["month"] = dat["gameday"].str.split("/").str[0].astype("int64")
dat["December_flg"] = (dat["month"] == 12).astype("int64")

# Mean attendance per home team (training rows only, matches with y == 0 excluded)
team_attract = (
    dat[(dat["data"] == "Train") & (dat["y"] > 0)]
    .groupby("home", as_index=False)["y"]
    .mean()
    .rename(columns={"y": "attract_mean"})
)
# NOTE(review): how="inner" drops every row (test rows included) whose home team has
# no training match with y > 0 -- confirm that is intended.
dat = pd.merge(dat, team_attract, how="inner", on="home")

# Number of TV broadcasters: entries of the "／"-separated tv string
dat["tv_N"] = dat["tv"].str.split("／").str.len()

# NHK flag: 1 when the tv string contains "ＮＨＫ"
dat["tv_NHK_flg"] = dat["tv"].str.contains("ＮＨＫ", regex=False).astype("int64")

# Stadium capacity bucketed into steps of 10,000
dat["capa_cate"] = dat["capa"] // 10000

In [4]:
# Added for the V2 submission
# Fill rate (attendance / capacity) of every training match with positive attendance
is_train_played = (dat["data"] == "Train") & (dat["y"] > 0)
capa_pct = dat.loc[is_train_played].copy()
capa_pct["capa_pct"] = capa_pct["y"] / capa_pct["capa"]

# Mean fill rate per home team and per stadium
team_capa_pct = (
    capa_pct.groupby("home")["capa_pct"].mean().rename("team_capa_pct").reset_index()
)
stadium_capa_pct = (
    capa_pct.groupby("stadium")["capa_pct"].mean().rename("stadium_capa_pct").reset_index()
)
dat = (
    dat
    .merge(team_capa_pct, how="left", on="home")
    .merge(stadium_capa_pct, how="left", on='stadium')
)

# Stadiums without a training fill rate receive the average of the per-stadium means
stadium_fallback = stadium_capa_pct['stadium_capa_pct'].mean()
dat['stadium_capa_pct'] = dat['stadium_capa_pct'].fillna(stadium_fallback)

# Turn both rates back into an attendance figure for the capacity of each match
dat["team_capa_pct_y"] = dat['team_capa_pct'] * dat['capa']
dat['stadium_capa_pct_y'] = dat['stadium_capa_pct'] * dat['capa']

In [5]:
# For the V3 submission: per-player popularity statistics
# Line-up column names: home_01 .. home_11 and away_01 .. away_11
home_col = [f"home_{n:02d}" for n in range(1, 12)]
away_col = [f"away_{n:02d}" for n in range(1, 12)]
player_col = home_col + away_col

# Unique player names over train + test, in row-major order of first appearance.
# Missing entries are dropped explicitly instead of relying on stack() to do it.
player = pd.Series(dat[player_col].to_numpy().ravel()).dropna().unique().tolist()

# Store each line-up as one tuple per row (home, away, and both combined)
dat['home_players'] = tuple(dat[home_col].itertuples(index=False))
dat['away_players'] = tuple(dat[away_col].itertuples(index=False))
dat['players'] = tuple(dat[player_col].itertuples(index=False))
# Kept on capa_pct as well so the frame still carries the same columns as before
capa_pct['home_players'] = tuple(capa_pct[home_col].itertuples(index=False))
capa_pct['away_players'] = tuple(capa_pct[away_col].itertuples(index=False))
capa_pct['players'] = tuple(capa_pct[player_col].itertuples(index=False))

# Compute player popularity from the training matches in capa_pct.
# Long format: one row per (match, player). Duplicates are dropped so that a name
# listed twice in the same match still counts that match only once.
train_matches = capa_pct.reset_index(drop=True)
appearances = (
    train_matches[player_col]
    .rename_axis("row")
    .reset_index()
    .melt(id_vars="row", value_name="name")
    .dropna(subset=["name"])
    .drop_duplicates(subset=["row", "name"])
    .join(train_matches[["capa_pct", "y"]], on="row")
)
player_stats = appearances.groupby("name").agg(
    capa_pct=("capa_pct", "mean"),    # mean fill rate over the player's matches
    match_count=("row", "size"),      # number of training matches played
    total_y=("y", "sum"),             # summed attendance over those matches
)

# One row per name in `player`, in the same order as that list
player_df = player_stats.reindex(player).rename_axis("name").reset_index()
# Players absent from the training matches: zero matches, zero attendance ...
player_df['match_count'] = player_df['match_count'].fillna(0).astype("int64")
player_df['total_y'] = player_df['total_y'].fillna(0.0)
# ... and the overall mean fill rate in place of the missing per-player mean
player_df['capa_pct'] = player_df['capa_pct'].fillna(capa_pct['capa_pct'].mean())

player_df.head(3)

Unnamed: 0,name,capa_pct,match_count,total_y
0,林　卓人,0.640082,75,1243211.0
1,菅井　直樹,0.658546,58,924031.0
2,鎌田　次郎,0.654062,65,1045027.0


In [6]:
# Which players draw the fullest stadiums? Rank by mean fill rate, then keep only
# players with at least 5 training matches and show the top three.
ranked_players = player_df.sort_values(by='capa_pct', ascending=False)
ranked_players[ranked_players['match_count'] >= 5].head(3)

Unnamed: 0,name,capa_pct,match_count,total_y
1146,ゴイコ　カチャル,0.769855,10,272282.0
982,ニール,0.764991,5,116824.0
1139,フォルラン,0.762775,13,368923.0


In [7]:
# Mean player statistics per line-up: both teams combined, home only, away only


def _mean_player_stats(lineup, players):
    """Average ``capa_pct`` and ``match_count`` over the rows of ``players`` named in ``lineup``.

    Parameters
    ----------
    lineup : iterable of player names (one line-up tuple)
    players : DataFrame with columns ``name``, ``capa_pct`` and ``match_count``

    Returns
    -------
    tuple
        ``(mean capa_pct, mean match_count)``; both are NaN when no name matches.
    """
    members = players[players["name"].isin(list(lineup))]
    return members["capa_pct"].mean(), members["match_count"].mean()


def _lineup_features(matches, players, lineup_col, y_col, label):
    """Build one row per distinct line-up found in ``matches[lineup_col]``.

    The result has the line-up column itself, ``y_col`` (sum of ``y`` over the matches
    sharing that line-up), ``mean_<label>_capa_pct`` and ``mean_<label>_match_count``.
    """
    features = matches.groupby(lineup_col).agg(**{y_col: ("y", "sum")}).reset_index()
    stats = [_mean_player_stats(lineup, players) for lineup in features[lineup_col]]
    features[f"mean_{label}_capa_pct"] = [capa for capa, _ in stats]
    features[f"mean_{label}_match_count"] = [count for _, count in stats]
    return features


# NOTE(review): the *_y columns are sums of the target y itself; they look unsuitable
# as model inputs -- confirm they are only kept for inspection.
match_players = _lineup_features(dat, player_df, "players", "allplayer_y", "allplayer")
home_players = _lineup_features(dat, player_df, "home_players", "homeplayer_y", "home")
away_players = _lineup_features(dat, player_df, "away_players", "awayplayer_y", "away")

match_players.shape, home_players.shape, away_players.shape

((2034, 4), (1858, 4), (1893, 4))

In [8]:
# Attach the line-up level features to every match, one table at a time
lineup_tables = (
    (match_players, "players"),
    (home_players, "home_players"),
    (away_players, "away_players"),
)
for lineup_table, lineup_key in lineup_tables:
    dat = dat.merge(lineup_table, how="inner", on=lineup_key)

# Multiply each mean fill rate by the stadium capacity to obtain an attendance figure
for side in ("home", "away", "allplayer"):
    dat[f"mean_{side}_capa_y"] = dat[f"mean_{side}_capa_pct"] * dat["capa"]
dat.shape

(2034, 71)

## Check Multicollinearity
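Multicollinearity is checked with the Spearman rank correlation coefficients between the candidate features (多重共線性の確認は, 相関係数にて行う).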

In [9]:
# Spearman rank correlation between all candidate model features
corr_features = [
    "J1_flg", "December_flg", "attract_mean", "tv_N", "tv_NHK_flg", "capa_cate",
    'team_capa_pct_y', 'stadium_capa_pct_y',
    "mean_home_capa_y", "mean_away_capa_y", "mean_allplayer_capa_y",
]
dat[corr_features].corr(method="spearman")

Unnamed: 0,J1_flg,December_flg,attract_mean,tv_N,tv_NHK_flg,capa_cate,team_capa_pct_y,stadium_capa_pct_y,mean_home_capa_y,mean_away_capa_y,mean_allplayer_capa_y
J1_flg,1.0,0.133775,0.790255,0.102055,0.209452,0.303099,0.763627,0.733872,0.729628,0.707027,0.736597
December_flg,0.133775,1.0,0.108735,0.075221,0.100478,0.070637,0.1055,0.101055,0.104457,0.108737,0.109194
attract_mean,0.790255,0.108735,1.0,0.077389,0.181134,0.515022,0.962358,0.93837,0.908519,0.743515,0.848482
tv_N,0.102055,0.075221,0.077389,1.0,0.486005,0.055149,0.082448,0.065706,0.048444,0.057744,0.046573
tv_NHK_flg,0.209452,0.100478,0.181134,0.486005,1.0,0.148563,0.19546,0.187519,0.192627,0.196323,0.199185
capa_cate,0.303099,0.070637,0.515022,0.055149,0.148563,1.0,0.594104,0.626437,0.724786,0.758197,0.759616
team_capa_pct_y,0.763627,0.1055,0.962358,0.082448,0.19546,0.594104,1.0,0.973476,0.945636,0.78204,0.885571
stadium_capa_pct_y,0.733872,0.101055,0.93837,0.065706,0.187519,0.626437,0.973476,1.0,0.958043,0.805315,0.902909
mean_home_capa_y,0.729628,0.104457,0.908519,0.048444,0.192627,0.724786,0.945636,0.958043,1.0,0.87851,0.964018
mean_away_capa_y,0.707027,0.108737,0.743515,0.057744,0.196323,0.758197,0.78204,0.805315,0.87851,1.0,0.96927


## RMSE validation

In [12]:
from sklearn.model_selection import train_test_split
def rmse(y_actual, y_predicted):
    """Return the root-mean-squared error between actual and predicted values.

    Both inputs are converted to flat 1-D float arrays before subtraction. A column
    vector of shape (n, 1) and a flat vector of shape (n,) are therefore compared
    element by element instead of being broadcast into an (n, n) matrix, and the
    result is always a scalar regardless of the container type passed in.

    Parameters
    ----------
    y_actual : array-like
        Observed values (array, Series or single-column DataFrame).
    y_predicted : array-like
        Predicted values holding the same number of elements.

    Returns
    -------
    numpy.float64
        Square root of the mean squared difference.
    """
    actual = np.asarray(y_actual, dtype=float).ravel()
    predicted = np.asarray(y_predicted, dtype=float).ravel()
    return np.sqrt(np.mean((predicted - actual) ** 2))

# Training rows with positive attendance
is_fit_row = (dat["data"] == "Train") & (dat["y"] > 0)
train = dat.loc[is_fit_row].copy()

feature_cols = [
    "December_flg", "attract_mean", "tv_N", "tv_NHK_flg", "capa_cate", "J1_flg",
    'team_capa_pct_y', 'stadium_capa_pct_y',
    "mean_home_capa_y", "mean_away_capa_y", "mean_allplayer_capa_y",
]
train_x = train.loc[:, feature_cols].copy()
train_y = train.loc[:, ["y"]].copy()

# Hold out 20% of the rows for validation, without shuffling
train_x, valid_x, train_y, valid_y = train_test_split(
    train_x, train_y, test_size=0.2, shuffle=False
)
model = linear_model.LinearRegression()
model.fit(train_x, train_y)
train_pred, valid_pred = model.predict(train_x), model.predict(valid_x)
rmse(train_y, train_pred), rmse(valid_y, valid_pred)
# V3 (np.float64(3325.8692966966337), np.float64(3518.6294330183746))

(np.float64(3325.8692966966337), np.float64(3518.6294330183746))

In [28]:
# Refit on every row of `train` and list the matches with the largest absolute error
feature_cols = [
    "December_flg", "attract_mean", "tv_N", "tv_NHK_flg", "capa_cate", "J1_flg",
    'team_capa_pct_y', 'stadium_capa_pct_y',
    "mean_home_capa_y", "mean_away_capa_y", "mean_allplayer_capa_y",
]
train_x = train.loc[:, feature_cols].copy()
train_y = train.loc[:, ["y"]].copy()
model = linear_model.LinearRegression()
model.fit(train_x, train_y)
train_pred = model.predict(train_x)

# Signed and absolute residuals per match
train["pred"] = train_pred
train["pred - y"] = train["pred"] - train_y["y"]
train["abs(pred - y)"] = train["pred - y"].abs()

# Leave out the line-up and descriptive columns so the remaining table stays readable
hidden_cols = player_col + [
    "home_players", "away_players", "players", "tv", "name", "address",
    "data", "referee", "year", "weather", "humidity", "time",
]
(
    train
    .drop(columns=hidden_cols)
    .sort_values(by="abs(pred - y)", ascending=False)
    .head(10)
)
# Ideas to try: popular-team flag, fix player_y, nice-weather x temperature flag, capa > 40,000 flag

Unnamed: 0,id,y,stage,match,gameday,home,away,stadium,home_score,away_score,temperature,home_team,away_team,capa,J1_flg,month,December_flg,attract_mean,tv_N,tv_NHK_flg,capa_cate,team_capa_pct,stadium_capa_pct,team_capa_pct_y,stadium_capa_pct_y,allplayer_y,mean_allplayer_capa_pct,mean_allplayer_match_count,homeplayer_y,mean_home_capa_pct,mean_home_match_count,awayplayer_y,mean_away_capa_pct,mean_away_match_count,mean_home_capa_y,mean_away_capa_y,mean_allplayer_capa_y,pred,pred - y,abs(pred - y)
1443,15765,38966.0,Ｊ１,第１１節第１日,05/03(土・祝),名古屋グランパス,セレッソ大阪,豊田スタジアム,1,2,17.8,名古屋グランパス,セレッソ大阪,40000,1,5,0,16449.914286,4,1,4,0.543944,0.487466,21757.742857,19498.65,38966.0,0.60357,42.0,38966.0,0.578023,35.909091,38966.0,0.629117,48.090909,23120.904342,25164.670477,24142.787409,20925.249301,-18040.750699,18040.750699
693,14852,52293.0,Ｊ１,第２節第１日,03/09(土),浦和レッズ,名古屋グランパス,埼玉スタジアム２００２,1,0,21.6,浦和レッズ,名古屋グランパス,63700,1,3,0,37243.794872,3,0,6,0.584675,0.584675,37243.794872,37243.794872,52293.0,0.579678,52.818182,52293.0,0.599304,61.545455,52293.0,0.560052,44.090909,38175.666612,35675.322499,36925.494556,36136.287059,-16156.712941,16156.712941
899,15089,32305.0,Ｊ１,第２９節第１日,10/19(土),鹿島アントラーズ,浦和レッズ,県立カシマサッカースタジアム,1,2,18.1,鹿島アントラーズ,浦和レッズ,40728,1,10,0,15710.888889,2,0,4,0.385752,0.385752,15710.888889,15710.888889,32305.0,0.556424,54.0,49335.0,0.511211,48.454545,32305.0,0.601638,59.545455,20820.599197,24503.500794,22662.049996,16153.585619,-16151.414381,16151.414381
1409,15725,42723.0,Ｊ１,第７節第２日,04/12(土),セレッソ大阪,ガンバ大阪,ヤンマースタジアム長居,2,2,18.0,セレッソ大阪,ガンバ大阪,47816,1,4,0,19060.358974,3,1,4,0.646918,0.586877,30933.02562,28062.133333,42723.0,0.65759,45.818182,57955.0,0.682423,43.090909,42723.0,0.632756,48.545455,32630.734526,30255.878858,31443.306692,27504.59186,-15218.40814,15218.40814
1384,15698,32099.0,Ｊ１,第４節第１日,03/23(日),鹿島アントラーズ,セレッソ大阪,県立カシマサッカースタジアム,0,2,13.9,鹿島アントラーズ,セレッソ大阪,40728,1,3,0,15710.888889,2,0,4,0.385752,0.385752,15710.888889,15710.888889,32099.0,0.598819,42.409091,47132.0,0.515215,41.727273,72860.0,0.682423,43.090909,20983.670427,27793.720842,24388.695634,17445.897048,-14653.102952,14653.102952
917,15107,40371.0,Ｊ１,第３１節第１日,11/10(日),ＦＣ東京,セレッソ大阪,味の素スタジアム,1,2,21.1,ＦＣ東京,セレッソ大阪,49970,1,11,0,24850.763158,2,0,4,0.491663,0.308366,24568.419359,15409.03125,40371.0,0.597713,54.454545,40371.0,0.547416,55.727273,40371.0,0.648009,53.181818,27354.364456,32381.026074,29867.695265,25832.850123,-14538.149877,14538.149877
715,14880,23295.0,Ｊ１,第５節第１日,04/06(土),浦和レッズ,ジュビロ磐田,埼玉スタジアム２００２,2,1,15.6,浦和レッズ,ジュビロ磐田,63700,1,4,0,37243.794872,4,1,6,0.584675,0.584675,37243.794872,37243.794872,23295.0,0.592204,60.272727,59772.0,0.601854,63.545455,23295.0,0.582553,57.0,38338.107897,37108.639438,37723.373668,37821.108057,14526.108057,14526.108057
267,14289,32724.0,Ｊ１,第３３節第１日,11/24(土),サンフレッチェ広島,セレッソ大阪,エディオンスタジアム広島,4,1,10.3,サンフレッチェ広島,セレッソ大阪,50000,1,11,0,16380.945946,3,1,5,0.327619,0.327619,16380.945946,16380.945946,32724.0,0.571256,55.181818,32724.0,0.515033,59.181818,32724.0,0.627478,51.181818,25751.674466,31373.877243,28562.775855,18498.181468,-14225.818532,14225.818532
748,14916,24184.0,Ｊ１,第９節第２日,05/29(水),浦和レッズ,ベガルタ仙台,埼玉スタジアム２００２,1,1,21.2,浦和レッズ,ベガルタ仙台,63700,1,5,0,37243.794872,2,0,6,0.584675,0.584675,37243.794872,37243.794872,24184.0,0.625598,59.227273,24184.0,0.606536,60.363636,24184.0,0.64466,58.090909,38636.34869,41064.869627,39850.609158,38271.084098,14087.084098,14087.084098
721,14887,13384.0,Ｊ１,第６節第１日,04/13(土),セレッソ大阪,大宮アルディージャ,ヤンマースタジアム長居,1,2,13.3,セレッソ大阪,大宮アルディージャ,47816,1,4,0,19060.358974,3,1,4,0.646918,0.586877,30933.02562,28062.133333,13384.0,0.610403,50.727273,13384.0,0.627023,51.181818,13384.0,0.593784,50.272727,29981.718362,28392.381424,29187.049893,27448.026937,14064.026937,14064.026937


## Make Model

In [None]:
# Columns fed to the linear model (same list for training and test rows)
feature_cols = [
    "December_flg", "attract_mean", "tv_N", "tv_NHK_flg", "capa_cate", "J1_flg",
    'team_capa_pct_y', 'stadium_capa_pct_y',
    "mean_home_capa_y", "mean_away_capa_y", "mean_allplayer_capa_y",
]

# Train: training rows with positive attendance
train = dat.loc[(dat["data"] == "Train") & (dat["y"] > 0)].copy()
train_x = train.loc[:, feature_cols].copy()
train_y = train.loc[:, ["y"]].copy()

# Test: rows labelled "Test"
test = dat.loc[dat["data"] == "Test"].copy()
test_x = test.loc[:, feature_cols].copy()

# Learning
model = linear_model.LinearRegression().fit(train_x, train_y)
model.coef_

## Predict

In [None]:
# Predict attendance for the test feature matrix with the fitted model
pred = model.predict(test_x)

## Predicted Value Correction

In [None]:
# Smallest attendance among the training targets. train_y is a one-column DataFrame,
# so .min() yields a Series (indexed by column name) rather than a scalar.
y_min = train_y.min()
y_min

In [None]:
def correction_fun_1(row, floor=1104):
    """Raise a prediction to at least ``floor``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a ``"pred"`` entry holding the predicted attendance.
    floor : number, optional
        Lower bound for the prediction. Defaults to 1104, the value that was
        previously hard-coded here (presumably the smallest attendance seen in
        training -- TODO confirm against ``y_min``).

    Returns
    -------
    number
        ``row["pred"]`` when it is at least ``floor``, otherwise ``floor``.
    """
    return max(row["pred"], floor)

def correction_fun_2(row):
    """Cap a prediction at the stadium capacity.

    Returns ``row["capa"]`` when it is smaller than ``row["pred"]``, otherwise
    ``row["pred"]``.
    """
    predicted, capacity = row["pred"], row["capa"]
    return capacity if capacity < predicted else predicted

# Pair every raw prediction with the capacity of its stadium
pred_correction = test.loc[:, ["capa"]].copy()
pred_correction["pred"] = pred

# Apply the lower bound first, then the capacity cap, row by row
for correction in (correction_fun_1, correction_fun_2):
    pred_correction["pred"] = pred_correction.apply(correction, axis=1)
pred_correction["pred"]

## Output

In [None]:
# Submission table: match id plus the corrected prediction (aligned on the row index)
submit = test[["id"]].assign(pred=pred_correction["pred"])
submit.describe()

In [None]:
#submit.to_csv("../submit/20251014_submit_linear_v3.csv",  sep=",", index=False, header=False)

## Done!

In [None]:
# Three players with the highest mean fill rate (no minimum match count applied)
player_df.sort_values(by='capa_pct', ascending=False).iloc[:3]