# Homework for JLeague competition

## Setting up

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

import seaborn as sns
import matplotlib.pyplot as plt

# Folder that holds the competition input files
input_path = "../input_data/"

# Notebook display settings: default figure size, then the number of
# columns / rows pandas renders before truncating a frame
plt.rcParams["figure.figsize"] = (20.0, 10.0)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 200)

# Parsing options shared by all four CSV files
csv_options = dict(sep=",", header=0, quotechar="\"", encoding='utf-8')

# Load datasets
train = pd.read_csv(input_path + "jleague/train.csv", **csv_options)
test = pd.read_csv(input_path + "jleague/test.csv", **csv_options)
condition = pd.read_csv(input_path + "jleague/condition.csv", **csv_options)
stadium = pd.read_csv(input_path + "jleague/stadium.csv", **csv_options)

## Merging datasets for feature engineering

In [2]:
# Train and Test: stack both sets so every feature is engineered once for all rows
n_rows_expected = len(train) + len(test)
dat = pd.concat([train, test], axis=0)
# Rows without a target `y` are labelled "Test", all others "Train"
dat["data"] = np.where(dat["y"].isna(), "Test", "Train")

# condition: join the `condition` table on the match `id`
dat = pd.merge(dat, condition, how="inner", on="id")

# stadium: join the `stadium` table, matching dat["stadium"] to stadium["name"]
dat = pd.merge(dat, stadium, how="inner", left_on="stadium", right_on="name")

# An inner join silently drops rows whose key is missing from the lookup table
# and duplicates rows whose key is repeated there. Either case would change the
# number of matches (and therefore of submitted predictions), so fail loudly.
assert len(dat) == n_rows_expected, (
    f"merges changed the row count: expected {n_rows_expected}, got {len(dat)}"
)
dat.shape

(2034, 45)

## Feature engineering

In [3]:
# Original feature engineering provided by the instructor ("sensei")

# J1 flag: 1 when `stage` equals "Ｊ１", otherwise 0
dat["J1_flg"] = (dat["stage"] == "Ｊ１").astype("int64")

# December flag: the month is the part of `gameday` before the first "/"
dat["month"] = dat["gameday"].str.split("/").str[0].astype("int64")
dat["December_flg"] = (dat["month"] == 12).astype("int64")

# Mean attendance per home team (train rows only, matches with y = 0 excluded)
team_attract = (
    dat[(dat["data"] == "Train") & (dat["y"] > 0)]
    .groupby("home", as_index=False)["y"]
    .mean()
    .rename(columns={"y": "attract_mean"})
)
# NOTE: inner join - rows whose home team has no qualifying train match are dropped
dat = dat.merge(team_attract, how="inner", on="home")

# Number of TV broadcasters: entries in `tv` are separated by a full-width slash
dat["tv_N"] = dat["tv"].str.split("／").str.len()

# NHK flag: 1 when `tv` contains "ＮＨＫ", otherwise 0
dat["tv_NHK_flg"] = dat["tv"].str.contains("ＮＨＫ", regex=False).astype("int64")

# capa bucket: `capa` in whole units of 10,000
dat["capa_cate"] = dat["capa"] // 10000

In [4]:
# Added for V2 submission
# capa % = y / capa, computed on train rows with y > 0 only
capa_pct = dat[(dat["data"] == "Train") & (dat["y"] > 0)].copy()
capa_pct["capa_pct"] = capa_pct["y"] / capa_pct["capa"]

# Mean capa % per home team and per stadium
team_capa_pct = capa_pct.groupby("home", as_index=False).agg(team_capa_pct=("capa_pct", "mean"))
stadium_capa_pct = capa_pct.groupby("stadium", as_index=False).agg(stadium_capa_pct=("capa_pct", "mean"))

# Left joins keep every row of `dat`, even when no per-team / per-stadium mean exists
dat = (
    dat
    .merge(team_capa_pct, how="left", on="home")
    .merge(stadium_capa_pct, how="left", on='stadium')
)

# Stadiums without a mean of their own receive the average of the per-stadium means
stadium_fallback = stadium_capa_pct['stadium_capa_pct'].mean()
dat['stadium_capa_pct'] = dat['stadium_capa_pct'].fillna(stadium_fallback)

# Scale the rates by each row's `capa` to get values on the same scale as y
dat["team_capa_pct_y"] = dat['team_capa_pct'].mul(dat['capa'])
dat['stadium_capa_pct_y'] = dat['stadium_capa_pct'].mul(dat['capa'])

In [43]:
# For V3 submission
# Line-up columns: home_01 .. home_11 and away_01 .. away_11
home_col = [f"home_{n:02d}" for n in range(1, 12)]
away_col = [f"away_{n:02d}" for n in range(1, 12)]
player_col = home_col + away_col

# Every distinct player name found in any line-up column (train + test rows)
player = dat[player_col].stack().unique().tolist()

# Store each row's line-up as a single tuple: home side, away side, and both
# sides together. The same three columns are added to `dat` and to `capa_pct`.
for lineup_name, lineup_cols in (
    ('home_players', home_col),
    ('away_players', away_col),
    ('players', player_col),
):
    dat[lineup_name] = tuple(dat[lineup_cols].itertuples(index=False))
    capa_pct[lineup_name] = tuple(capa_pct[lineup_cols].itertuples(index=False))


def _player_record(name):
    """Summarise the `capa_pct` rows whose full line-up contains `name`.

    Returns a dict with the player's name, the mean capa % of those rows,
    the number of such rows, and their summed y.
    """
    played = capa_pct[capa_pct['players'].apply(lambda lineup: name in lineup)]
    return {
        'name': name,
        'capa_pct': played['capa_pct'].mean(),
        'match_count': len(played['capa_pct']),
        'total_y': played['y'].sum(),
    }


# compute player popularity: one row per player, in the order of `player`
player_df = pd.DataFrame(
    [_player_record(name) for name in player],
    columns=['name', 'capa_pct', 'match_count', 'total_y'],
)
# Players that never appear in `capa_pct` have no mean of their own;
# give them the overall mean capa %
player_df['capa_pct'] = player_df['capa_pct'].fillna(capa_pct['capa_pct'].mean())

player_df.head(3)

Unnamed: 0,name,capa_pct,match_count,total_y
0,林　卓人,0.640082,75,1243211.0
1,菅井　直樹,0.658546,58,924031.0
2,鎌田　次郎,0.654062,65,1045027.0


In [46]:
# Look for "hot" players: rank everyone by mean capa %, then keep only players
# with at least 5 counted matches so that one-off appearances do not lead the list
(
    player_df
    .sort_values(by='capa_pct', ascending=False)
    .query('match_count >= 5')
    .head(3)
)

Unnamed: 0,name,capa_pct,match_count,total_y
1146,ゴイコ　カチャル,0.769855,10,272282.0
982,ニール,0.764991,5,116824.0
1139,フォルラン,0.762775,13,368923.0


In [107]:
def _lineup_player_means(matches, lineup_col, y_name, capa_name, count_name, player_stats):
    """Average the per-player statistics over every distinct line-up.

    Parameters
    ----------
    matches : pandas.DataFrame
        Frame with a `y` column and the tuple-valued column `lineup_col`.
    lineup_col : str
        Column holding one tuple of player names per row.
    y_name : str
        Name of the output column holding the summed `y` per line-up.
    capa_name, count_name : str
        Names of the output columns holding the mean `capa_pct` and the mean
        `match_count` of the line-up's players.
    player_stats : pandas.DataFrame
        Frame with `name`, `capa_pct` and `match_count` columns.

    Returns
    -------
    pandas.DataFrame
        One row per distinct line-up with the columns
        `lineup_col`, `y_name`, `capa_name`, `count_name`.
    """
    lineups = matches.groupby(lineup_col).agg(**{y_name: ("y", "sum")}).reset_index()
    capa_means = []
    count_means = []
    for lineup in lineups[lineup_col]:
        # Rows of player_stats whose name occurs in this line-up
        members = player_stats[player_stats["name"].isin(list(lineup))]
        capa_means.append(members["capa_pct"].mean())
        count_means.append(members["match_count"].mean())
    lineups[capa_name] = capa_means
    lineups[count_name] = count_means
    return lineups


# compute mean capa_pct for players of both teams, home, and away
match_players = _lineup_player_means(
    dat, "players", "allplayer_y",
    "mean_allplayer_capa_pct", "mean_allplayer_match_count", player_df,
)

# home players
home_players = _lineup_player_means(
    dat, "home_players", "homeplayer_y",
    "mean_home_capa_pct", "mean_home_match_count", player_df,
)

# away players
away_players = _lineup_player_means(
    dat, "away_players", "awayplayer_y",
    "mean_away_capa_pct", "mean_away_match_count", player_df,
)

match_players.shape, home_players.shape, away_players.shape

((2034, 4), (1858, 4), (1893, 4))

In [108]:
# Attach the three line-up tables to `dat`, each joined on its tuple-valued key.
# NOTE: `dat` is overwritten, so running this cell a second time merges the same
# columns into `dat` again.
dat = (
    dat
    .merge(match_players, how="inner", on="players")
    .merge(home_players, how="inner", on="home_players")
    .merge(away_players, how="inner", on="away_players")
)

# Scale each mean capa % by the row's `capa`
capa = dat["capa"]
dat["mean_home_capa_y"] = dat["mean_home_capa_pct"].mul(capa)
dat["mean_away_capa_y"] = dat["mean_away_capa_pct"].mul(capa)
dat["mean_allplayer_capa_y"] = dat["mean_allplayer_capa_pct"].mul(capa)
dat.shape

(2034, 89)

## Check Multicollinearity
　多重共線性の確認は, 相関係数にて行う.

In [109]:
# Spearman rank correlation between the candidate model features
corr_features = [
    "J1_flg", "December_flg", "attract_mean", "tv_N", "tv_NHK_flg", "capa_cate",
    "team_capa_pct_y", "stadium_capa_pct_y",
    "mean_home_capa_y", "mean_away_capa_y", "mean_allplayer_capa_y",
]
dat[corr_features].corr(method="spearman")

Unnamed: 0,J1_flg,December_flg,attract_mean,tv_N,tv_NHK_flg,capa_cate,team_capa_pct_y,stadium_capa_pct_y,mean_home_capa_y,mean_away_capa_y,mean_allplayer_capa_y
J1_flg,1.0,0.133775,0.790255,0.102055,0.209452,0.303099,0.763627,0.733872,0.729628,0.707027,0.736597
December_flg,0.133775,1.0,0.108735,0.075221,0.100478,0.070637,0.1055,0.101055,0.104457,0.108737,0.109194
attract_mean,0.790255,0.108735,1.0,0.077389,0.181134,0.515022,0.962358,0.93837,0.908519,0.743515,0.848482
tv_N,0.102055,0.075221,0.077389,1.0,0.486005,0.055149,0.082448,0.065706,0.048444,0.057744,0.046573
tv_NHK_flg,0.209452,0.100478,0.181134,0.486005,1.0,0.148563,0.19546,0.187519,0.192627,0.196323,0.199185
capa_cate,0.303099,0.070637,0.515022,0.055149,0.148563,1.0,0.594104,0.626437,0.724786,0.758197,0.759616
team_capa_pct_y,0.763627,0.1055,0.962358,0.082448,0.19546,0.594104,1.0,0.973476,0.945636,0.78204,0.885571
stadium_capa_pct_y,0.733872,0.101055,0.93837,0.065706,0.187519,0.626437,0.973476,1.0,0.958043,0.805315,0.902909
mean_home_capa_y,0.729628,0.104457,0.908519,0.048444,0.192627,0.724786,0.945636,0.958043,1.0,0.87851,0.964018
mean_away_capa_y,0.707027,0.108737,0.743515,0.057744,0.196323,0.758197,0.78204,0.805315,0.87851,1.0,0.96927


## Make Model

In [110]:
# Feature columns, listed once and used for both the train and the test matrix
feature_cols = [
    "December_flg", "attract_mean", "tv_N", "tv_NHK_flg", "capa_cate", "J1_flg",
    "team_capa_pct_y", "stadium_capa_pct_y",
    "mean_home_capa_y", "mean_away_capa_y", "mean_allplayer_capa_y",
]

# Train: rows labelled "Train" with a positive y
is_train_row = (dat["data"] == "Train") & (dat["y"] > 0)
train = dat[is_train_row].copy()
train_x = train[feature_cols].copy()
# Kept as a one-column DataFrame (not a Series)
train_y = train[["y"]].copy()

# Test: rows labelled "Test"
is_test_row = dat["data"] == "Test"
test = dat[is_test_row].copy()
test_x = test[feature_cols].copy()

# Learning: ordinary least-squares fit; display the fitted coefficients
model = linear_model.LinearRegression()
model.fit(train_x, train_y)
model.coef_

array([[ 4.79062241e+03,  3.61722539e-01, -6.90571738e+01,
         1.21505414e+03, -9.73367262e+02, -6.44156735e+02,
         5.57003265e-01,  1.05230421e-01,  4.25573799e+01,
         4.32274063e+01, -8.56430806e+01]])

## Predict

In [111]:
# Predict y for the test rows with the fitted linear model
pred = model.predict(test_x)

## Predicted Value Correction

In [112]:
# Smallest y among the training rows. `train_y` is a one-column DataFrame,
# so `.min()` returns a one-element Series rather than a plain scalar.
y_min = train_y.min()
y_min

y    1104.0
dtype: float64

In [113]:
def correction_fun_1(row, floor=1104):
    """Raise a prediction that falls below the lower bound up to that bound.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a "pred" entry holding the predicted value.
    floor : number, optional
        Lower bound for the prediction. Defaults to 1104, the value that was
        previously hard-coded here; pass another bound to avoid editing the
        function when the training data changes.

    Returns
    -------
    number
        The larger of ``row["pred"]`` and ``floor``.
    """
    return max(row["pred"], floor)

def correction_fun_2(row):
    """Cap a prediction at the row's `capa` value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the entries "pred" and "capa".

    Returns
    -------
    number
        ``row["capa"]`` when it is smaller than ``row["pred"]``,
        otherwise ``row["pred"]``.
    """
    prediction = row["pred"]
    ceiling = row["capa"]
    return ceiling if ceiling < prediction else prediction

# Start from the `capa` of every test row and attach the raw predictions
pred_correction = test[["capa"]].copy()
pred_correction["pred"] = pred

# Apply the two row-wise corrections in order: lower bound first, then the capa cap
for correct in (correction_fun_1, correction_fun_2):
    pred_correction["pred"] = pred_correction.apply(correct, axis=1)
pred_correction["pred"]

1721    14849.067286
1722    15152.572700
1723    35007.179233
1724    11857.432664
1725    26461.851071
            ...     
2029     4340.006259
2030     7844.488438
2031     3820.891101
2032     4861.081768
2033    11343.567445
Name: pred, Length: 313, dtype: float64

## Output

In [114]:
# Submission frame: the test ids plus the corrected predictions
# (the new column is aligned to the ids by row index)
submit = test[["id"]].assign(pred=pred_correction["pred"])
submit.describe()

Unnamed: 0,id,pred
count,313.0,313.0
mean,16142.252396,11249.683013
std,224.441223,7556.964775
min,15822.0,2746.494004
25%,15907.0,5318.26524
50%,16261.0,9353.84978
75%,16346.0,13786.686328
max,16436.0,41644.893429


In [115]:
# Write the submission as comma-separated `id,pred` rows, without header or index column
submit.to_csv("../submit/20251014_submit_linear_v3.csv",  sep=",", index=False, header=False)

## Done!

In [116]:
# Post-submission exploration: top players by mean capa % with no match-count
# filter; the rows shown here have a match_count of only 1-3
player_df.sort_values(by='capa_pct',ascending=False).head(3)

Unnamed: 0,name,capa_pct,match_count,total_y
357,内山　俊彦,0.910683,1,17935.0
959,増田　卓也,0.856731,1,13365.0
366,ネット　バイアーノ,0.813161,3,41352.0
