# ベースラインのコード
- （2025年10月12日時点）猪メモ
- 特徴量エンジニアリングはmake_features関数を参照
- 現時点での最高スコアは、勾配ブースティング回帰（GradientBoosting）を用いた「3633.772917813335」
- さまざまな仮説に基づく特徴量はまだ入れ込めていないが、一旦これをベースラインとする

## データ読み込み

In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib import font_manager as fm
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, ElasticNet,HuberRegressor, TheilSenRegressor, QuantileRegressor)
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

# Reload matplotlib's font manager without reading its on-disk cache
# (presumably so that a freshly installed IPAexGothic font is picked up).
fm._load_fontmanager(try_read_cache=False)

# Table / array display: show every column, up to 100 rows, fixed 2-decimal floats.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
np.set_printoptions(
    precision=2,
    suppress=True,
    linewidth=600,
    threshold=np.inf,
    floatmode="fixed",
)

# Plot styling: start from matplotlib defaults, apply the seaborn grid theme,
# then override fonts, sizes and grid appearance in one go.
plt.rcdefaults()
sns.set_style("whitegrid")
plt.rcParams.update({
    "font.family": "IPAexGothic",
    "font.size": 8,
    "axes.labelsize": 8,
    "axes.titlesize": 10,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "figure.figsize": (8, 3),
    "figure.dpi": 150,
    "grid.alpha": 0.3,
    "grid.linestyle": "--",
    "axes.linewidth": 1.2,
})

# The 47 Japanese prefectures, listed north to south and grouped by region.
# Used to find the prefecture name inside a stadium address string.
pref_list = [
    # Hokkaido / Tohoku
    "北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県",
    # Kanto
    "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県",
    # Chubu
    "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県", "愛知県",
    # Kinki
    "三重県", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県",
    # Chugoku
    "鳥取県", "島根県", "岡山県", "広島県", "山口県",
    # Shikoku
    "徳島県", "香川県", "愛媛県", "高知県",
    # Kyushu / Okinawa
    "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県",
]

# Every CSV is read with the same options: comma-separated, header on the
# first row, double-quote as the quoting character.
_CSV_OPTS = dict(sep=",", header=0, quotechar='"')

train = pd.read_csv("../input_data/jleague/train.csv", **_CSV_OPTS)  # training match data
train_add = pd.read_csv("../input_data/jleague/train_add.csv", **_CSV_OPTS)  # additional training match data
condition = pd.read_csv("../input_data/jleague/condition.csv", **_CSV_OPTS)  # match detail data
condition_add = pd.read_csv("../input_data/jleague/condition_add.csv", **_CSV_OPTS)  # additional match detail data
stadium = pd.read_csv("../input_data/jleague/stadium.csv", **_CSV_OPTS)  # stadium data
test = pd.read_csv("../input_data/jleague/test.csv", **_CSV_OPTS)  # evaluation match data

# Author's note: additional data for the second half of the 2014 season
# (38 matches), supplementary only. Because it contains scores, combining it
# with the evaluation data would let us work out each team's league position
# at the time of a match. It is loaded here but its intended use is still an
# open question.
add_2014 = pd.read_csv("../input_data/jleague/2014_add.csv", **_CSV_OPTS)


# Merge the raw tables into one row per match.
# Author's notes on what each table contributes:
#   stadium   : capacity and address
#   condition : score, weather, temperature, humidity, referee,
#               home team name + 11 players, away team name + 11 players

# Match details from the base file and the additional file, stacked.
condition_df = pd.concat([condition, condition_add], axis=0, ignore_index=True)

# Training matches (base + additional) -> stadium info -> match details.
train_df = (
    pd.concat([train, train_add], axis=0, ignore_index=True)
    .merge(stadium, how="left", left_on="stadium", right_on="name")
    .merge(condition_df, how="left", on="id")
)

# Evaluation matches go through the same two left joins.
test_df = (
    test
    .merge(stadium, how="left", left_on="stadium", right_on="name")
    .merge(condition_df, how="left", on="id")
)

## 特徴量エンジニアリング

In [3]:
def make_features(data, prefectures=None):
    """Build the model feature table from a merged match table.

    The input is not modified; a copy is transformed and returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Merged match / stadium / condition table. It must contain the columns
        ``id``, ``year``, ``stage``, ``match``, ``gameday``, ``time``,
        ``home``, ``away``, ``tv``, ``name``, ``address``, ``home_score``,
        ``away_score``, ``weather``, ``temperature``, ``humidity``,
        ``referee``, ``home_team``, ``away_team`` and the player columns
        ``home_01``..``home_11`` / ``away_01``..``away_11``. When the target
        ``y`` is present, ``capa`` is required as well.
    prefectures : sequence of str, optional
        Prefecture names to look for inside ``address``. Defaults to the
        module-level ``pref_list``.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns followed by the engineered ones:
        ``prefecture``, ``discomfort_category``, ``round``, ``J1_flg``,
        ``is_weekend``, ``December_flg``, ``tv_N``, ``tv_NHK_flg``,
        ``tv_paid``, ``season_hour_weather`` and ``score_diff``. When ``y`` is
        present, rows with ``y <= 0`` are removed and ``y_capa_ratio`` /
        ``y_capa_ratio_log`` are added.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    if prefectures is None:
        prefectures = pref_list

    df = data.copy()

    # Training data only: drop matches with no attendance and express the
    # target as a share of stadium capacity (plus its log1p transform).
    if "y" in df.columns:
        df = df[df["y"] > 0].reset_index(drop=True)
        df["y_capa_ratio"] = df["y"] / df["capa"]
        df["y_capa_ratio_log"] = np.log1p(df["y_capa_ratio"])

    # Treat the old club name as the same team as the new one.
    df[["home", "away"]] = df[["home", "away"]].replace("ザスパ草津", "ザスパクサツ群馬")

    def _find_prefecture(address):
        # A stadium that did not match in the left merge leaves a NaN address;
        # return NaN for it instead of failing on the substring test.
        if not isinstance(address, str):
            return np.nan
        return next((pref for pref in prefectures if pref in address), np.nan)

    df["prefecture"] = df["address"].apply(_find_prefecture)

    # Discomfort index from temperature and humidity, kept only as a category:
    # (0, 60] cold, (60, 75] normal, (75, 100] hot; anything else becomes NaN.
    humidity = df["humidity"].str.replace("%", "", regex=False).astype("float")
    temperature = df["temperature"].astype("float")
    discomfort_index = (
        0.81 * temperature
        + 0.01 * humidity * (0.99 * temperature - 14.3)
        + 46.3
    )
    df["discomfort_category"] = pd.cut(
        discomfort_index, bins=[0, 60, 75, 100], labels=["寒い", "普通", "暑い"]
    )

    # Round number taken from the match label, stored as a string so that it
    # is handled as a categorical feature downstream.
    df["round"] = df["match"].str.extract(r"第(\d+)節")[0].astype(int).astype(str)

    # 1 for top-division matches, 0 otherwise.
    df["J1_flg"] = (df["stage"] == "Ｊ１").astype(int)

    # Calendar / kick-off features. gameday looks like "MM/DD(weekday)" and
    # time like "HH:MM"; month and hour are only intermediate values.
    month = df["gameday"].str.split("/").str[0].astype(int)
    hour = df["time"].str.split(":").str[0].astype(int)
    # (0, 14] afternoon, (14, 17] evening, (17, 24] night.
    hour_category = pd.cut(
        hour, bins=[0, 14, 17, 24], labels=["afternoon", "evening", "night"]
    )
    # Saturday, Sunday or public-holiday marker anywhere in the gameday text.
    df["is_weekend"] = df["gameday"].str.contains("[土日祝]").astype(int)
    # Months 1-3 / 4-6 / 7-9 / 10-12; December therefore falls in "autumn"
    # and gets its own flag below.
    season = pd.cut(
        month, bins=[0, 3, 6, 9, 12], labels=["winter", "spring", "summer", "autumn"]
    )
    df["December_flg"] = (month == 12).astype(int)

    # TV coverage: number of channels, NHK flag, and whether every listed
    # channel is a paid one.
    channels = df["tv"].str.split("／")
    df["tv_N"] = channels.apply(len)
    df["tv_NHK_flg"] = df["tv"].str.contains("ＮＨＫ", regex=False).astype(int)
    paid_channels = ("スカパー", "ｅ２")
    df["tv_paid"] = channels.apply(
        lambda names: int(all(any(key in ch for key in paid_channels) for ch in names))
    )

    # Weather is reduced to the first character of its description and
    # combined with season and kick-off slot into one categorical key.
    weather_cat = df["weather"].str[0]
    df["season_hour_weather"] = (
        season.astype(str) + "_" + hour_category.astype(str) + "_" + weather_cat.astype(str)
    )

    # Absolute goal difference (author's hypothesis: one-sided games are less
    # attractive).
    df["score_diff"] = (df["home_score"] - df["away_score"]).abs()

    # Remove raw columns that were consumed above or are deliberately unused
    # for now (player names, id, referee, duplicated team names, year).
    player_cols = [f"{side}_{i:02d}" for side in ("home", "away") for i in range(1, 12)]
    consumed_cols = [
        "id", "referee", "address", "home_team", "away_team",
        "humidity", "temperature", "match", "time", "year",
        "tv", "weather", "home_score", "away_score", "name", "gameday",
    ]
    return df.drop(columns=player_cols + consumed_cols)

# Run feature engineering on the merged training table and preview one row.
train_df_processed = make_features(data=train_df)
display(train_df_processed.iloc[:1])

Unnamed: 0,y,stage,home,away,stadium,capa,y_capa_ratio,y_capa_ratio_log,prefecture,discomfort_category,round,J1_flg,is_weekend,December_flg,tv_N,tv_NHK_flg,tv_paid,season_hour_weather,score_diff
0,18250,Ｊ１,ベガルタ仙台,鹿島アントラーズ,ユアテックスタジアム仙台,19694,0.926678,0.655797,宮城県,寒い,1,1,1,0,4,1,0,winter_afternoon_雨,1


## 学習


In [4]:
# Columns that are the target (or derived from it) plus raw capacity; these
# are excluded from the model inputs.
drop_cols = ["y", "y_capa_ratio", "y_capa_ratio_log", "capa"]

_all_cols = train_df_processed.columns
feature_cols = _all_cols[~_all_cols.isin(drop_cols)].tolist()

# Inputs, and the target: log1p of attendance / capacity.
X = train_df_processed.loc[:, feature_cols]
y = train_df_processed["y_capa_ratio_log"]

# Split features by dtype: object/category columns get encoded, the rest are
# passed through as numeric.
categorical_cols = list(X.select_dtypes(include=["object", "category"]).columns)
num_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()

In [5]:
# カテゴリ変数はTarget Encoding。数値は何もしない。
def make_preprocess():
    """Return a fresh, unfitted preprocessing step.

    Columns in the module-level ``categorical_cols`` are target-encoded
    (continuous target, 5-fold internal cross-fitting, shuffled with seed 42);
    columns in ``num_cols`` are passed through unchanged.
    """
    encoder = TargetEncoder(
        categories="auto",
        target_type="continuous",
        smooth="auto",
        cv=5,
        shuffle=True,
        random_state=42,
    )
    column_steps = [
        ("target_encoder", encoder, categorical_cols),
        ("num", "passthrough", num_cols),
    ]
    return ColumnTransformer(transformers=column_steps)

def make_pipeline(model):
    """Wrap ``model`` in a pipeline that preprocesses the features first.

    Parameters
    ----------
    model : estimator
        Regressor placed as the final ``'model'`` step.

    Returns
    -------
    Pipeline
        Two steps: ``'preprocess'`` (from ``make_preprocess()``) and ``'model'``.
    """
    steps = [("preprocess", make_preprocess())]
    steps.append(("model", model))
    return Pipeline(steps)

# Candidate regressors to compare, keyed by display name.
models = {
    # --- linear family ---
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "Lasso": Lasso(alpha=0.1, max_iter=10000, random_state=42),
    "ElasticNet": ElasticNet(
        alpha=0.01, l1_ratio=0.5, max_iter=10000, random_state=42
    ),
    # --- robust / quantile linear models ---
    "HuberRegressor": HuberRegressor(max_iter=10000, epsilon=1.35),
    "TheilSenRegressor": TheilSenRegressor(
        random_state=42, max_subpopulation=10000
    ),
    "QuantileRegressor": QuantileRegressor(
        quantile=0.5, alpha=0.01, solver="highs"
    ),
    # --- tree ensembles ---
    "RandomForest": RandomForestRegressor(
        n_estimators=200,
        max_depth=5,
        min_samples_leaf=10,
        max_features="sqrt",
        random_state=42,
        n_jobs=-1,
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        min_samples_leaf=10,
        subsample=0.8,
        random_state=42,
    ),
    "XGBoost": XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=0,
    ),
    "LightGBM": LGBMRegressor(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=-1,
    ),
    "CatBoost": CatBoostRegressor(
        iterations=300,
        learning_rate=0.05,
        depth=4,
        random_seed=42,
        verbose=False,
    ),
    # --- kernel / instance-based ---
    "SVR": SVR(C=1.0, epsilon=0.2, kernel="rbf"),
    "KNN": KNeighborsRegressor(n_neighbors=5),
}

# Shared 5-fold splitter so every model is scored on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Mean cross-validated RMSE per model, measured on the log-ratio target.
results = {}
for name, model in models.items():
    fold_scores = cross_val_score(
        make_pipeline(model),
        X,
        y,
        cv=kf,
        scoring="neg_root_mean_squared_error",
    )
    # The scorer returns negated RMSE; flip the sign back.
    results[name] = -fold_scores.mean()

# Leaderboard, best (lowest RMSE) first.
results_df = pd.DataFrame(
    {"モデル": list(results.keys()), "平均RMSE": list(results.values())}
).sort_values(by="平均RMSE")
display(results_df)



Unnamed: 0,モデル,平均RMSE
11,CatBoost,0.076905
8,GradientBoosting,0.077155
9,XGBoost,0.077172
10,LightGBM,0.078921
0,LinearRegression,0.079624
4,HuberRegressor,0.080148
1,Ridge,0.080945
7,RandomForest,0.083827
5,TheilSenRegressor,0.084003
6,QuantileRegressor,0.089272


In [6]:
# Refit the chosen model on the full training set.
# model_name is also used later to build the submission file name.
model_name = "XGBoost"
final_model = models[model_name]
final_pipeline = make_pipeline(model=final_model)
final_pipeline.fit(X, y=y)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('target_encoder', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,target_type,'continuous'
,smooth,'auto'
,cv,5
,shuffle,True
,random_state,42

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


## 推論

In [None]:
# Apply the same feature engineering to the evaluation data and predict the
# log1p(attendance / capacity) target.
test_df_processed = make_features(test_df)
X_test = test_df_processed[feature_cols]
display(X_test.shape)

# Numerical safety: infinities are mapped to +/-20 (NaN becomes 0.0, the
# nan_to_num default), then everything is bounded to [-20, 20].
y_pred_log_ratio = np.clip(
    np.nan_to_num(final_pipeline.predict(X_test), neginf=-20.0, posinf=20.0),
    -20.0,
    20.0,
)

# Undo log1p and keep the ratio inside [0, 1].
ratio_hat = np.clip(np.expm1(y_pred_log_ratio), 0.0, 1.0)

# Convert the ratio to a head count, never exceeding stadium capacity.
y_capa = test_df_processed["capa"].to_numpy(dtype=float)
y_pred_test_final = np.minimum(ratio_hat * y_capa, y_capa)

y_pred_test_final

## 提出ファイル作成

In [None]:
# Pair every evaluation match id with its predicted attendance.
submit = test_df[["id"]].assign(pred=y_pred_test_final)

# Minute-resolution timestamp used in the output file name.
time = datetime.now().strftime("%Y%m%d_%H%M")

# Writing the CSV is currently disabled; only the intended file name is printed.
#submit.to_csv(f"C:\\Users\\n-kozuma\\program_files\\90_JLeague\\submit\\submit_{model_name}_{time}.csv",  sep=",", index=False, header=False)
print(f"submit_{model_name}_{time}.csv")

display(submit)