In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from xgboost import XGBClassifier

In [None]:
# Upload the 2015-2022 pitching dataset via google.colab's upload helper,
# then load the CSV by its (hard-coded) file name, resolved relative to the
# current working directory.
from google.colab import files

uploaded = files.upload()
pitch_data_all = pd.read_csv(
    filepath_or_buffer='pitch_data_2015_2022 - pitch_data_2015_2022.csv'
)

In [None]:
# Upload the 2023 pitching dataset and load it by its (hard-coded) file name.
# `files` is the google.colab helper imported in the 2015-2022 upload cell.
uploaded = files.upload()
pitch_data_2023 = pd.read_csv(
    filepath_or_buffer='pitch_data_2023 - pitch_data_2023.csv'
)

In [None]:
# Build the training (pitch_data_all) and validation (pitch_data_2023)
# feature matrices and targets.
#
# The feature list is defined ONCE and applied to both frames, so the train
# and test matrices always share the same columns in the same order.

# Features selected with the filter method.
filter_features = [
    'SO', 'W', 'IP', 'Pit', 'AB', 'SO/W', 'BF', 'BB', 'H', 'GS', 'GDP', 'R',
    '2B', 'ER', 'L', 'HR', 'G', 'SB', 'SO9', 'StS', 'CS', '3B', 'HBP', 'Arm',
    'Shoulder', 'Str', 'SF', 'PO',
]

# Features selected with the wrapper method (step-forward selection).
wrapper_forward_features = [
    'W', 'Shoulder', 'Arm', 'Elbow', 'Ribs', 'Oblique', 'Finger', 'Ankle',
    'Hip', 'Achilles', 'Wrist', 'Calf', 'Hand', 'Leg', 'Shin',
]

# Wrapper method (backward selection): no feature list has been recorded yet.

# Active feature set. Alternatives:
#   feature_columns = filter_features
#   all features (every column except 'Name', 'Lev', 'Tm', 'Injury', 'Year'):
#   feature_columns = list(pitch_data_all.columns.drop(['Name', 'Lev', 'Tm', 'Injury', 'Year']))
feature_columns = wrapper_forward_features

# Training data
x_train = pitch_data_all[feature_columns]
y_train = pitch_data_all['Injury']

# Validation data
x_test = pitch_data_2023[feature_columns]
y_test = pitch_data_2023['Injury']


# Fit the scikit-learn baseline classifiers on the training split.

# Logistic regression
model1 = LogisticRegression()
model1.fit(x_train, y_train)

# Random forest. The fixed seed makes the forest's randomised tree building
# repeatable, so the reported metrics do not change between runs of the
# notebook.
model2 = RandomForestClassifier(random_state=42)
model2.fit(x_train, y_train)

# k-nearest neighbours (k = 4)
model3 = KNeighborsClassifier(n_neighbors=4)
model3.fit(x_train, y_train)

# XGBoost (gradient-boosted trees) for the binary Injury target.
#
# Two fixes relative to the earlier configuration:
#   * objective: a regression loss ("reg:squarederror") was being used on a
#     classifier; "binary:logistic" makes predict_proba return probabilities.
#   * n_estimators: the number of boosting rounds in the scikit-learn wrapper
#     is set with `n_estimators`; `num_round` is not a recognised argument and
#     left the tree count at the library default.
model4 = XGBClassifier(
    booster="gbtree",             # booster type (tree models: gbtree or dart; linear model: gblinear)
    learning_rate=1,              # shrinkage applied to each tree; 1 means no shrinkage
    min_split_loss=0,             # minimum loss reduction required to add a further split
    max_depth=6,                  # maximum depth of each tree
    min_child_weight=1,           # minimum sum of instance weight required in a leaf
    subsample=1,                  # fraction of rows sampled for each tree
    sampling_method="uniform",    # row sampling method
    colsample_bytree=1,           # fraction of features sampled for each tree
    colsample_bylevel=1,          # fraction of features sampled at each depth level
    reg_lambda=1,                 # L2 regularisation penalty
    reg_alpha=0,                  # L1 regularisation penalty
    tree_method="auto",           # tree construction algorithm
    process_type="default",       # boosting process to run
    grow_policy="depthwise",      # policy controlling how new leaves are added
    max_leaves=0,                 # maximum number of leaves to add (0 = no limit)
    objective="binary:logistic",  # loss minimised during training (binary classification)
    n_estimators=9,               # number of boosting rounds (= number of trees built)
)

# Train the model
model4.fit(x_train, y_train)

In [None]:
# Predict on the validation features and print a preview of each model's output.
def _predict_and_preview(label, model, n_rows=30):
    """Predict on x_test, print the first `n_rows` predictions, return them all."""
    predictions = model.predict(x_test)
    print(label, predictions[:n_rows])
    return predictions


y_pred1 = _predict_and_preview('LogisticRegression:', model1)

y_pred2 = _predict_and_preview('RandomForest:', model2)

y_pred3 = _predict_and_preview('KNeighbors:', model3)

y_pred4 = _predict_and_preview('XGBoost:', model4)

LogisticRegression: [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0]
RandomForest: [0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0]
KNeighbors: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
XGBoost: [0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]


In [None]:
# Model evaluation against y_test.

# Accuracy: one line per model, in the same order as the prediction cell.
from sklearn.metrics import accuracy_score

print('accuracy_score:')
for label, predicted in (
    ('LogisticRegression:', y_pred1),
    ('RandomForest:', y_pred2),
    ('KNeighbors:', y_pred3),
    ('XGBoost:', y_pred4),
):
    print(label, accuracy_score(y_test, predicted))

# AUC: scored from each model's second predict_proba column.
from sklearn.metrics import roc_auc_score

print('AUC:')
for label, fitted_model in (
    ('LogisticRegression:', model1),
    ('RandomForest:', model2),
    ('KNeighbors:', model3),
    ('XGBoost:', model4),
):
    y_pred_prob = fitted_model.predict_proba(x_test)[:, 1]
    print(label, roc_auc_score(y_test, y_pred_prob))

# F score: computed from the hard class predictions of each model.
from sklearn.metrics import f1_score

print('F score:')
for label, predicted in (
    ('LogisticRegression:', y_pred1),
    ('RandomForest:', y_pred2),
    ('KNeighbors:', y_pred3),
    ('XGBoost:', y_pred4),
):
    print(label, f1_score(y_test, predicted))

accuracy_score:
LogisticRegression: 0.6590126291618829
RandomForest: 0.6601607347876005
KNeighbors: 0.661308840413318
XGBoost: 0.6681974741676234
AUC:
LogisticRegression: 0.6601463443535964
RandomForest: 0.6329985780305724
KNeighbors: 0.6093464865505391
XGBoost: 0.6138553146107358
F score:
LogisticRegression: 0.28433734939759037
RandomForest: 0.3451327433628319
KNeighbors: 0.21333333333333332
XGBoost: 0.35634743875278396
