ライブラリインポート

In [1]:
import pandas as pd
from sklearn import preprocessing
from tqdm import tqdm
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score
import warnings
import random
import pickle


データセットの前処理

In [2]:
#データセットのロード
dataset = pd.read_pickle('./data/dataset2.pkl')
#使用する列名を指定
resultCol = [
    '日付','raceId','枠番','馬番','horseId','性','年齢','斤量',
    'jockeyId','単勝','人気','trainerId','拠点','馬体重','体重増減',
    '出走間隔','ハンデ','着順','R','コース種','コース回り','距離','天気',
    '馬場','開催場所','グレード','制限'
]
recordCol = [
    'R','頭数','枠番','馬番','単勝','人気','着順','jockeyId','斤量',
    'タイム','着差','上り','馬体重','体重増減','出走間隔','コース種',
    'コース回り','距離','天気','馬場','開催場所','グレード','制限','ハンデ'
]
pedCol = ['pedId_' + str(i) for i in range(0,62)]
#前N走分戦績の列名を生成
recordCol9 = []
for i in range(1, 10):
    tmpList = list(map(lambda x: x + '_' + str(i), recordCol))
    recordCol9 += tmpList
#列名を合体
COLUMNS = resultCol + recordCol9 + pedCol
#データセット
dataset = dataset[COLUMNS]

In [3]:
#ラベルエンコーディング関数の定義
def labelEncode(df, target, recflg=False):
    #複数列のラベルエンコーディング関数の定義
    def listEncoder(tdf, le, cols):
        #データフレームのコピー
        tdf_ = tdf.copy()
        #列名から値を取り出す
        encoList = []
        for col in cols:
            encoList += tdf_[col].unique().tolist()
        #エンコーダーを生成
        le.fit(encoList)
        #複数列分ループ
        for col in tqdm(cols, desc=cols[0]):
            #欠損データ以外の列を取り出す
            notNull = tdf_[col][tdf_[col].notnull()]
            #エンコード実行してindexをキーにデータフレームに書き込む
            tdf_[col] = pd.Series(le.transform(notNull), index=notNull.index)
            #エンコードした列はcategory列に変換
            tdf_[col] = tdf_[col].astype('category')
        return tdf_, le
    #データフレームのコピー
    tdf = df.copy()
    #ラベルエンコーダーをインスタンス
    le = preprocessing.LabelEncoder()
    #戦績かどうかで分岐
    if not recflg:
        #リストかどうかで分岐
        if type(target) != list:
            #エンコーダーの生成
            le.fit(tdf[target])
            #欠損データ以外の列を取り出す
            notNull = tdf[target][tdf[target].notnull()]
            #エンコード実行してindexをキーにデータフレームに書き込む
            tdf[target] = pd.Series(le.transform(notNull), index=notNull.index)
            #エンコードした列はcategory列に変換
            tdf[target] = tdf[target].astype('category')
        else:
            #戦績以外で複数データだったら複数列エンコードの実行
            tdf, le = listEncoder(tdf, le, target)
    else:
        #戦績データは列名にサフィックスを付与したリストを生成
        cols9 = [target] + [target + '_' + str(i) for i in range(1, 10)]
        #複数列エンコードの実行
        tdf, le = listEncoder(tdf, le, cols9)
    #データフレームとエンコーダーをreeturn
    return tdf, le

#データフレームコピー
df = dataset.copy()
#カテゴリ変数をラベルエンコード
horseList = ['horseId'] + ['pedId_' + str(i) for i in range(0,62)]
df, leHorse = labelEncode(df,horseList)
df, leGender = labelEncode(df,'性')
df, leTrainer = labelEncode(df,'trainerId')
df, leHomeBase = labelEncode(df,'拠点')
df, leJockey = labelEncode(df,'jockeyId',recflg=True)
df, leHandi = labelEncode(df,'ハンデ',recflg=True)
df, leType = labelEncode(df,'コース種',recflg=True)
df, leDir = labelEncode(df,'コース回り',recflg=True)
df, leWether = labelEncode(df,'天気',recflg=True)
df, leCondition = labelEncode(df,'馬場',recflg=True)
df, lePlace = labelEncode(df,'開催場所',recflg=True)
df, leGrade = labelEncode(df,'グレード',recflg=True)
df, leRegulation = labelEncode(df,'制限',recflg=True)
#量的変数の列名を生成
numericCols = ['年齢']
cols1 = ['枠番','馬番','単勝','人気','斤量','馬体重',
         '体重増減','出走間隔','着順','R','距離']
cols2 = ['頭数','着順','タイム','着差','上り']
numericCols += cols1
cols3 = cols1 + cols2
for i in range(1,10):
    numericCols += map(lambda x: x + '_' + str(i),cols3)
#量的変数に対して片変数を実行
for col in tqdm(numericCols):
    df[col] = df[col].astype(float)
    

horseId: 100%|██████████| 63/63 [00:04<00:00, 12.76it/s]
jockeyId: 100%|██████████| 10/10 [00:00<00:00, 29.23it/s]
ハンデ: 100%|██████████| 10/10 [00:00<00:00, 42.51it/s]
コース種: 100%|██████████| 10/10 [00:00<00:00, 49.91it/s]
コース回り: 100%|██████████| 10/10 [00:00<00:00, 49.82it/s]
天気: 100%|██████████| 10/10 [00:00<00:00, 51.05it/s]
馬場: 100%|██████████| 10/10 [00:00<00:00, 44.67it/s]
開催場所: 100%|██████████| 10/10 [00:00<00:00, 50.03it/s]
グレード: 100%|██████████| 10/10 [00:00<00:00, 47.01it/s]
制限: 100%|██████████| 10/10 [00:00<00:00, 43.45it/s]
100%|██████████| 156/156 [00:00<00:00, 624.80it/s]


学習実行

In [4]:
#着順から正解列を生成
df['Accu'] = df['着順'].map(lambda x: 1 if x <= 3 else 0)
#日付をキーに訓練データと検証データに分割
sepdt = '2023/01/01'
train = df[df['日付']<sepdt]
test = df[df['日付']>=sepdt]

  df['Accu'] = df['着順'].map(lambda x: 1 if x <= 3 else 0)


In [5]:
#それぞれ教師データと訓練データに分割
train_x = train.drop(['日付','着順','raceId','Accu'],axis=1)
train_y = train['Accu']
test_x = test.drop(['日付','着順','raceId','Accu'],axis=1)
test_y = test['Accu']


In [6]:
test_x

Unnamed: 0,枠番,馬番,horseId,性,年齢,斤量,jockeyId,単勝,人気,trainerId,...,pedId_52,pedId_53,pedId_54,pedId_55,pedId_56,pedId_57,pedId_58,pedId_59,pedId_60,pedId_61
41796,1.0,1.0,41422,1,2.0,55.0,50,23.9,5.0,130,...,5586,1014,5585,8705,1340,1381,3906,4992,994,4991
41797,2.0,2.0,41288,2,2.0,55.0,96,61.8,7.0,188,...,4856,1049,4855,354,1436,1077,4200,6906,550,4371
41798,3.0,3.0,41322,1,2.0,53.0,135,18.8,4.0,194,...,5118,1182,5117,17759,1599,1027,4723,384,1486,5493
41799,4.0,4.0,42683,1,2.0,55.0,155,16.6,3.0,162,...,6014,1030,5403,7145,1342,1204,4365,98,1425,4020
41800,5.0,5.0,41609,2,2.0,55.0,108,1.2,1.0,37,...,5385,996,4301,10864,16954,761,194,10863,561,713
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76887,6.0,8.0,33579,1,4.0,53.0,146,3.5,1.0,163,...,4856,1049,4855,17593,527,1341,102,16757,383,16270
76888,7.0,9.0,38961,2,3.0,55.0,62,15.2,8.0,107,...,4781,1536,4529,9962,107,1014,4301,9961,599,9960
76889,7.0,10.0,36732,1,3.0,50.0,131,23.1,9.0,95,...,6313,1560,4775,20039,1729,1031,5115,7930,1421,4880
76890,8.0,11.0,38892,0,3.0,55.0,10,5.0,3.0,95,...,4511,1349,4512,3283,1614,1075,4236,6167,1575,4631


In [7]:
#モデルをインスタンスして学習の実行
model = lgb.LGBMClassifier()
model.fit(train_x,train_y)
#学習モデルの保存
with open('model.pkl', mode='wb') as f:
    pickle.dump(model,f,protocol=2)

[LightGBM] [Info] Number of positive: 9239, number of negative: 32557
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045392 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 57815
[LightGBM] [Info] Number of data points in the train set: 41796, number of used features: 302
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221050 -> initscore=-1.259559
[LightGBM] [Info] Start training from score -1.259559


### 学習モデルの評価
Accuracy score:単純正解率。
0を0と予測したものも正解とカウントさてるから単純に全部0と予想しても7割ぐらいの正解率になるからなし

Precision score:精度。1に分類したものが実際に1だった割合
Recall score:検出率。1のものを1として分類できた割合
F1 score:PrecisionとRecallを複合したスコア

In [8]:
eval_df = test.copy()
eval_df[eval_df['raceId']=='202301010301']

Unnamed: 0,日付,raceId,枠番,馬番,horseId,性,年齢,斤量,jockeyId,単勝,...,pedId_53,pedId_54,pedId_55,pedId_56,pedId_57,pedId_58,pedId_59,pedId_60,pedId_61,Accu
42093,2023/07/29,202301010301,1.0,1.0,42065,2,2.0,55.0,106,10.5,...,1228,4734,10431,1536,1075,4403,6396,1632,4698,0
42094,2023/07/29,202301010301,2.0,2.0,43113,2,2.0,52.0,143,232.3,...,1038,4316,291,1027,1031,4066,6326,1536,6325,0
42095,2023/07/29,202301010301,3.0,3.0,43296,2,2.0,55.0,108,13.6,...,1075,5818,6464,2065,1027,5414,6463,572,4141,0
42096,2023/07/29,202301010301,4.0,4.0,42727,2,2.0,55.0,155,1.2,...,1560,6438,9740,1433,1420,4452,3305,1528,6023,1
42097,2023/07/29,202301010301,5.0,5.0,42759,2,2.0,55.0,4,6.3,...,1806,5317,3162,1763,1031,5244,6477,1781,6476,0
42098,2023/07/29,202301010301,6.0,6.0,43074,1,2.0,55.0,47,24.2,...,991,6006,9392,1745,1467,5161,9391,1446,9390,1
42099,2023/07/29,202301010301,7.0,7.0,41640,2,2.0,55.0,61,24.7,...,1722,7568,18892,459,1421,5318,17508,139,16985,1
42100,2023/07/29,202301010301,8.0,8.0,42400,2,2.0,53.0,135,24.5,...,1806,5317,17871,1678,1528,4957,28,1031,5242,0
42101,2023/07/29,202301010301,8.0,9.0,42370,1,2.0,52.0,136,350.0,...,1038,4285,3093,1528,1506,4511,5586,1014,5585,0


In [12]:
#検証データに対して予測実行
prad = model.predict(test_x)
#結果可視化の為に元のデータセットをコピー
eval_df = test.copy()

#予測結果を列として追加
eval_df['pred'] = prad
#1となる確率を列として追加
eval_df['proba'] = model.predict_proba(test_x)[:,1]
#raceIdでグルーピングで1となる確率が高い順にランク付けを実行
eval_df['予測着順'] = eval_df.groupby('raceId')['proba'].rank(ascending=False)
#的中率の計算
accCount = len(eval_df[(eval_df['予測着順']==1)&(eval_df['着順']==1)])
raceCount = len(eval_df['raceId'].unique())
#回収率の計算
bet = raceCount * 100
ret = int(eval_df[(eval_df['予測着順']==1)&(eval_df['着順']==1)]['単勝'].sum()*100)

print(test_x)

# #各種評価スコアの表示
# print('Accuracy score\t: {}'.format(accuracy_score(prad,test_y)))
# print('Precision score\t: {}'.format(precision_score(prad,test_y)))
# print('Recall score\t: {}'.format(recall_score(prad,test_y)))
# print('F1 score\t: {}'.format(f1_score(prad,test_y)))
# #的中率と回収率の出力
# print('的中率\t\t:{}/{}({:0.1f}%)'.format(accCount,raceCount,accCount / raceCount * 100))
# print('回収率\t\t:{}/{}({:0.1f}%)'.format(ret,bet,ret / bet * 100))


Accuracy score	: 0.7877820834283109
Precision score	: 0.31919034643830285
Recall score	: 0.5277837373954087
F1 score	: 0.3978007761966365
的中率		:694/2568(27.0%)
回収率		:207539/256800(80.8%)


In [13]:
warnings.simplefilter('ignore')

eval_df['正規化値'] = eval_df.groupby('raceId')['proba']\
    .apply(lambda x: preprocessing.minmax_scale(x)).explode().tolist()
eval_df[eval_df['raceId']=='202301010301']
eval_df['標準化分散値'] = eval_df.groupby('raceId')['proba']\
    .apply(lambda x: preprocessing.scale(x)).explode().tolist()
view_df = eval_df[['raceId','日付','馬番','馬名','horseId','枠番','単勝','人気','着順','予測着順',
                   'pred','proba','正規化値','標準化分散値']]
marks = []
for dat in tqdm (view_df.iterrows(), total=len(view_df)):
    mark = ''
    chk1 = dat[1]['着順']
    chk2 = dat[1]['予測着順']
    chk3 = dat[1]['pred']
    if chk1 <= 3 and chk2 <= 3:
        mark = '◯'
    if chk2 == 1 and chk2 == 1:
        mark = '◎'
    marks.append(mark)
    view_df['予測印'] = view_df['pred'].map(lambda x: '◯' if x == 1 else '')
view_df['的中印'] = marks

KeyError: "['馬名'] not in index"

In [None]:
view_df[view_df['raceId']==random.choice(view_df['raceId'].unique())].sort_values('着順')

Unnamed: 0,raceId,日付,馬番,horseId,枠番,単勝,人気,着順,予測着順,pred,proba,正規化値,標準化分散値,予測印,的中印
48262,202304010604,2023/05/14,1.0,29817,1.0,4.9,3.0,1.0,5.0,0,0.26485,0.354375,0.156788,,
48267,202304010604,2023/05/14,6.0,30157,5.0,2.6,1.0,2.0,1.0,1,0.738669,1.0,2.199317,◯,◎
48273,202304010604,2023/05/14,12.0,32537,8.0,3.5,2.0,3.0,2.0,1,0.620591,0.839106,1.690305,◯,◯
48265,202304010604,2023/05/14,4.0,28309,4.0,10.2,5.0,4.0,6.0,0,0.225356,0.300561,-0.013463,,
48266,202304010604,2023/05/14,5.0,28596,5.0,5.5,4.0,5.0,3.0,0,0.333653,0.448126,0.453382,,
48272,202304010604,2023/05/14,11.0,29977,8.0,67.7,9.0,6.0,9.0,0,0.05491,0.068311,-0.748218,,
48264,202304010604,2023/05/14,3.0,32169,3.0,29.2,6.0,7.0,7.0,0,0.078809,0.100876,-0.645193,,
48271,202304010604,2023/05/14,10.0,30500,7.0,357.6,12.0,8.0,11.0,0,0.007949,0.004323,-0.950655,,
48270,202304010604,2023/05/14,9.0,32097,7.0,41.5,7.0,9.0,4.0,0,0.306006,0.410455,0.334203,,
48263,202304010604,2023/05/14,2.0,33354,2.0,41.6,8.0,10.0,8.0,0,0.070139,0.089062,-0.68257,,


In [None]:

view_df = df[['raceId','馬名','馬番','枠番','予測着順']]

KeyError: "['馬名', '予測着順'] not in index"

In [10]:
datas = []
for dr,dn,db,dw,dy in zip(view_df['raceId'],view_df['馬名'],view_df['馬番'],view_df['枠番'],view_df['予測着順']):
  datas += [dr,dn,int(db),int(dw),int(dy)]

print(datas)

NameError: name 'view_df' is not defined

In [None]:
view_df = view_df[['raceId','馬番','枠番','予測着順']]
view_lists =view_df[view_df['raceId']==random.choice(view_df['raceId'].unique())].sort_values('予測着順').to_numpy().tolist()

view_list = view_lists[0]+view_lists[1]+view_lists[2]+view_lists[3]+view_lists[4]
# view_list = view_list[['raceId','馬番','枠番','予測着順']]
# for i in view_list :
#     view_lists[i] = int(view_list[i])
view_lists['馬番','枠番','予測着順'] = (view_lists['馬番','枠番','予測着順'])
print(view_list)

TypeError: list indices must be integers or slices, not tuple