In [1]:
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("./data/df_rc_hr_jk_tr_r.csv", index_col=0)
df = df.drop(["trName", 'jkName', "jkNo_y", 'jkNo_x', 'trNo_x', "rcTime","hrNo", "trNo_y", "ord1CntT_x",
             "ord2CntT_x", "ord3CntT_x", "ord1CntT_y", "ord2CntT_y", "ord3CntT_y", "ord1CntT", "ord2CntT",
             "ord3CntT"], axis=1)
df = df.dropna()
df = df.loc[df.ord <= 12]


  df = pd.read_csv("./data/df_rc_hr_jk_tr_r.csv", index_col=0)


In [3]:
# Convert the text content of a rank column to a number, retaining only the numeric portion.
import re
values = df["rank"].unique()
for value in values:
    strs = re.sub(r'[^0-9]', '', value)
    df = df.replace({"rank": value}, strs)

In [4]:
df = df[df["rank"] != ""]

In [5]:
len(df)

215852

In [6]:
values = df["rank"].unique()
for value in values:
    num = int(value)
    df = df.replace({"rank": value}, num)

In [7]:
df["rank"].unique()

array([4, 3, 1, 2, 5, 6])

In [8]:
# Separate the race records into individual data frames for each race 
# and store them in the 'races' variable.
grouped = df.groupby(["rcDate", "meet", "rcNo"])
races = []
for (k1, k2, k3), group in grouped:
    races.append(group)


In [9]:
races_8 = [x for x in races if len(x) >= 8]   # Only races with 8~12 horses are selected
len(races_8)

19825

In [10]:
# Divide by grade
races_1 = [x for x in races_8 if x["rank"].mean() == 1]
races_2 = [x for x in races_8 if x["rank"].mean() == 2]
races_3 = [x for x in races_8 if x["rank"].mean() == 3]
races_4 = [x for x in races_8 if x["rank"].mean() == 4]
races_5 = [x for x in races_8 if x["rank"].mean() == 5]
races_6 = [x for x in races_8 if x["rank"].mean() == 6]

In [11]:
print(len(races_1), len(races_2), len(races_3), len(races_4), len(races_5), len(races_6))

2652 1424 2397 4333 4127 4892


### Grade 1 horse Model Evaluation

In [12]:
year = 250              # races per year
predict = []            # List of 100 game predictions after training data
predict_p = []          # List of probabilities for the winning streak
predict_b = []          # List of 100 game predictions before training data
m_feature = []          # feature importance for each evaluation

for i in range(5):       # Evaluation 5 times over different periods 
    end = i * 100 + 200
    races_m = races_1[-(year+ end):-end]
    models = []
# Make a model corresponding to the number of races for a year and train each race
    for j in range(len(races_m)):          
        X = races_m[j].drop(['meet', 'rcDate','rcNo', 'ord', 'rank'], axis = 1)
        y = races_m[j]["ord"]
        model = DecisionTreeClassifier(random_state = 7)
        model.fit(X, y)
        models.append(model)       # Include model that will participate in voting in models

    features = []
# Record the average value of feature importance of the model 
# that participated in the voting for each evaluation
    for j in range(len(models)):     
        feature = models[j].feature_importances_
        features.append(feature)
    features = np.array(features)
    m_feature.append(features.mean(axis = 0))    
    
    ## Test 100 races after training data
    test_li = races_1[-end: -(end-100)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)   # Summing the number of No.1 prediction by index
        num = np.argmax(ords)  # Index that received the most votes is the No.1 predicted by the model
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()  
    # Based on the predicted results of 100 races, 
    # the actual number of horses ranked first, second, and third was extracted
    win1 = order[0]
    win2 = order[1]
    win3 = order[2]
    predict.append(win1)  # A result of predicting the actual No.1 horse as No.1
    predict_p.append(win1 + win2 + win3)
    
    ## Training Data Tested 100 Races from 2 Years Ago
    test_li = races_1[-(3*year+end+100):-(3*year+end)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = ords.argmax()
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    predict_b.append(win1)

 

In [13]:
# 100 games forecast after training data
predict

[25, 24, 20, 23, 20]

In [14]:
# Training data 100 games forecast of 2 years ago
predict_b

[25, 19, 20, 17, 21]

In [15]:
# odds of winning the winning streak
predict_p

[52, 55, 45, 51, 51]

In [16]:
# Featgure Importance
feature = []

for i in range(len(m_feature)):
    feature.append(np.argsort(m_feature[i])[::-1][:5])

importance = []    
for col in feature:
    importance.append(X.columns[col])

importance    

[Index(['ord1Ratio_y', 'ord2CntY', 'chulNo', 'rcCntT_y', 'winRateT'], dtype='object'),
 Index(['ord1Ratio_y', 'ord2CntY', 'chulNo', 'winRateT', 'ord3Ratio_x'], dtype='object'),
 Index(['ord1Ratio_y', 'ord2CntY', 'winRateT', 'chulNo', 'ord3Ratio_x'], dtype='object'),
 Index(['ord1Ratio_y', 'ord2CntY', 'winRateT', 'chulNo', 'ord3Ratio_x'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'ord2CntY', 'winRateT', 'ord3Ratio_x'], dtype='object')]

### Grade 2 horse Model Evaluation

In [17]:
year = 150
predict = []
predict_p = []
predict_b = []
m_feature = []

for i in range(5):
    end = i * 100 + 200
    races_m = races_2[-(year+ end):-end]
    models = []

    for j in range(len(races_m)):
        X = races_m[j].drop(['meet', 'rcDate','rcNo', 'ord', 'rank'], axis = 1)
        y = races_m[j]["ord"]
        model = DecisionTreeClassifier(random_state = 7)
        model.fit(X, y)
        models.append(model)

    features = []
    for j in range(len(models)):
        feature = models[j].feature_importances_
        features.append(feature)
    features = np.array(features)
    m_feature.append(features.mean(axis = 0))    
    
    ## 훈련데이터 이후 100 경주 테스트
    test_li = races_2[-end: -(end-100)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = np.argmax(ords)
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    win2 = order[1]
    win3 = order[2]
    predict.append(win1)
    predict_p.append(win1 + win2 + win3)
    
    ## 훈련데이터 2년전 100 경주 테스트
    test_li = races_2[-(3*year+end+100):-(3*year+end)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = ords.argmax()
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    predict_b.append(win1)


In [18]:
# 훈련데이터 이후 100경기 예측 결과
predict

[16, 19, 19, 20, 21]

In [19]:
# 훈련데이터 2년전 100경기 예측 결과
predict_b

[22, 20, 26, 19, 20]

In [20]:
# 연승식 담첨 확률
predict_p

[47, 54, 56, 43, 47]

In [21]:
# Featgure Importance
feature = []

for i in range(len(m_feature)):
    feature.append(np.argsort(m_feature[i])[::-1][:5])

importance = []    
for col in feature:
    importance.append(X.columns[col])

importance    

[Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'rcCntT_y', 'age'], dtype='object'),
 Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'ord2CntY', 'rcCntT_y'], dtype='object'),
 Index(['ord1Ratio_y', 'ord2CntY', 'winRateT', 'chulNo', 'ord3Ratio_x'], dtype='object'),
 Index(['ord1Ratio_y', 'ord2CntY', 'ord3Ratio_x', 'ord2Ratio', 'winRateT'], dtype='object'),
 Index(['ord1Ratio_y', 'ord2CntY', 'ord3Ratio_x', 'rcCntY_y', 'ord2Ratio'], dtype='object')]

### Grade 3 horse Model Evaluation

In [22]:
year = 250
predict = []
predict_p = []
predict_b = []
m_feature = []

for i in range(5):
    end = i * 100 + 200
    races_m = races_3[-(year+ end):-end]
    models = []

    for j in range(len(races_m)):
        X = races_m[j].drop(['meet', 'rcDate','rcNo', 'ord', 'rank'], axis = 1)
        y = races_m[j]["ord"]
        model = DecisionTreeClassifier(random_state = 7)
        model.fit(X, y)
        models.append(model)

    features = []
    for j in range(len(models)):
        feature = models[j].feature_importances_
        features.append(feature)
    features = np.array(features)
    m_feature.append(features.mean(axis = 0))    
    
    ## 훈련데이터 이후 100 경주 테스트
    test_li = races_3[-end: -(end-100)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = np.argmax(ords)
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    win2 = order[1]
    win3 = order[2]
    predict.append(win1)
    predict_p.append(win1 + win2 + win3)
    
    ## 훈련데이터 2년전 100 경주 테스트
    test_li = races_3[-(3*year+end+100):-(3*year+end)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = ords.argmax()
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    predict_b.append(win1)


In [23]:
# 훈련데이터 이후 100경기 예측 결과
predict

[32, 19, 22, 25, 25]

In [24]:
# 훈련데이터 2년전 100경기 예측 결과
predict_b

[20, 28, 27, 26, 24]

In [25]:
# 연승식 담첨 확률
predict_p

[64, 40, 50, 56, 56]

In [26]:
# Featgure Importance
feature = []

for i in range(len(m_feature)):
    feature.append(np.argsort(m_feature[i])[::-1][:5])

importance = []    
for col in feature:
    importance.append(X.columns[col])

importance  

[Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'age', 'ord3Ratio_y'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'age', 'ord2CntY'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'age', 'ord2CntY'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'ord2CntY', 'age'], dtype='object'),
 Index(['ord1Ratio_y', 'ord2CntY', 'chulNo', 'winRateT', 'ord3Ratio_x'], dtype='object')]

### Grade 4 horse Model Evaluation

In [27]:
year = 450
predict = []
predict_p = []
predict_b = []
m_feature = []

for i in range(5):
    end = i * 100 + 200
    races_m = races_4[-(year+ end):-end]
    models = []

    for j in range(len(races_m)):
        X = races_m[j].drop(['meet', 'rcDate','rcNo', 'ord', 'rank'], axis = 1)
        y = races_m[j]["ord"]
        model = DecisionTreeClassifier(random_state = 7)
        model.fit(X, y)
        models.append(model)

    features = []
    for j in range(len(models)):
        feature = models[j].feature_importances_
        features.append(feature)
    features = np.array(features)
    m_feature.append(features.mean(axis = 0))    
    
    ## 훈련데이터 이후 100 경주 테스트
    test_li = races_4[-end: -(end-100)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = np.argmax(ords)
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    win2 = order[1]
    win3 = order[2]
    predict.append(win1)
    predict_p.append(win1 + win2 + win3)
    
    ## 훈련데이터 2년전 100 경주 테스트
    test_li = races_4[-(3*year+end+100):-(3*year+end)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = ords.argmax()
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    predict_b.append(win1)


In [28]:
# 훈련데이터 이후 100경기 예측 결과
predict

[24, 27, 26, 24, 17]

In [29]:
# 훈련데이터 2년전 100경기 예측 결과
predict_b

[18, 23, 21, 20, 19]

In [30]:
# 연승식 담첨 확률
predict_p

[58, 60, 59, 49, 53]

In [31]:
# Featgure Importance
feature = []

for i in range(len(m_feature)):
    feature.append(np.argsort(m_feature[i])[::-1][:5])

importance = []    
for col in feature:
    importance.append(X.columns[col])

importance  

[Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'age', 'ord1CntY_x'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'age', 'ord3Ratio_y'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'age', 'ord3Ratio_y'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'age', 'ord3Ratio_y'], dtype='object'),
 Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'age', 'ord3Ratio_y'], dtype='object')]

### Grade 5 horse Model Evaluation

In [32]:
year = 400
predict = []
predict_p = []
predict_b = []
m_feature = []

for i in range(5):
    end = i * 100 + 200
    races_m = races_5[-(year+ end):-end]
    models = []

    for j in range(len(races_m)):
        X = races_m[j].drop(['meet', 'rcDate','rcNo', 'ord', 'rank'], axis = 1)
        y = races_m[j]["ord"]
        model = DecisionTreeClassifier(random_state = 7)
        model.fit(X, y)
        models.append(model)

    features = []
    for j in range(len(models)):
        feature = models[j].feature_importances_
        features.append(feature)
    features = np.array(features)
    m_feature.append(features.mean(axis = 0))    
    
    ## 훈련데이터 이후 100 경주 테스트
    test_li = races_5[-end: -(end-100)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = np.argmax(ords)
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    win2 = order[1]
    win3 = order[2]
    predict.append(win1)
    predict_p.append(win1 + win2 + win3)
    
    ## 훈련데이터 2년전 100 경주 테스트
    test_li = races_5[-(3*year+end+100):-(3*year+end)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = ords.argmax()
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    predict_b.append(win1)


In [33]:
# 훈련데이터 이후 100경기 예측 결과
predict

[30, 34, 28, 32, 22]

In [34]:
# 훈련데이터 2년전 100경기 예측 결과
predict_b

[24, 30, 26, 31, 22]

In [35]:
# 연승식 담첨 확률
predict_p

[55, 56, 51, 56, 42]

In [36]:
# Featgure Importance
feature = []

for i in range(len(m_feature)):
    feature.append(np.argsort(m_feature[i])[::-1][:5])

importance = []    
for col in feature:
    importance.append(X.columns[col])

importance  

[Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'age', 'ord1CntY_x'], dtype='object'),
 Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'age', 'ord1CntY_x'], dtype='object'),
 Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'age', 'ord1CntY_x'], dtype='object'),
 Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'age', 'ord1CntY_x'], dtype='object'),
 Index(['ord1Ratio_y', 'winRateT', 'chulNo', 'age', 'ord1CntY_x'], dtype='object')]

### Grade 6 horse Model Evaluation

In [37]:
year = 500
predict = []
predict_p = []
predict_b = []
m_feature = []

for i in range(5):
    end = i * 100 + 200
    races_m = races_6[-(year+ end):-end]
    models = []

    for j in range(len(races_m)):
        X = races_m[j].drop(['meet', 'rcDate','rcNo', 'ord', 'rank'], axis = 1)
        y = races_m[j]["ord"]
        model = DecisionTreeClassifier(random_state = 7)
        model.fit(X, y)
        models.append(model)

    features = []
    for j in range(len(models)):
        feature = models[j].feature_importances_
        features.append(feature)
    features = np.array(features)
    m_feature.append(features.mean(axis = 0))    
    
    ## 훈련데이터 이후 100 경주 테스트
    test_li = races_6[-end: -(end-100)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = np.argmax(ords)
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    win2 = order[1]
    win3 = order[2]
    predict.append(win1)
    predict_p.append(win1 + win2 + win3)
    
    ## 훈련데이터 2년전 100 경주 테스트
    test_li = races_6[-(3*year+end+100):-(3*year+end)]
    results = []

    for j in range(len(test_li)):
        x_test = test_li[j].drop(['meet', 'rcDate','rcNo', 'ord', "rank"], axis = 1)
        preds = []
        for k in range(len(models)):
            pred = models[k].predict(x_test)
            preds.append(pred)
        preds = np.array(preds)
        ords = (preds == 1).sum(axis = 0)
        num = ords.argmax()
        results.append(num)
    r = pd.Series(results)
    order = r.value_counts()
    win1 = order[0]
    predict_b.append(win1)


In [38]:
# 훈련데이터 이후 100경기 예측 결과
predict

[58, 47, 51, 34, 46]

In [39]:
# 훈련데이터 2년전 100경기 예측 결과
predict_b

[24, 17, 14, 27, 25]

In [40]:
# 연승식 담첨 확률
predict_p

[74, 73, 68, 64, 71]

In [41]:
# Featgure Importance
feature = []

for i in range(len(m_feature)):
    feature.append(np.argsort(m_feature[i])[::-1][:5])

importance = []    
for col in feature:
    importance.append(X.columns[col])

importance  

[Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'ord1CntY_x', 'ord3Ratio_y'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'ord1CntY_x', 'ord3Ratio_y'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'ord3Ratio_y', 'ord1CntY_x'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'ord3Ratio_y', 'ord1CntY_x'], dtype='object'),
 Index(['ord1Ratio_y', 'chulNo', 'winRateT', 'age', 'ord1CntY_x'], dtype='object')]