In [72]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report

In [73]:
# load training and testing data
df_train = pd.read_csv('training.csv')
df_test = pd.read_csv('testing.csv')

# drop meaningless first column
df_train.drop(columns='Unnamed: 0', inplace=True)
df_test.drop(columns='Unnamed: 0', inplace=True)

In [74]:
# preprocess X, y
def getInput(df, target):
    selected_features = (
        df['actual_weight'].values,
        df['declared_horse_weight'].values,
        df['draw'].values,
        df['recent_ave_rank'].values,
        df['jockey_ave_rank'].values,
        df['trainer_ave_rank'].values,
        df['race_distance'].values
    )
    scaler = preprocessing.StandardScaler()
    X_standardized = scaler.fit_transform(selected_features)
    X = np.stack(X_standardized, axis=1)
    y = df[target]
    return X, y

# print performance
def printPerformance(y_true, y_predict):
    tn, fp, fn, tp = confusion_matrix(y_true, y_predict).ravel()
    print('TN: {} | FP: {}'.format(tn, fp))
    print('FN: {} | TP: {}'.format(fn, tp))
    print(classification_report(y_true, y_predict))

# calculate ground truth
def getTrueLabel(df):
    horseWin = []
    horseRankTop3 = []
    horseRankTop50Percent = []
    start = 0
    while start != len(df):
        end = start
        while end < len(df) and df['race_id'][start] == df['race_id'][end]:
            end += 1
        for i in range(start, end):
            horse = df.iloc[i]
            if horse['finishing_position'] == 1:
                horseWin.append(1)
            else:
                horseWin.append(0)
            if horse['finishing_position'] <= 3:
                horseRankTop3.append(1)
            else:
                horseRankTop3.append(0)
            if horse['finishing_position'] < (end-start)/2:
                horseRankTop50Percent.append(1)
            else:
                horseRankTop50Percent.append(0)
        start = end
    return horseWin, horseRankTop3, horseRankTop50Percent

horseWin, horseRankTop3, horseRankTop50Percent = getTrueLabel(df_train)
df_train['HorseWin'] = horseWin
df_train['HorseRankTop3'] = horseRankTop3
df_train['HorseRankTop50Percent'] = horseRankTop50Percent
horseWin, horseRankTop3, horseRankTop50Percent = getTrueLabel(df_test)
df_test['HorseWin'] = horseWin
df_test['HorseRankTop3'] = horseRankTop3
df_test['HorseRankTop50Percent'] = horseRankTop50Percent

In [75]:
# logistic regression CV
lr_model = LogisticRegressionCV(cv=10, class_weight='balanced', refit=False)
# HorseWin
X_train, y_train = getInput(df_train, 'HorseWin')
lr_model.fit(X_train, y_train)
y_predict_train = lr_model.predict(X_train)
X_test, y_test = getInput(df_test, 'HorseWin')
y_predict_test = lr_model.predict(X_test)
printPerformance(y_train, y_predict_train)
printPerformance(y_test, y_predict_test)
# HorseRankTop3
X_train, y_train = getInput(df_train, 'HorseRankTop3')
lr_model.fit(X_train, y_train)
y_predict_train = lr_model.predict(X_train)
X_test, y_test = getInput(df_test, 'HorseRankTop3')
y_predict_test = lr_model.predict(X_test)
printPerformance(y_train, y_predict_train)
printPerformance(y_test, y_predict_test)
# HorseRankTop50Percent
X_train, y_train = getInput(df_train, 'HorseRankTop50Percent')
lr_model.fit(X_train, y_train)
y_predict_train = lr_model.predict(X_train)
X_test, y_test = getInput(df_test, 'HorseRankTop50Percent')
y_predict_test = lr_model.predict(X_test)
printPerformance(y_train, y_predict_train)
printPerformance(y_test, y_predict_test)

TN: 13783 | FP: 7834
FN: 731 | TP: 1152
             precision    recall  f1-score   support

          0       0.95      0.64      0.76     21617
          1       0.13      0.61      0.21      1883

avg / total       0.88      0.64      0.72     23500

TN: 3247 | FP: 2139
FN: 136 | TP: 342
             precision    recall  f1-score   support

          0       0.96      0.60      0.74      5386
          1       0.14      0.72      0.23       478

avg / total       0.89      0.61      0.70      5864

TN: 11082 | FP: 6783
FN: 2184 | TP: 3451
             precision    recall  f1-score   support

          0       0.84      0.62      0.71     17865
          1       0.34      0.61      0.43      5635

avg / total       0.72      0.62      0.65     23500

TN: 2696 | FP: 1738
FN: 363 | TP: 1067
             precision    recall  f1-score   support

          0       0.88      0.61      0.72      4434
          1       0.38      0.75      0.50      1430

avg / total       0.76      0.64    

In [76]:
# Naïve Bayes
