In [None]:
from keras.layers import BatchNormalization, Dense, Input, Dropout
from keras.models import Model
from keras import backend as K
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import sqlite3
from time import time
import tensorflow as tf
import keras
import scikitplot as skplt

## Preprocessing

In [None]:
def get_match_label(match):
    ''' Label a match as "Win", "Draw" or "Defeat" from the home side's view.

    Reads 'home_team_goal', 'away_team_goal' and 'match_api_id' from `match`
    and returns a one-row record (as a Series) holding 'match_api_id' and,
    when the goal counts can be compared, 'label'.
    '''

    goals_for = match['home_team_goal']
    goals_against = match['away_team_goal']

    # Single-row frame that carries the match id next to its label
    record = pd.DataFrame()
    record.loc[0, 'match_api_id'] = match['match_api_id']

    # The comparisons are mutually exclusive; when none holds (e.g. a NaN
    # goal count) no 'label' entry is written at all
    outcome = None
    if goals_for > goals_against:
        outcome = "Win"
    elif goals_for == goals_against:
        outcome = "Draw"
    elif goals_for < goals_against:
        outcome = "Defeat"

    if outcome is not None:
        record.loc[0, 'label'] = outcome

    return record.loc[0]
    
def get_fifa_stats(match, player_stats):
    ''' Collect the latest FIFA overall rating of every player in a match.

    For each of the 22 line-up slots (home_player_1..11, away_player_1..11)
    the newest row of `player_stats` dated strictly before the match date is
    looked up and its 'overall_rating' is taken. Dates are compared with `<`
    exactly as they are stored.

    Parameters:
        match: one row of the match table; must provide 'match_api_id',
            'date' and the 22 '*_player_*' id fields.
        player_stats: frame with 'player_api_id', 'date' and
            'overall_rating' columns.

    Returns:
        pd.Series with one '<slot>_overall_rating' entry per slot followed by
        'match_api_id'. A slot whose player id is missing, or whose player has
        no rating recorded before the match, gets the placeholder rating 0.
    '''

    match_id = match.match_api_id
    date = match['date']
    players = ["{}_player_{}".format(side, number)
               for side in ("home", "away")
               for number in range(1, 12)]

    # Collected in a dict (insertion-ordered) instead of growing a frame
    # column by column with pd.concat
    ratings = {}

    for player in players:

        player_id = match[player]
        rating = 0

        if not pd.isnull(player_id):
            stats = player_stats[player_stats.player_api_id == player_id]
            earlier = stats[stats.date < date]

            # Previously an empty selection raised KeyError on `.loc[0, ...]`;
            # it now falls back to the same placeholder as a missing player
            if not earlier.empty:
                newest_first = earlier.sort_values(by = 'date', ascending = False)
                rating = newest_first['overall_rating'].iloc[0]

        ratings["{}_overall_rating".format(player)] = rating

    ratings['match_api_id'] = match_id

    # name = 0 mirrors the row label the former `.loc[0]` lookup produced
    return pd.Series(ratings, name = 0)
      
def get_fifa_data(matches, player_stats, path = None, data_exists = False):
    ''' Return the per-match FIFA ratings, either cached or freshly computed.

    With data_exists == True the frame is unpickled from `path`; otherwise
    get_fifa_stats is applied to every row of `matches` and the elapsed time
    is printed.
    '''

    # Cached result requested: load it and skip the expensive computation.
    # NOTE(review): unpickling can execute arbitrary code - only point `path`
    # at files you produced yourself.
    if data_exists == True:
        return pd.read_pickle(path)

    print("Collecting fifa data for each match...")
    started = time()

    def stats_for(row):
        return get_fifa_stats(row, player_stats)

    collected = matches.apply(stats_for, axis = 1)

    finished = time()
    print("Fifa data collected in {:.1f} minutes".format((finished - started)/60))

    return collected

def get_overall_fifa_rankings(fifa, get_overall = False):
    ''' Select which FIFA columns are used as features.

    get_overall == True: keep the columns whose name contains
    'overall_rating' and attach 'match_api_id'.
    Otherwise: keep everything except columns whose name contains 'date_stat'.
    The input frame itself is not modified.
    '''

    column_names = fifa.columns

    if get_overall == True:
        # Rating columns only, with the match id re-attached for later merges
        ratings = fifa.loc[:, column_names.str.contains('overall_rating')]
        ratings.loc[:, 'match_api_id'] = fifa.loc[:, 'match_api_id']
        return ratings

    # Drop the columns that only record when a stat was taken
    stat_date_columns = column_names[column_names.str.contains('date_stat')]
    return fifa.drop(stat_date_columns, axis = 1)

def get_last_matches(matches, date, team, x = 10):
    ''' Return the x most recent matches a team played before `date`.

    A match counts for the team when it appears as home or as away side.
    The result is sorted by 'date', newest first, and holds at most x rows.
    '''

    as_home = matches['home_team_api_id'] == team
    as_away = matches['away_team_api_id'] == team
    played = matches[as_home | as_away]

    # Only matches strictly before the reference date are eligible
    before = played[played.date < date]
    newest_first = before.sort_values(by = 'date', ascending = False)

    return newest_first.iloc[0:x, :]
    
def get_last_matches_against_eachother(matches, date, home_team, away_team, x = 10):
    ''' Return the x most recent meetings of two teams before `date`.

    Both pairings are considered (either side may have been the home team).
    The result is sorted by 'date', newest first, and holds at most x rows;
    fewer are returned when the teams met less often.
    '''

    home_ids = matches['home_team_api_id']
    away_ids = matches['away_team_api_id']

    #Find matches of both teams, in either home/away arrangement
    home_matches = matches[(home_ids == home_team) & (away_ids == away_team)]
    away_matches = matches[(home_ids == away_team) & (away_ids == home_team)]
    total_matches = pd.concat([home_matches, away_matches])

    #Get last x matches. Positional slicing clips to the rows available, so
    #no fallback is needed; the former bare `except:` could never trigger for
    #a short frame and would have hidden unrelated errors.
    earlier = total_matches[total_matches.date < date]
    last_matches = earlier.sort_values(by = 'date', ascending = False).iloc[0:x,:]

    #Return data
    return last_matches
    
def get_goals(matches, team):
    ''' Total goals a specific team scored over a set of matches. '''

    played_at_home = matches.home_team_api_id == team
    played_away = matches.away_team_api_id == team

    # Goals scored are the home goals of home games plus the away goals of away games
    scored_at_home = int(matches.loc[played_at_home, 'home_team_goal'].sum())
    scored_away = int(matches.loc[played_away, 'away_team_goal'].sum())

    return scored_at_home + scored_away

def get_goals_conceided(matches, team):
    ''' Total goals a specific team conceded over a set of matches. '''

    played_at_home = matches.home_team_api_id == team
    played_away = matches.away_team_api_id == team

    # Goals conceded are the opponent's tally: home goals when the team was
    # away, away goals when the team was at home
    conceded_away = int(matches.loc[played_away, 'home_team_goal'].sum())
    conceded_at_home = int(matches.loc[played_at_home, 'away_team_goal'].sum())

    return conceded_away + conceded_at_home

def get_wins(matches, team):
    ''' Number of matches a specific team won within a set of matches. '''

    home_goals = matches.home_team_goal
    away_goals = matches.away_team_goal

    # A win is a game where the team's own side scored strictly more
    won_at_home = (matches.home_team_api_id == team) & (home_goals > away_goals)
    won_away = (matches.away_team_api_id == team) & (away_goals > home_goals)

    home_wins = int(home_goals[won_at_home].count())
    away_wins = int(away_goals[won_away].count())

    return home_wins + away_wins
    
def get_match_features(match, matches, x = 10):
    ''' Create match specific features for a given match.

    Parameters:
        match: row of the match table ('date', 'home_team_api_id',
            'away_team_api_id', 'match_api_id', 'league_id' are read).
        matches: full match table used to look up earlier games.
        x: number of most recent games per team that make up its form.
           The head-to-head history always uses the last 3 meetings.

    Returns:
        One-row record (as a Series) with the id columns, the goal difference
        and win count of each team over its last x games, and the wins of
        each side in the recent head-to-head games.
    '''

    #Define variables
    date = match.date
    home_team = match.home_team_api_id
    away_team = match.away_team_api_id

    #Get last x matches of home and away team
    #(x used to be hard-coded to 10 here, so the parameter had no effect)
    matches_home_team = get_last_matches(matches, date, home_team, x = x)
    matches_away_team = get_last_matches(matches, date, away_team, x = x)

    #Get last 3 matches of both teams against each other
    last_matches_against = get_last_matches_against_eachother(matches, date, home_team, away_team, x = 3)

    #Create goal variables
    home_goals = get_goals(matches_home_team, home_team)
    away_goals = get_goals(matches_away_team, away_team)
    home_goals_conceided = get_goals_conceided(matches_home_team, home_team)
    away_goals_conceided = get_goals_conceided(matches_away_team, away_team)

    #Define result data frame
    result = pd.DataFrame()

    #Define ID features
    result.loc[0, 'match_api_id'] = match.match_api_id
    result.loc[0, 'league_id'] = match.league_id

    #Create match features
    result.loc[0, 'home_team_goals_difference'] = home_goals - home_goals_conceided
    result.loc[0, 'away_team_goals_difference'] = away_goals - away_goals_conceided
    result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team, home_team)
    result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team, away_team)
    result.loc[0, 'games_against_won'] = get_wins(last_matches_against, home_team)
    result.loc[0, 'games_against_lost'] = get_wins(last_matches_against, away_team)

    #Return match features
    return result.loc[0]

def convert_odds_to_prob(match_odds):
    ''' Re-pack bookkeeper odds into the standard column layout.

    NOTE: despite its name this function returns the odds unchanged - no
    probabilities are produced. The previous body computed 1/odds values and
    then discarded them; that dead computation has been removed. The output
    is deliberately left as raw odds because get_data and count_profit in
    this file read these columns as betting odds.

    Parameters:
        match_odds: frame with 'match_api_id', 'bookkeeper', 'Win', 'Draw'
            and 'Defeat' columns.

    Returns:
        New frame with exactly those five columns, in that order, indexed
        like `match_odds`.
    '''

    probs = pd.DataFrame()

    #Copy meta data and odds column by column into the output format
    for column in ('match_api_id', 'bookkeeper', 'Win', 'Draw', 'Defeat'):
        probs.loc[:, column] = match_odds.loc[:, column]

    #Return odds and meta data
    return probs
    
def get_bookkeeper_data(matches, bookkeepers, horizontal = True):
    ''' Aggregates bookkeeper data for all matches and bookkeepers.

    Parameters:
        matches: match table; must contain 'match_api_id' and, per
            bookkeeper, at least three columns whose names start with the
            bookkeeper code. The first three are taken as Win, Draw and
            Defeat odds (presumably home/draw/away in that order - confirm
            against the table schema).
        bookkeepers: iterable of bookkeeper codes (strings).
        horizontal: True  -> one row per match, columns '<code>_Win',
                             '<code>_Draw', '<code>_Defeat' per bookkeeper
                             plus 'match_api_id';
                    False -> one row per (bookkeeper, match) with columns
                             'Win', 'Draw', 'Defeat', 'bookkeeper',
                             'match_api_id'.

    Raises:
        ValueError: fewer than three columns belong to a bookkeeper.
    '''

    outcome_names = ['Win', 'Draw', 'Defeat']
    frames = []

    #Loop through bookkeepers
    for bookkeeper in bookkeepers:

        #Find columns of this bookkeeper. Prefix matching is used because a
        #substring match lets e.g. 'WH' also pick up the 'BWH' column.
        own_columns = matches.columns.str.startswith(bookkeeper)
        temp_data = matches.loc[:, own_columns].copy()
        if temp_data.shape[1] < 3:
            raise ValueError("Expected at least 3 odds columns for bookkeeper {!r}, found {}".format(
                bookkeeper, temp_data.shape[1]))

        temp_data['bookkeeper'] = str(bookkeeper)
        temp_data['match_api_id'] = matches['match_api_id']

        #Rename odds columns and convert to numeric. The column index is
        #replaced as a whole instead of mutating `columns.values` in place.
        temp_data.columns = outcome_names + list(temp_data.columns[3:])
        for outcome in outcome_names:
            temp_data[outcome] = pd.to_numeric(temp_data[outcome])

        #Check if data should be aggregated horizontally
        if(horizontal == True):

            #Bring data into the standard layout and drop the meta columns
            temp_data = convert_odds_to_prob(temp_data)
            temp_data = temp_data.drop(['match_api_id', 'bookkeeper'], axis = 1)

            #Rename columns with bookkeeper names
            temp_data = temp_data.rename(
                columns = {outcome: bookkeeper + "_" + outcome for outcome in outcome_names})

        frames.append(temp_data)

    #Aggregate once at the end; DataFrame.append no longer exists in pandas 2
    if not frames:
        bk_data = pd.DataFrame()
    elif(horizontal == True):
        bk_data = pd.concat(frames, axis = 1)
    else:
        bk_data = pd.concat(frames, ignore_index = True)

    #If horizontal add match api id to the returned data (it used to be
    #written to the last bookkeeper's temporary frame and was lost)
    if(horizontal == True):
        bk_data['match_api_id'] = matches['match_api_id']

    #Return bookkeeper data
    return bk_data

def get_bookkeeper_probs(matches, bookkeepers, horizontal = False):
    ''' Collect bookkeeper data in vertical layout and pass it through convert_odds_to_prob.

    NOTE: `horizontal` is accepted but not used; the vertical layout is
    always requested from get_bookkeeper_data.
    '''

    stacked_odds = get_bookkeeper_data(matches, bookkeepers, horizontal = False)

    return convert_odds_to_prob(stacked_odds)



def create_feables(matches, fifa, bookkeepers, get_overall = False, horizontal = True, x = 10, verbose = True):
    ''' Create and aggregate features and labels for all matches.

    Parameters:
        matches: match table.
        fifa: per-match FIFA data (see get_fifa_data).
        bookkeepers: bookkeeper codes whose odds become features.
        get_overall: forwarded to get_overall_fifa_rankings.
        horizontal: accepted but not used - bookkeeper data is always
            gathered horizontally (one row per match) because it is merged
            on 'match_api_id' below.
        x: number of recent games per team used for the form features.
        verbose: print progress and timing messages.

    Returns:
        Frame of match features, FIFA features, bookkeeper columns and the
        'label' column, merged on 'match_api_id', with rows containing NA
        values dropped.
    '''

    #Get fifa stats features
    fifa_stats = get_overall_fifa_rankings(fifa, get_overall)

    if verbose == True:
        print("Generating match features...")
    start = time()

    #Get match features for all matches. The row argument has its own name so
    #that it no longer shadows `x`, which used to be replaced by a literal 10.
    match_stats = matches.apply(lambda row: get_match_features(row, matches, x = x), axis = 1)

    #Create dummies for league ID feature
    dummies = pd.get_dummies(match_stats['league_id']).rename(columns = lambda league: 'League_' + str(league))
    match_stats = pd.concat([match_stats, dummies], axis = 1)
    match_stats.drop(['league_id'], inplace = True, axis = 1)

    end = time()
    if verbose == True:
        print("Match features generated in {:.1f} minutes".format((end - start)/60))

    if verbose == True:
        print("Generating match labels...")
    start = time()

    #Create match labels
    labels = matches.apply(get_match_label, axis = 1)
    end = time()
    if verbose == True:
        print("Match labels generated in {:.1f} minutes".format((end - start)/60))

    if verbose == True:
        print("Generating bookkeeper data...")
    start = time()

    #Get bookkeeper quotas for all matches and make sure the merge key is present
    bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal = True)
    bk_data.loc[:,'match_api_id'] = matches.loc[:,'match_api_id']
    end = time()
    if verbose == True:
        print("Bookkeeper data generated in {:.1f} minutes".format((end - start)/60))

    #Merges features and labels into one frame
    features = pd.merge(match_stats, fifa_stats, on = 'match_api_id', how = 'left')
    features = pd.merge(features, bk_data, on = 'match_api_id', how = 'left')
    feables = pd.merge(features, labels, on = 'match_api_id', how = 'left')

    #Drop NA values
    feables.dropna(inplace = True)

    #Return preprocessed data
    return feables

In [None]:
# Load the tables used below from the SQLite file.
database = "database.sqlite"
conn = sqlite3.connect(database)
# player_data used to be read from the Team table (a copy of the team_data
# query). NOTE(review): assumes the database has a `Player` table - confirm.
player_data = pd.read_sql("SELECT * FROM Player;", conn)
player_stats_data = pd.read_sql("SELECT * FROM Player_Attributes;", conn)
team_data = pd.read_sql("SELECT * FROM Team;", conn)
match_data = pd.read_sql("SELECT * FROM Match;", conn)

In [None]:
# Columns that must be filled for a match to be kept: ids, date, score and
# all 22 line-up slots. (The name `rows` is historical; these are columns.)
rows = (["country_id", "league_id", "season", "stage", "date", "match_api_id",
         "home_team_api_id", "away_team_api_id", "home_team_goal", "away_team_goal"]
        + ["home_player_{}".format(slot) for slot in range(1, 12)]
        + ["away_player_{}".format(slot) for slot in range(1, 12)])
# Drop, in place, every match with a missing value in any of these columns
match_data.dropna(subset = rows, inplace = True)

In [None]:
# Build the per-match FIFA rating table from scratch (no cached pickle is read).
fifa_data = get_fifa_data(match_data, player_stats_data, data_exists = False)
# Bookkeeper codes; each code is matched against column names of the match table.
bk_cols = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']
# Subset of bookkeepers that is actually passed to feature creation.
bk_cols_selected = ['B365']

In [None]:
# Assemble features and labels for all matches; get_overall = True keeps only
# the overall-rating FIFA columns.
data = create_feables(match_data, fifa_data, bk_cols_selected, get_overall = True)

## Working with 6 classes

In [None]:
def get_data(data, x = 1):
    ''' Split the feature frame into inputs and 9-column betting targets.

    Returns (X, y_full, y):
        X      - data.values without the first column and without the last
                 1 + 3 * (x - 1) columns.
        y_full - one row per match: columns 0-4 flag which bets would have
                 won (0 home win, 1 home win or draw, 2 draw, 3 away win or
                 draw, 4 away win), column 5 stays 0, columns 6-8 copy the
                 last three columns of X (home / draw / away odds).
        y      - the raw last column of data (the outcome labels).
    '''
    values = data.values
    X = values[:, 1:-(1 + 3 * (x - 1))]
    y = values[:, -1]

    # Target columns switched on by each outcome; unknown labels switch on none
    winning_bets = {"Win": (0, 1), "Defeat": (3, 4), "Draw": (1, 2, 3)}

    y_full = np.zeros((X.shape[0], 9))
    for row, outcome in enumerate(y):
        for column in winning_bets.get(outcome, ()):
            y_full[row, column] = 1.0
        y_full[row, 6] = X[row, -3]  # odds of home team
        y_full[row, 7] = X[row, -2]  # odds of draw
        y_full[row, 8] = X[row, -1]  # odds of away team
    return X, y_full, y

In [None]:
# Build the 9-column betting targets and split with scikit-learn's default
# proportions. No random_state is given, so the split differs between runs.
X, y, outcome = get_data(data)
train_x, test_x, train_y, test_y, = train_test_split(X,  y)

In [None]:
def odds_loss(y_true, y_pred):
    ''' Keras loss: negative mean profit of staking according to y_pred.

    y_true columns: 0 home win, 1 home win or draw, 2 draw, 3 away win or
    draw, 4 away win, 5 no bet (not read), 6-8 odds for home / draw / away.
    y_pred holds one weight per bet, in the same order as columns 0-5.
    '''

    def column(k):
        # Width-1 slice so every term keeps a (batch, 1) shape
        return y_true[:, k:k + 1]

    def payoff(hit, net_gain):
        # Net gain when the bet wins, -1 (the stake) when it loses
        return hit * net_gain + (1 - hit) * -1

    odds_win = column(6)
    odds_draw = column(7)
    odds_defeat = column(8)

    gain_loss_vector = K.concatenate([
        payoff(column(0), odds_win - 1),
        # Double-chance bets pay a quarter of the summed odds of both outcomes
        payoff(column(1), (odds_win + odds_draw)/4 - 1),
        payoff(column(2), odds_draw - 1),
        payoff(column(3), (odds_draw + odds_defeat)/4 - 1),
        payoff(column(4), odds_defeat - 1),
        # Not betting neither wins nor loses anything
        K.zeros_like(odds_win),
    ], axis=1)

    return -1 * K.mean(K.sum(gain_loss_vector * y_pred, axis=1))
 
# Quick check of odds_loss on two hand-made samples: a home win with odds
# 2.0 / 3.0 / 3.6 and a draw with odds 5.0 / 3.0 / 1, against two 6-way
# stake distributions.
# NOTE: `true` and `pred` remain defined as these toy tensors for later cells.
true = K.variable(np.array([[1, 1, 0, 0, 0, 0, 2.0, 3.0, 3.6], [0, 1, 1, 1, 0, 0, 5.0, 3.0, 1]]), dtype='float32')
pred = K.variable(np.array([[0.6, 0.1, 0.2, 0.05, 0.05, 0.0], [0, 0.1, 0.9, 0, 0, 0]]), dtype='float32')

K.eval(odds_loss(true, pred))

In [None]:
def count_profit_correctly(true, predicitons, x):
    ''' Average profit per match of one-hot bets over the 6 bet classes.

    Parameters:
        true: per-match flags in the layout produced by get_data (0 home win,
            1 home win or draw, 2 draw, 3 away win or draw, 4 away win).
        predicitons: per-match bets in the same 6-column layout; an entry
            equal to 1 marks the chosen bet. (The misspelled name is kept so
            existing keyword calls keep working.)
        x: feature matrix whose last three columns are the home / draw /
            away odds.

    Returns:
        Total profit divided by the number of matches; 0.0 for no matches.
        A double-chance bet pays half the odds of the outcome that occurred.
    '''
    # The body used to read a global named `predictions` instead of the
    # (misspelled) argument; bind the argument under that name.
    predictions = predicitons

    n_matches = true.shape[0]
    if n_matches == 0:
        return 0.0

    win = x[:, -3]
    draw = x[:, -2]
    defe = x[:, -1]
    profit = 0
    for i in range(n_matches):
        bet = predictions[i]
        outcome = true[i]
        if bet[1] == 1:
            # home win or draw
            if outcome[0] == 1:
                profit += 0.5 * win[i] - 1
            elif outcome[2] == 1:
                profit += 0.5 * draw[i] - 1
            else:
                profit -= 1
        elif bet[3] == 1:
            # away win or draw
            if outcome[4] == 1:
                profit += 0.5 * defe[i] - 1
            elif outcome[2] == 1:
                profit += 0.5 * draw[i] - 1
            else:
                profit -= 1
        elif bet[0] == 1:
            # home win
            profit += win[i] - 1 if outcome[0] == 1 else -1
        elif bet[2] == 1:
            # draw
            profit += draw[i] - 1 if outcome[2] == 1 else -1
        elif bet[4] == 1:
            # away win
            profit += defe[i] - 1 if outcome[4] == 1 else -1
        # any other row is "no bet": nothing won, nothing lost
    return float(profit/n_matches)


In [None]:
def get_model(input_dim, output_dim, base=1000, multiplier=0.25, p=0.2):
    ''' Build and compile the betting network trained with odds_loss.

    Architecture: batch norm -> dropout(p) -> dense(base, relu) -> batch norm
    -> dropout(p) -> dense(int(base * multiplier), relu) -> softmax over
    output_dim units. Compiled with Adam at learning rate 1e-5.
    '''
    first_width = base
    second_width = int(first_width * multiplier)

    inputs = Input(shape=(input_dim,))

    hidden = BatchNormalization()(inputs)
    hidden = Dropout(p)(hidden)
    hidden = Dense(first_width, activation='relu')(hidden)

    hidden = BatchNormalization()(hidden)
    hidden = Dropout(p)(hidden)
    hidden = Dense(second_width, activation='relu')(hidden)

    outputs = Dense(output_dim, activation='softmax')(hidden)

    model = Model(inputs=inputs, outputs=outputs)
    optimizer = keras.optimizers.Adam(learning_rate=0.00001)
    model.compile(optimizer=optimizer, loss=odds_loss)
    return model

In [None]:
# Train the 6-output betting model (hidden widths 1000 and 800, dropout 0.1).
# Training stops after 5 epochs without improvement and the best model so far
# is written to 'odds_loss.hdf5'.
# NOTE(review): the test split doubles as validation data for early stopping
# and checkpointing, so results on it are likely optimistic.
model = get_model(train_x.shape[1], 6, 1000, 0.8, 0.1)
history = model.fit(train_x, train_y, validation_data=(test_x, test_y), shuffle = True,
          epochs=50, batch_size=100, callbacks=[EarlyStopping(patience=5),
                                                ModelCheckpoint('odds_loss.hdf5',
                                                                save_best_only=True)])

In [None]:
# Softmax outputs of the 6-class model for the held-out matches.
predictions = model.predict(test_x)

In [None]:
# One-hot encode the strongest of the bet classes for every match.
# `predictions2` was used here without ever being defined; it is now created
# from `predictions`, whose raw model output is left untouched.
# NOTE(review): confirm that `predictions` is the intended source.
predictions2 = np.zeros_like(predictions)
predictions2[np.arange(predictions.shape[0]), np.argmax(predictions, axis=1)] = 1

In [None]:
# NOTE(review): at this point `test_y` is the 9-column target matrix from
# get_data and `pred` is the 2-row toy tensor from the odds_loss check, not
# model output. Presumably this was meant to compare class labels derived
# from the targets with the model's chosen bets - confirm and fix the inputs.
skplt.metrics.plot_confusion_matrix(test_y, pred,  normalize=True, )

## 3 classes

In [None]:
def get_data3_with_no_odds(data, x = 1):
    ''' Split the feature frame into inputs and one-hot 3-class targets.

    Returns (X, y_full, y): X is data.values without the first column and
    without the last 1 + 3 * (x - 1) columns; y_full is one-hot with column
    0 = "Win", 1 = "Draw", 2 = "Defeat" (all zeros for any other label);
    y is the raw last column of data.
    '''
    values = data.values
    X = values[:, 1:-(1 + 3 * (x - 1))]
    y = values[:, -1]

    column_of = {"Win": 0, "Draw": 1, "Defeat": 2}

    y_full = np.zeros((X.shape[0], 3))
    for row, outcome in enumerate(y):
        column = column_of.get(outcome)
        if column is not None:
            y_full[row, column] = 1
    return X, y_full, y

In [None]:
# Rebuild the data with plain one-hot targets and hold out 25% for testing.
# No random_state is given, so the split differs between runs.
X, y, outcome = get_data3_with_no_odds(data)
train_x, test_x, train_y, test_y, = train_test_split(X,  y, test_size = 0.25)

In [None]:
def get_model3(input_dim, output_dim, base=10000, multiplier=0.25, p=0.2):
    ''' Build and compile the plain classifier trained with cross-entropy.

    Same layer stack as get_model (batch norm, dropout(p), dense(base, relu),
    batch norm, dropout(p), dense(int(base * multiplier), relu), softmax),
    compiled with Adam at learning rate 1e-4 and categorical cross-entropy.
    '''
    narrow_width = int(base * multiplier)

    inputs = Input(shape=(input_dim,))

    # First block works at full width
    net = BatchNormalization()(inputs)
    net = Dropout(p)(net)
    net = Dense(base, activation='relu')(net)

    # Second block is scaled down by `multiplier`
    net = BatchNormalization()(net)
    net = Dropout(p)(net)
    net = Dense(narrow_width, activation='relu')(net)

    outputs = Dense(output_dim, activation='softmax')(net)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer= keras.optimizers.Adam(learning_rate=0.0001),  loss='categorical_crossentropy')
    return model

In [None]:
def count_profit(true, pred, x):
    ''' Average profit per match of staking `pred` on the three outcomes.

    Parameters:
        true: array with one-hot outcomes in columns 0-2 (win, draw, defeat).
        pred: array with the stake placed on each outcome in columns 0-2.
        x: feature matrix whose last three columns are the win / draw /
           defeat odds.

    Returns:
        Sum over matches and outcomes of stake * (odds - 1 if the outcome
        occurred else -1), divided by the number of rows of pred; 0.0 when
        pred has no rows.
    '''
    n_matches = pred.shape[0]
    if n_matches == 0:
        return 0.0

    # Explicit float arrays: the feature matrix may be object-typed, and the
    # old row loop ended in float() of a 1-element array, which NumPy has
    # deprecated.
    stakes = np.asarray(pred[:n_matches, :3], dtype = float)
    occurred = np.asarray(true[:n_matches, :3], dtype = float)
    odds = np.asarray(x[:n_matches, -3:], dtype = float)

    # Net result of a unit stake on each outcome
    payoff = occurred * (odds - 1) - (1 - occurred)

    return float(np.sum(stakes * payoff) / n_matches)

def count_binary_accuracy(y, predictions):
    ''' Mean, over matches, of the prediction weight placed on the true class.

    Only columns 0-2 of both arrays are read; with one-hot inputs this is the
    fraction of correct predictions.
    '''
    n_matches = y.shape[0]
    total = 0
    for i in range(n_matches):
        actual = y[i]
        guess = predictions[i]
        total = total + actual[0] * guess[0] + actual[1] * guess[1] + actual[2] * guess[2]
    return float(total/n_matches)

In [None]:
# Train the 3-class classifier (hidden widths 1000 and 800, dropout 0.1).
# Training stops after 30 epochs without improvement and the best model so
# far is written to 'odds_loss3.hdf5'.
# NOTE(review): the test split doubles as validation data for early stopping
# and checkpointing, so results on it are likely optimistic.
model = get_model3(train_x.shape[1], 3, 1000, 0.8, 0.1)
history = model.fit(train_x, train_y, validation_data=(test_x, test_y),
          epochs=200, batch_size=50, callbacks=[EarlyStopping(patience=30),
                                                ModelCheckpoint('odds_loss3.hdf5',
                                                                save_best_only=True)])

In [None]:
# Softmax outputs of the 3-class model for the held-out matches.
predictions = model.predict(test_x)

To place a bet on exactly one outcome per match, we convert each prediction into a one-hot vector:

In [None]:
# Replace every probability row, in place, by a one-hot bet on its most
# likely outcome.
for j, i in enumerate(np.argmax(predictions, axis=1)):
    predictions[j] = np.zeros(3)
    predictions[j][i] = 1

In [None]:
# Average profit per held-out match when staking one unit on the predicted outcome.
s = count_profit(test_y, predictions, test_x)
s

## Catboost

In [None]:
def get_data3_with_no_odds(data, x = 1):
    ''' Split the feature frame into inputs and one-hot 3-class targets.

    Same contract as the definition in the 3-class section, which this one
    replaces when the cell is run: X drops the first column and the last
    1 + 3 * (x - 1) columns of data.values, y_full is one-hot with column
    0 = "Win", 1 = "Draw", 2 = "Defeat", and y is the raw label column.
    '''
    table = data.values
    X = table[:, 1:-(1 + 3 * (x - 1))]
    y = table[:, -1]

    class_index = {"Win": 0, "Draw": 1, "Defeat": 2}

    y_full = np.zeros((X.shape[0], 3))
    for position, label in enumerate(y):
        target = class_index.get(label)
        if target is None:
            # Unknown labels leave the row all zeros
            continue
        y_full[position, target] = 1

    return X, y_full, y

def count_profit(true_, pred_, x):
    ''' Average profit per match when betting on predicted class indices.

    This definition replaces the earlier count_profit once the cell is run.

    Parameters:
        true_: actual class index per match (0 win, 1 draw, 2 defeat).
        pred_: predicted class index per match, same coding.
        x: feature matrix whose last three columns are the win / draw /
           defeat odds.

    Returns:
        Profit of staking one unit on each predicted class, divided by the
        number of predictions; 0.0 when there are none.
    '''
    # Expand class indices to one-hot rows
    true = redata_new(true_)
    pred = redata_new(pred_)

    n_matches = pred.shape[0]
    if n_matches == 0:
        return 0.0

    # Explicit float array: the feature matrix may be object-typed, and the
    # old row loop ended in float() of a 1-element array, which NumPy has
    # deprecated.
    odds = np.asarray(x[:n_matches, -3:], dtype = float)
    occurred = true[:n_matches]

    # Net result of a unit stake on each outcome
    payoff = occurred * (odds - 1) - (1 - occurred)

    return float(np.sum(pred * payoff) / n_matches)

def count_binary_accuracy(y_, predictions_):
    ''' Fraction of matches whose predicted class index equals the true one.

    Both inputs are class indices; they are expanded to one-hot rows with
    redata_new and compared column by column.
    '''
    actual = redata_new(y_)
    guessed = redata_new(predictions_)
    n_matches = actual.shape[0]
    total = 0
    for i in range(n_matches):
        a, g = actual[i], guessed[i]
        total = total + a[0] * g[0] + a[1] * g[1] + a[2] * g[2]
    return float(total/n_matches)

def redata(data):
    ''' Collapse one-hot (or score) rows to an (n, 1) float column of argmax indices. '''
    indices = [np.argmax(row) for row in data]
    return np.array(indices, dtype=float).reshape(-1, 1)

def redata_new(data):
    ''' Expand class indices to one-hot rows of width 3.

    Index 0 maps to column 0, index 2 to column 2 and every other value to
    column 1.
    '''
    n_rows = data.shape[0]
    out = np.zeros([n_rows, 3])
    for row in range(n_rows):
        label = data[row]
        column = 0 if label == 0 else (2 if label == 2 else 1)
        out[row][column] = 1
    return out


def relabel(data):
    ''' Translate class indices to display names.

    0 becomes 'Win Home', 2 becomes 'Win Away', anything else 'Draw'.
    '''
    def name_of(label):
        if label == 0:
            return 'Win Home'
        if label == 2:
            return 'Win Away'
        return 'Draw'

    return [name_of(data[i]) for i in range(data.shape[0])]

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Rebuild features and one-hot targets for the CatBoost experiment.
# NOTE: this split is superseded by the split on class indices further below.
X, y, outcome = get_data3_with_no_odds(data)
train_x, test_x, train_y, test_y, = train_test_split(X,  y)

In [None]:
# Collapse the one-hot targets to a column of class indices (0 Win, 1 Draw, 2 Defeat).
Y = redata(y)

In [None]:
# Split again, this time on the class-index targets, holding out 25% for testing.
train_x, test_x, train_y, test_y, = train_test_split(X,  Y, test_size = 0.25)

In [None]:
# Multi-class CatBoost classifier; task_type="GPU" requests training on a GPU
# and 'Accuracy' is tracked as an additional metric.
model = CatBoostClassifier(
    iterations = 2000,
    learning_rate = 0.001,
    depth = 16,
    task_type="GPU",
    loss_function='MultiClass',
    custom_loss = 'Accuracy'
)
# Fit with the held-out split as evaluation set, logging every 50 iterations.
# cat_features lists the feature positions treated as categorical.
# NOTE(review): presumably these positions are the League_* dummy columns -
# confirm against the column order of `data`.
model.fit(
    train_x, train_y,
    cat_features = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    eval_set = (test_x, test_y),
    verbose = 50,
    plot = True
)

In [None]:
# Predicted class for every held-out match (rebinds `pred`).
pred = model.predict(test_x)

In [None]:
# Average profit per held-out match when betting one unit on the predicted class.
count_profit(test_y, pred, test_x)

In [None]:
# Share of held-out matches whose class was predicted correctly.
count_binary_accuracy(test_y, pred)

In [None]:
# Normalized confusion matrix of actual vs. predicted outcome names.
skplt.metrics.plot_confusion_matrix(relabel(test_y), relabel(pred),  normalize=True, )