In [1]:
import pandas as pd
import numpy as np
import helper_functions as sc

from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
data_ready = pd.read_csv('data\\bundesliga_2016_ready.csv', index_col=0)

X_train = data_ready.drop(['target'], axis= 1)
y_train = data_ready['target']
features = list(X_train.columns)

# Normalization
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train)
X_train.columns = features
X_train.head()

  return self.partial_fit(X, y)


Unnamed: 0,goal_difference_1,form_1,form_weighted_1,goal_difference_2,form_2,form_weighted_2,home_away,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D
0,0.413462,1.0,1.0,0.298077,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.298077,0.0,0.0,0.413462,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.365385,1.0,0.75,0.346154,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.346154,0.0,0.0,0.365385,1.0,0.75,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.365385,1.0,0.75,0.346154,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


## train models on last season

In [20]:
# ridge regression
mdl_ridge = LogisticRegression(C=1, penalty='l2', solver='liblinear', max_iter=100, multi_class = 'auto') # Ridge
mdl_ridge.fit(X_train,y_train)

# knn
mdl_knn = KNeighborsClassifier(n_neighbors=29)  
mdl_knn.fit(X_train,y_train)

# randomforest
mdl_rf = RandomForestClassifier(min_samples_leaf=5, random_state=1, n_jobs=-1,
                                n_estimators=1000, max_depth=5)
mdl_rf.fit(X_train,y_train)

# neural network
mdl_mlp = MLPClassifier(hidden_layer_sizes = (10, ), alpha = 10,
                        activation='relu', solver='lbfgs',
                        batch_size=20, learning_rate='invscaling', learning_rate_init=0.001)
mdl_mlp.fit(X_train,y_train)


# ensemble
X_ensemble = sc.get_ensemble_x(X_train, mdl_ridge, mdl_knn, mdl_rf, mdl_mlp)

# train ensemble model
mdl_lasso = LogisticRegression(C = 10, penalty='l1', solver='liblinear',
                               max_iter=100, multi_class = 'auto')
mdl_lasso.fit(X_ensemble, y_train)


LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

## Simulation of next season

In [21]:
data_new = pd.read_csv('data\\bundesliga_2017_ready.csv', index_col=0)
results = pd.DataFrame()

print('starting simulation')

for gameday in data_new['round'].unique():
    gameday_data = data_new[data_new['round'] == gameday]
    
    X = gameday_data.drop(['target', 'round'], axis= 1)
    y = gameday_data['target']
    features = list(X.columns)
    
    # Normalization
    X = scaler.transform(X)
    X = pd.DataFrame(X)
    X.columns = features
    
    X_ensemble = sc.get_ensemble_x(X, mdl_ridge, mdl_knn, mdl_rf, mdl_mlp)
    
    y_hat = mdl_lasso.predict(X_ensemble)
    y_hat_proba = mdl_lasso.predict_proba(X_ensemble)
    bet = [any(y_hat_proba[i] > 0.8) for i in range(len(y_hat_proba))]

    results = results.append(pd.DataFrame({'gameday': list(np.repeat(gameday, len(y))),
                                           'y':y,
                                           'y_hat':y_hat,
                                           'bet': bet}))
    
    # retrain models
    X_train = X_train.append(X)
    y_train = y_train.append(y)
    
    mdl_ridge.fit(X_train, y_train)
    mdl_knn.fit(X_train, y_train)
    mdl_rf.fit(X_train, y_train)
    mdl_mlp.fit(X_train, y_train)
    
    X_ensemble = sc.get_ensemble_x(X_train, mdl_ridge, mdl_knn, mdl_rf, mdl_mlp)
    
    mdl_lasso.fit(X_ensemble, y_train)
    
    print('finished simulating gameday: ', gameday)

starting simulation
finished simulating gameday:  Bundesliga - 1
finished simulating gameday:  Bundesliga - 2
finished simulating gameday:  Bundesliga - 3
finished simulating gameday:  Bundesliga - 4
finished simulating gameday:  Bundesliga - 5
finished simulating gameday:  Bundesliga - 6
finished simulating gameday:  Bundesliga - 7
finished simulating gameday:  Bundesliga - 8
finished simulating gameday:  Bundesliga - 9
finished simulating gameday:  Bundesliga - 10
finished simulating gameday:  Bundesliga - 11
finished simulating gameday:  Bundesliga - 12
finished simulating gameday:  Bundesliga - 13
finished simulating gameday:  Bundesliga - 14
finished simulating gameday:  Bundesliga - 15
finished simulating gameday:  Bundesliga - 16
finished simulating gameday:  Bundesliga - 17
finished simulating gameday:  Bundesliga - 18
finished simulating gameday:  Bundesliga - 19
finished simulating gameday:  Bundesliga - 20
finished simulating gameday:  Bundesliga - 21
finished simulating gam

## Accuracy 

In [22]:
print('final accuracy of ensemble on test set =' , round(accuracy_score(results['y'], results['y_hat']), 4))
print('\n confusion matrix: \n', confusion_matrix(results['y'], results['y_hat']))  
print('\n', classification_report(results['y'], results['y_hat'])) 

results_bet = results[results['bet'] == True]
print('final accuracy of ensemble on test set with threshold = ',
      round(accuracy_score(results_bet['y'], results_bet['y_hat']), 4))
print('number of bets = ', len(results_bet), '\n max bets: ', len(results))

final accuracy of ensemble on test set = 0.7353

 confusion matrix: 
 [[172  31  20]
 [ 29 106  31]
 [ 17  34 172]]

               precision    recall  f1-score   support

           0       0.79      0.77      0.78       223
           1       0.62      0.64      0.63       166
           3       0.77      0.77      0.77       223

   micro avg       0.74      0.74      0.74       612
   macro avg       0.73      0.73      0.73       612
weighted avg       0.74      0.74      0.74       612

final accuracy of ensemble on test set with threshold =  0.902
number of bets =  255 
 max bets:  612
