In [20]:
import pandas as pd
import helper_functions as sc

from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

In [21]:
# shut off warnings
import warnings
warnings.filterwarnings('ignore')

In [22]:
# load data
data_ready = pd.read_csv('data\\bundesliga_2017_ready.csv', index_col=0)

Unnamed: 0,round,goal_difference_1,form_1,form_weighted_1,form_possession_1,form_pass_acc_1,form_shot_acc_1,form_duels_1,form_offense_1,goal_difference_2,...,form_shot_acc_2,form_duels_2,form_offense_2,home_away,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D
0,Bundesliga - 2,-1,0.0,0.0,48.0,84.0,0.0,50.0,0.0,1,...,0.2,49.0,1.0,1,0,1,0,0,1,0
1,Bundesliga - 2,1,3.0,6.0,39.0,70.0,0.2,49.0,1.0,-1,...,0.0,50.0,0.0,0,0,1,0,0,1,0
2,Bundesliga - 2,-2,0.5,1.0,54.5,86.5,0.435,54.0,1.5,1,...,0.415,44.5,1.5,1,0,1,0,1,0,0
3,Bundesliga - 2,1,2.0,6.0,49.0,83.0,0.415,44.5,1.5,-2,...,0.435,54.0,1.5,0,1,0,0,0,1,0
4,Bundesliga - 2,-1,0.5,1.0,45.0,76.0,0.0,52.0,0.0,-2,...,0.165,49.5,0.5,1,0,1,0,0,1,0


## Ball-possession

In [None]:
X_possession = data_ready[['team_1', 'team_2', 'team_1_B', 'team_1_C', 'team_1_D',
                          'form_pass_acc_1', 'form_pass_acc_2', 'form_possession_1','form_possession_2',
                          'team_1_B', 'team_1_C', 'team_1_D',
                          'team_2_B', 'team_2_C', 'team_2_D',
                          'home_away']]
team_1 = pd.get_dummies(X_possession['team_1'], prefix = 'team_1_', drop_first = True)
team_2 = pd.get_dummies(X_possession['team_2'], prefix = 'team_2_', drop_first = True)
X_possession = (X_possession.join(team_1)
                            .join(team_2)
                            .drop(['team_1', 'team_2'], axis=1))
y_possession = data_ready['target_possession']
X_possession.head()

In [None]:
######################################
# model possession - neural network  #
######################################

mdl_mlp = MLPRegressor(hidden_layer_sizes = (50, 50), activation='relu', alpha=0.0001, solver='lbfgs',
                        batch_size=20, learning_rate='invscaling', learning_rate_init=0.001)

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (50,), (100,), (150,), (100, 100), (100,100,100)],
    'alpha': [0.0001, 0.001, 0.01, 1, 10]
}]

search = GridSearchCV(mdl_mlp, param_grid, cv=kfold, 
                      scoring = 'r2',
                      return_train_score=True, iid=False)

search.fit(X_possession, y_possession)
mdl_mlp = search.best_estimator_

print(mdl_mlp)
print(search.best_score_)

In [65]:
possession = mdl_mlp.predict(X_possession)

## Result prediction models

In [74]:
# add predicted possession
data_ready['posession'] = possession
# get feature and target dataframe
X = data_ready.drop(['target', 'target_possession', 'team_1', 'team_2', 'round'], axis= 1)
y = data_ready['target']
features = list(X.columns)
X.head()

Unnamed: 0,goal_difference_1,form_1,form_weighted_1,form_possession_1,form_pass_acc_1,form_shot_acc_1,form_duels_1,form_offense_1,goal_difference_2,form_2,...,form_duels_2,form_offense_2,home_away,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D,posession
0,-1,0.0,0.0,48.0,84.0,0.0,50.0,0.0,1,3.0,...,49.0,1.0,1,0,1,0,0,1,0,83.961162
1,1,3.0,6.0,39.0,70.0,0.2,49.0,1.0,-1,0.0,...,50.0,0.0,0,0,1,0,0,1,0,74.400684
2,-2,0.5,1.0,54.5,86.5,0.435,54.0,1.5,1,2.0,...,44.5,1.5,1,0,1,0,1,0,0,88.917676
3,1,2.0,6.0,49.0,83.0,0.415,44.5,1.5,-2,0.5,...,54.0,1.5,0,1,0,0,0,1,0,83.871326
4,-1,0.5,1.0,45.0,76.0,0.0,52.0,0.0,-2,1.5,...,49.5,0.5,1,0,1,0,0,1,0,75.278856


In [75]:
# Normalization
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)
X.columns = features

## get train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1 , test_size=0.2)
X_train.shape

# setup model list
model_list = []

  return self.partial_fit(X, y)


(475, 24)

In [76]:
#########################################
# Model 1 - logistic regression - Ridge #
#########################################

mdl_ridge = LogisticRegression(penalty='l2', solver='liblinear', max_iter=100, multi_class = 'auto') # Ridge

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_ridge, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)
search.fit(X_train, y_train)
mdl_ridge = search.best_estimator_

print(mdl_ridge)
print(search.best_score_)

if search.best_score_ > 0.5:
    model_list += mdl_ridge

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
0.452631578947


In [77]:
##################################
# Model 2 - k-neirest neighbors ##
##################################

mdl_knn = KNeighborsClassifier()  

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_neighbors': [i for i in range(1,100)]
}]

search = GridSearchCV(mdl_knn, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_knn = search.best_estimator_

print(mdl_knn)
print(search.best_score_)

if search.best_score_ > 0.5:
    model_list += mdl_knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')
0.56


In [78]:
###########################
# model 3 - random forest #
###########################

mdl_rf = RandomForestClassifier(min_samples_leaf=5, random_state=1, n_jobs=-1)

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_estimators': [500, 1000],
    'max_depth': [i for i in range(5,15)],
    'max_features': ['sqrt', 'log2']
}]

search = GridSearchCV(mdl_rf, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_rf = search.best_estimator_

print(mdl_rf)
print(search.best_score_)

feature_importance = (pd.DataFrame({'features': X_train.columns,
                                    'importance': mdl_rf.feature_importances_})
                        .sort_values('importance', ascending=False))
print(feature_importance)

if search.best_score_ > 0.5:
    model_list += mdl_rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
0.488421052632
             features  importance
11  form_possession_2    0.069173
23          posession    0.067495
13    form_shot_acc_2    0.066218
8   goal_difference_2    0.061531
12    form_pass_acc_2    0.061137
5     form_shot_acc_1    0.059933
3   form_possession_1    0.055430
0   goal_difference_1    0.054956
4     form_pass_acc_1    0.054806
6        form_duels_1    0.054668
14       form_duels_2    0.052912
9              form_2    0.049600
16          home_away    0.047801
10    form_weighted_2    0.045092
15     form_offense_2    0.043213
1              form_1    0.043108
2

In [79]:
#############################
# model 4 - neural network  #
#############################

mdl_mlp = MLPClassifier(hidden_layer_sizes = (50, 50), activation='relu', alpha=0.0001, solver='lbfgs',
                        batch_size=20, learning_rate='invscaling', learning_rate_init=0.001)

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (50,), (100,), (150,), (100, 100), (100,100,100)],
    'alpha': [0.0001, 0.001, 0.01, 1, 10]
}]

search = GridSearchCV(mdl_mlp, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_mlp = search.best_estimator_

print(mdl_mlp)
print(search.best_score_)

if search.best_score_ > 0.5:
    model_list += mdl_mlp

MLPClassifier(activation='relu', alpha=1, batch_size=20, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
0.570526315789


In [106]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate

mdl_bay = GaussianNB()

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

cv_result = cross_validate(mdl_bay, X_train, y_train, cv=kfold, scoring='accuracy')
mdl_bay.fit(X_train, y_train)

print(np.mean(cv_result['test_score']))

if np.mean(cv_result['test_score']) > 0.5:
    model_list += mdl_mlp

{'fit_time': array([ 0.01213121,  0.01006985,  0.01277208,  0.02133107,  0.00964499]),
 'score_time': array([ 0.00556946,  0.00400949,  0.01030326,  0.01537848,  0.00715303]),
 'test_score': array([ 0.41052632,  0.37894737,  0.33684211,  0.30526316,  0.35789474]),
 'train_score': array([ 0.40789474,  0.42894737,  0.43157895,  0.42368421,  0.44210526])}

## Combine all models in ensemble

In [98]:
#################################################
# Ensemble - setup logistic regression - lasso  #
#################################################

# get features
X_ensemble = sc.get_ensemble_x(X_train, model_list)

# train logistic regression ensemble
mdl_lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100, multi_class = 'auto')

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [3000, 2000, 1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_lasso, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_ensemble, y_train.reset_index(drop=True))
mdl_ensemble = search.best_estimator_

print(mdl_ensemble)
print(search.best_score_)

LogisticRegression(C=3000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
0.770526315789


In [101]:
############################
# TEST model performance  ##
############################
# add models to list
# model_list = [mdl_mlp, mdl_rf, mdl_knn, mdl_ridge]

X_ensemble = sc.get_ensemble_x(X_test, model_list)

coefficients = pd.DataFrame({'coefficients: ': list(X_ensemble.columns),
                             'win': mdl_ensemble.coef_[0],
                             'draw':mdl_ensemble.coef_[1],
                             'loose': mdl_ensemble.coef_[2]})

y_hat_ensemble = mdl_ensemble.predict(X_ensemble)

print('final accuracy of ensemble on test set = \n', accuracy_score(y_test, y_hat_ensemble))
print(confusion_matrix(y_test, y_hat_ensemble))  
print(classification_report(y_test, y_hat_ensemble)) 
coefficients

final accuracy of ensemble on test set = 
 0.571428571429
[[20 10  8]
 [10 10 10]
 [ 4  9 38]]
              precision    recall  f1-score   support

           0       0.59      0.53      0.56        38
           1       0.34      0.33      0.34        30
           3       0.68      0.75      0.71        51

   micro avg       0.57      0.57      0.57       119
   macro avg       0.54      0.53      0.53       119
weighted avg       0.57      0.57      0.57       119



Unnamed: 0,coefficients:,win,draw,loose
0,MLPClassifier0,2.462184,-0.335152,-3.198042
1,MLPClassifier1,-0.042985,2.759518,-1.853913
2,MLPClassifier3,-1.652956,-1.278413,1.117791
3,KNeighborsClassifier0,3.1504,-4.312125,-1.794209
4,KNeighborsClassifier1,-3.375575,3.345089,-3.410533
5,KNeighborsClassifier3,-2.076278,-2.830589,2.330769


In [102]:
###############################
# setup threshold  
###############################

y_hat_proba = mdl_ensemble.predict_proba(X_ensemble)
bet = [any(y_hat_proba[i] > 0.8) for i in range(len(y_hat_proba))]

data_test = data_ready.loc[X_test.index,]
data_test['predict'] = y_hat_ensemble
data_test['bet'] = bet

print('number of bets = ', len(data_test[bet]), '\n',
      'accuracy = ', accuracy_score(data_test[bet]['target'], data_test[bet]['predict']), '\n',
      'confusiont matrix = ', '\n','\n',
     confusion_matrix(data_test[bet]['target'], data_test[bet]['predict']))
print(X_test.shape)
data_test.head()


number of bets =  55 
 accuracy =  0.672727272727 
 confusiont matrix =  
 
 [[12  2  4]
 [ 3  3  5]
 [ 2  2 22]]
(119, 24)


Unnamed: 0,round,team_1,goal_difference_1,form_1,form_weighted_1,form_possession_1,form_pass_acc_1,form_shot_acc_1,form_duels_1,form_offense_1,...,target_possession,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D,posession,predict,bet
559,Bundesliga - 33,164,-14,1.6,2.8,49.6,76.2,0.24,48.8,1.2,...,69,0,1,0,0,0,0,75.526551,3,False
347,Bundesliga - 21,157,35,3.0,12.0,59.2,84.4,0.39,51.8,3.0,...,85,0,0,0,0,0,1,83.112593,0,True
396,Bundesliga - 24,164,-13,1.2,1.8,38.8,72.6,0.308,48.8,1.4,...,63,0,0,1,0,1,0,74.229099,1,False
499,Bundesliga - 29,172,-8,1.6,3.2,45.6,73.4,0.27,45.2,1.2,...,71,0,1,0,0,0,0,71.2097,3,True
520,Bundesliga - 30,164,-15,1.0,1.6,49.6,77.8,0.174,48.0,0.6,...,78,0,1,0,0,0,1,76.970095,3,True
