In [64]:
import pandas as pd
import helper_functions as sc

from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor

In [65]:
# shut off warnings
import warnings
warnings.filterwarnings('ignore')

In [66]:
# load data
data_ready = pd.read_csv('data\\bundesliga_2018_ready.csv', index_col=0)

In [67]:
data_ready.columns

Index(['round', 'goal_difference_1', 'form_1', 'form_weighted_1',
       'form_possession_1', 'form_pass_acc_1', 'form_shot_acc_1',
       'form_duels_1', 'form_offense_1', 'goal_difference_2', 'form_2',
       'form_weighted_2', 'form_possession_2', 'form_pass_acc_2',
       'form_shot_acc_2', 'form_duels_2', 'form_offense_2', 'home_away',
       'match_id', 'target', 'target_possession', 'team_1_B', 'team_1_C',
       'team_1_D', 'team_2_B', 'team_2_C', 'team_2_D'],
      dtype='object')

## Ball-possession

In [68]:
X_possession = data_ready[['team_1_B', 'team_1_C', 'team_1_D',
                          'form_pass_acc_1', 'form_pass_acc_2', 'form_possession_1','form_possession_2',
                          'team_1_B', 'team_1_C', 'team_1_D',
                          'team_2_B', 'team_2_C', 'team_2_D',
                          'home_away']]
#team_1 = pd.get_dummies(X_possession['team_1'], prefix = 'team_1_', drop_first = True)
#team_2 = pd.get_dummies(X_possession['team_2'], prefix = 'team_2_', drop_first = True)
#X_possession = (X_possession.join(team_1)
#                            .join(team_2)
#                            .drop(['team_1', 'team_2'], axis=1))
y_possession = data_ready['target_possession']
X_possession.head()

Unnamed: 0,team_1_B,team_1_C,team_1_D,form_pass_acc_1,form_pass_acc_2,form_possession_1,form_possession_2,team_1_B.1,team_1_C.1,team_1_D.1,team_2_B,team_2_C,team_2_D,home_away
0,0,1,0,81.0,78.0,48.0,49.0,0,1,0,0,0,0,1
1,0,0,0,78.0,81.0,49.0,48.0,0,0,0,0,1,0,0
2,0,1,0,78.0,75.5,48.0,45.0,0,1,0,0,0,1,1
3,0,0,1,75.5,78.0,45.0,48.0,0,0,1,0,1,0,0
4,0,0,1,81.5,80.0,50.5,51.5,0,0,1,0,0,0,1


In [69]:
######################################
# model possession - neural network  #
######################################

mdl_mlp = MLPRegressor(hidden_layer_sizes = (50, 50), activation='relu', alpha=0.0001, solver='lbfgs',
                        batch_size=20, learning_rate='invscaling', learning_rate_init=0.001)

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (50,), (100,), (150,), (100, 100), (100,100,100)],
    'alpha': [0.0001, 0.001, 0.01, 1, 10]
}]

search = GridSearchCV(mdl_mlp, param_grid, cv=kfold, 
                      scoring = 'r2',
                      return_train_score=True, iid=False)

search.fit(X_possession, y_possession)
mdl_mlp = search.best_estimator_

print(mdl_mlp)
print(search.best_score_)

MLPRegressor(activation='relu', alpha=10, batch_size=20, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
0.409640582685


In [70]:
possession = mdl_mlp.predict(X_possession)

## Result prediction models

In [71]:
# add predicted possession
data_ready['possession'] = possession
# get feature and target dataframe
X = data_ready.drop(['target', 'target_possession', 'round', 'match_id'], axis= 1) #'team_1', 'team_2',
y = data_ready['target']
features = list(X.columns)
X.head()

Unnamed: 0,goal_difference_1,form_1,form_weighted_1,form_possession_1,form_pass_acc_1,form_shot_acc_1,form_duels_1,form_offense_1,goal_difference_2,form_2,...,form_duels_2,form_offense_2,home_away,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D,possession
0,0,1.0,2.0,48.0,81.0,0.25,47.0,1.0,3,3.0,...,42.0,4.0,1,0,1,0,0,0,0,81.071727
1,3,3.0,12.0,49.0,78.0,0.8,42.0,4.0,0,1.0,...,47.0,1.0,0,0,0,0,0,1,0,78.92724
2,0,1.5,3.0,48.0,78.0,0.315,45.0,2.0,-4,0.0,...,52.0,0.5,1,0,1,0,0,0,1,78.798787
3,-4,0.0,0.0,45.0,75.5,0.085,52.0,0.5,0,1.5,...,45.0,2.0,0,0,0,1,0,1,0,74.792901
4,-4,0.0,0.0,50.5,81.5,0.165,52.5,0.5,3,3.0,...,48.5,2.5,1,0,0,1,0,0,0,79.668049


In [72]:
# Normalization
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)
X.columns = features

## get train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1 , test_size=0.2)
X_train.shape

# setup model list
model_list = []

In [73]:
#########################################
# Model 1 - logistic regression - Ridge #
#########################################

mdl_ridge = LogisticRegression(penalty='l2', solver='liblinear', max_iter=100, multi_class = 'auto') # Ridge

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_ridge, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)
search.fit(X_train, y_train)
mdl_ridge = search.best_estimator_

print(mdl_ridge)
print(search.best_score_)

if search.best_score_ > 0.5:
    model_list += mdl_ridge

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
0.461052631579


In [74]:
##################################
# Model 2 - k-neirest neighbors ##
##################################

mdl_knn = KNeighborsClassifier()  

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_neighbors': [i for i in range(1,100)]
}]

search = GridSearchCV(mdl_knn, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_knn = search.best_estimator_

print(mdl_knn)
print(search.best_score_)

if search.best_score_ > 0.5:
    model_list.append(mdl_knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=18, p=2,
           weights='uniform')
0.562105263158


In [75]:
###########################
# model 3 - random forest #
###########################

mdl_rf = RandomForestClassifier(min_samples_leaf=5, random_state=1, n_jobs=-1)

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_estimators': [500, 1000],
    'max_depth': [i for i in range(5,20)],
    'max_features': ['sqrt', 'log2']
}]

search = GridSearchCV(mdl_rf, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_rf = search.best_estimator_

print(mdl_rf)
print(search.best_score_)

feature_importance = (pd.DataFrame({'features': X_train.columns,
                                    'importance': mdl_rf.feature_importances_})
                        .sort_values('importance', ascending=False))
print(feature_importance)

if search.best_score_ > 0.5:
    model_list.append(mdl_rf)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=8, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
0.498947368421
             features  importance
16          home_away    0.085622
8   goal_difference_2    0.066528
11  form_possession_2    0.065249
23         possession    0.064373
3   form_possession_1    0.060480
9              form_2    0.060412
12    form_pass_acc_2    0.057497
6        form_duels_1    0.051790
0   goal_difference_1    0.051224
14       form_duels_2    0.047911
4     form_pass_acc_1    0.047768
2     form_weighted_1    0.047466
1              form_1    0.046939
10    form_weighted_2    0.046031
15     form_offense_2    0.042815
7      form_offense_1    0.042363
13

In [76]:
#############################
# model 4 - neural network  #
#############################

mdl_mlp = MLPClassifier(hidden_layer_sizes = (50, 50), activation='relu', alpha=0.0001, solver='lbfgs',
                        batch_size=20, learning_rate='invscaling', learning_rate_init=0.001)

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (50,), (100,), (150,), (100, 100), (100,100,100)],
    'alpha': [0.0001, 0.001, 0.01, 1, 10]
}]

search = GridSearchCV(mdl_mlp, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_mlp = search.best_estimator_

print(mdl_mlp)
print(search.best_score_)

if search.best_score_ > 0.5:
    model_list.append(mdl_mlp)

MLPClassifier(activation='relu', alpha=1, batch_size=20, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10,), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
0.589473684211


In [77]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate

mdl_bay = GaussianNB()

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

cv_result = cross_validate(mdl_bay, X_train, y_train, cv=kfold, scoring='accuracy')
mdl_bay.fit(X_train, y_train)

print(np.mean(cv_result['test_score']))

if np.mean(cv_result['test_score']) > 0.5:
    model_list.append(mdl_mlp)

0.370526315789


## Combine all models in ensemble

In [78]:
model_list

[KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
            metric_params=None, n_jobs=None, n_neighbors=18, p=2,
            weights='uniform'),
 MLPClassifier(activation='relu', alpha=1, batch_size=20, beta_1=0.9,
        beta_2=0.999, early_stopping=False, epsilon=1e-08,
        hidden_layer_sizes=(10,), learning_rate='invscaling',
        learning_rate_init=0.001, max_iter=200, momentum=0.9,
        n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
        random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
        validation_fraction=0.1, verbose=False, warm_start=False)]

In [79]:
#################################################
# Ensemble - setup logistic regression - lasso  #
#################################################

# get features
X_ensemble = sc.get_ensemble_x(X_train, model_list)

# train logistic regression ensemble
mdl_lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100, multi_class = 'auto')

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [3000, 2000, 1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_lasso, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_ensemble, y_train.reset_index(drop=True))
mdl_ensemble = search.best_estimator_

print(mdl_ensemble)
print(search.best_score_)

LogisticRegression(C=3000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
0.749473684211


In [80]:
############################
# TEST model performance  ##
############################
# add models to list
# model_list = [mdl_mlp, mdl_rf, mdl_knn, mdl_ridge]

X_ensemble = sc.get_ensemble_x(X_test, model_list)

coefficients = pd.DataFrame({'coefficients: ': list(X_ensemble.columns),
                             'win': mdl_ensemble.coef_[0],
                             'draw':mdl_ensemble.coef_[1],
                             'loose': mdl_ensemble.coef_[2]})

y_hat_ensemble = mdl_ensemble.predict(X_ensemble)

print('final accuracy of ensemble on test set = \n', accuracy_score(y_test, y_hat_ensemble))
print(confusion_matrix(y_test, y_hat_ensemble))  
print(classification_report(y_test, y_hat_ensemble)) 
coefficients

final accuracy of ensemble on test set = 
 0.605042016807
[[27  8  3]
 [12  7 14]
 [ 4  6 38]]
              precision    recall  f1-score   support

           0       0.63      0.71      0.67        38
           1       0.33      0.21      0.26        33
           3       0.69      0.79      0.74        48

   micro avg       0.61      0.61      0.61       119
   macro avg       0.55      0.57      0.55       119
weighted avg       0.57      0.61      0.58       119



Unnamed: 0,coefficients:,win,draw,loose
0,KNeighborsClassifier0,-1.578278,0.076092,1.314754
1,KNeighborsClassifier1,1.385975,-0.965295,0.689473
2,KNeighborsClassifier3,0.730776,0.025496,-0.953327
3,MLPClassifier0,6.136367,-3.792584,-4.302273
4,MLPClassifier1,-6.110925,6.061879,-3.402565
5,MLPClassifier3,-3.369155,-2.835019,3.422239


In [81]:
###############################
# setup threshold  
###############################

y_hat_proba = mdl_ensemble.predict_proba(X_ensemble)
bet = [any(y_hat_proba[i] > 0.8) for i in range(len(y_hat_proba))]

data_test = data_ready.loc[X_test.index,]
data_test['predict'] = y_hat_ensemble
data_test['bet'] = bet

print('number of bets = ', len(data_test[bet]), '\n',
      'accuracy = ', accuracy_score(data_test[bet]['target'], data_test[bet]['predict']), '\n',
      'confusiont matrix = ', '\n','\n',
     confusion_matrix(data_test[bet]['target'], data_test[bet]['predict']))
print(X_test.shape)
data_test.head()


number of bets =  57 
 accuracy =  0.666666666667 
 confusiont matrix =  
 
 [[17  1  1]
 [ 7  0  4]
 [ 2  4 21]]
(119, 24)


Unnamed: 0,round,goal_difference_1,form_1,form_weighted_1,form_possession_1,form_pass_acc_1,form_shot_acc_1,form_duels_1,form_offense_1,goal_difference_2,...,target_possession,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D,possession,predict,bet
559,Regular Season - 33,7,1.0,2.0,48.6,83.6,0.214,42.6,1.2,21,...,88,0,1,0,0,1,0,83.172082,1,False
347,Regular Season - 21,3,1.4,2.8,47.8,80.6,0.352,51.4,1.8,20,...,76,0,1,0,0,0,0,81.989535,1,False
396,Regular Season - 24,-13,0.6,1.2,51.6,79.0,0.324,47.0,1.2,32,...,83,0,1,0,0,0,0,83.118003,0,True
499,Regular Season - 29,-7,0.0,0.0,52.8,82.6,0.2,51.4,0.8,21,...,81,0,1,0,1,0,0,80.880639,3,False
520,Regular Season - 30,3,1.4,3.4,53.6,81.2,0.224,47.6,1.8,23,...,78,0,1,0,1,0,0,79.248862,0,True
