In [4]:
import pandas as pd
import helper_functions as sc

from sklearn.preprocessing import MinMaxScaler 
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [8]:
# shut off warnings
import warnings
warnings.filterwarnings('ignore')

In [55]:
# load data
data_ready = pd.read_csv('data\\bundesliga_2016_ready.csv', index_col=0)
X = data_ready.drop(['target'], axis= 1)
y = data_ready['target']
features = list(X.columns)
X.head()

Unnamed: 0,goal_difference_1,form_1,form_weighted_1,goal_difference_2,form_2,form_weighted_2,home_away,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D
0,6,3.0,12.0,-6,0.0,0.0,1,0,0,0,0,0,1
1,-6,0.0,0.0,6,3.0,12.0,0,0,0,1,0,0,0
2,1,3.0,9.0,-1,0.0,0.0,1,1,0,0,0,1,0
3,-1,0.0,0.0,1,3.0,9.0,0,0,1,0,1,0,0
4,1,3.0,9.0,-1,0.0,0.0,1,1,0,0,0,1,0


In [24]:
features_less = ['goal_difference_1', 'form_1', 'form_weighted_1', 'goal_difference_2', 'form_2', 'form_weighted_2',
                 'home_away', 'team_1_B', 'team_1_C', 'team_1_D', 'team_2_B', 'team_2_C', 'team_2_D']

In [56]:
# Normalization
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)
X.columns = features

## get train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1 , test_size=0.2)
X_train.shape

(489, 13)

In [57]:
#########################################
# Model 1 - logistic regression - Ridge #
#########################################

mdl_ridge = LogisticRegression(penalty='l2', solver='liblinear', max_iter=100, multi_class = 'auto') # Ridge

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_ridge, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)
search.fit(X_train, y_train)
mdl_ridge = search.best_estimator_

print(mdl_ridge)
print(search.best_score_)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
0.631874605512


In [58]:
##################################
# Model 2 - k-neirest neighbors ##
##################################

mdl_knn = KNeighborsClassifier()  

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_neighbors': [i for i in range(1,100)]
}]

search = GridSearchCV(mdl_knn, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_knn = search.best_estimator_

print(mdl_knn)
print(search.best_score_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=29, p=2,
           weights='uniform')
0.603219019567


In [59]:
###########################
# model 3 - random forest #
###########################

mdl_rf = RandomForestClassifier(min_samples_leaf=5, random_state=1, n_jobs=-1)

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'n_estimators': [500, 1000],
    'max_depth': [i for i in range(5,15)],
    'max_features': ['sqrt', 'log2']
}]

search = GridSearchCV(mdl_rf, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_rf = search.best_estimator_

print(mdl_rf)
print(search.best_score_)

feature_importance = (pd.DataFrame({'features': X_train.columns,
                                    'importance': mdl_rf.feature_importances_})
                        .sort_values('importance', ascending=False))
print(feature_importance)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)
0.623669261519
             features  importance
4              form_2    0.243271
1              form_1    0.216095
2     form_weighted_1    0.142101
5     form_weighted_2    0.139128
3   goal_difference_2    0.084622
0   goal_difference_1    0.081906
6           home_away    0.029983
11           team_2_C    0.013649
8            team_1_C    0.013012
9            team_1_D    0.011656
12           team_2_D    0.009592
7            team_1_B    0.008770
10           team_2_B    0.006215


In [60]:
#############################
# model 4 - neural network  #
#############################

mdl_mlp = MLPClassifier(hidden_layer_sizes = (50, 50), activation='relu', alpha=0.0001, solver='lbfgs',
                        batch_size=20, learning_rate='invscaling', learning_rate_init=0.001)

kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'hidden_layer_sizes': [(10,), (10,10), (10,10,10), (50,), (100,), (150,), (100, 100), (100,100,100)],
    'alpha': [0.0001, 0.001, 0.01, 1, 10]
}]

search = GridSearchCV(mdl_mlp, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_train, y_train)
mdl_mlp = search.best_estimator_

print(mdl_mlp)
print(search.best_score_)

MLPClassifier(activation='relu', alpha=10, batch_size=20, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100, 100), learning_rate='invscaling',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
0.631853566169


In [61]:
#################################################
# Ensemble - setup logistic regression - lasso  #
#################################################

# get features
X_ensemble = sc.get_ensemble_x(X_train, mdl_ridge, mdl_knn, mdl_rf, mdl_mlp)

# train logistic regression ensemble
mdl_lasso = LogisticRegression(penalty='l1', solver='liblinear', max_iter=100, multi_class = 'auto')

# folds
kfold = KFold(n_splits=5, shuffle = True, random_state = 1)

# gridsearch cv
param_grid = [{
    'C': [3000, 2000, 1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]
}]

search = GridSearchCV(mdl_lasso, param_grid, cv=kfold, 
                      scoring = 'accuracy',
                      return_train_score=True, iid=False)

search.fit(X_ensemble, y_train.reset_index(drop=True))
mdl_ensemble = search.best_estimator_

print(mdl_ensemble)
print(search.best_score_)

LogisticRegression(C=3000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l1', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)
0.791394908479


In [62]:
############################
# TEST model performance  ##
############################

# rename!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
X_ensemble = sc.get_ensemble_x(X_test, mdl_ridge, mdl_knn, mdl_rf, mdl_mlp)

coefficients = pd.DataFrame({'coefficients: ': list(X_ensemble.columns),
                             'win': mdl_ensemble.coef_[0],
                             'draw':mdl_ensemble.coef_[1],
                             'loose': mdl_ensemble.coef_[2]})

y_hat_ensemble = mdl_ensemble.predict(X_ensemble)

print('final accuracy of ensemble on test set = \n', accuracy_score(y_test, y_hat_ensemble))
print(confusion_matrix(y_test, y_hat_ensemble))  
print(classification_report(y_test, y_hat_ensemble)) 
coefficients

final accuracy of ensemble on test set = 
 0.609756097561
[[33 13  8]
 [ 6 15  7]
 [ 5  9 27]]
              precision    recall  f1-score   support

           0       0.75      0.61      0.67        54
           1       0.41      0.54      0.46        28
           3       0.64      0.66      0.65        41

   micro avg       0.61      0.61      0.61       123
   macro avg       0.60      0.60      0.60       123
weighted avg       0.64      0.61      0.62       123



Unnamed: 0,coefficients:,win,draw,loose
0,rf_0,6.086667,-7.186963,-4.414383
1,rf_1,-11.958909,29.18293,-16.338491
2,rf_3,-5.518061,-10.309337,10.544481
3,ridge_0,-7.672097,-1.991763,10.707208
4,ridge_1,0.517813,2.155514,1.728506
5,ridge_3,13.182106,0.518852,-7.987559
6,knn_0,-0.606418,-0.455724,-0.724177
7,knn_1,1.458766,-8.134102,3.979462
8,knn_3,0.226655,0.996273,-2.010228
9,mlp_0,5.728086,3.299817,-9.812164


In [63]:
###############################
# setup threshold  
###############################

y_hat_proba = mdl_ensemble.predict_proba(X_ensemble)
bet = [any(y_hat_proba[i] > 0.8) for i in range(len(y_hat_proba))]

data_test = data_ready.loc[X_test.index,]
data_test['predict'] = y_hat_ensemble
data_test['bet'] = bet

print('number of bets = ', len(data_test[bet]), '\n',
      'accuracy = ', accuracy_score(data_test[bet]['target'], data_test[bet]['predict']), '\n',
      'confusiont matrix = ', '\n','\n',
     confusion_matrix(data_test[bet]['target'], data_test[bet]['predict']))
print(X_test.shape)
data_test.head()


number of bets =  61 
 accuracy =  0.803278688525 
 confusiont matrix =  
 
 [[28  2  3]
 [ 2  2  3]
 [ 2  0 19]]
(123, 13)


Unnamed: 0,goal_difference_1,form_1,form_weighted_1,goal_difference_2,form_2,form_weighted_2,home_away,target,team_1_B,team_1_C,team_1_D,team_2_B,team_2_C,team_2_D,predict,bet
493,23,2.0,0.64,-23,2.0,3.2,0,0,0,0,0,0,1,0,3,False
472,8,1.4,4.0,-2,0.4,0.8,1,3,1,0,0,0,1,0,1,False
107,0,1.4,0.12,-4,0.6,0.6,0,0,0,1,0,0,0,1,1,False
558,6,1.0,2.6,1,2.6,0.52,1,3,0,1,0,1,0,0,1,False
535,30,2.0,0.56,-5,1.4,2.8,0,3,0,0,0,0,1,0,3,False
