In [55]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

When importing dataset, by sorting with MatchID, dataset is now timeseries dataset. Every feature is (home-away) and result is decided with score diff. We have total 50 feature differences.

In [56]:
data=pd.read_csv('Data/Dataset.csv').drop(['Unnamed: 0'],axis=1).sort_values(by=['MatchID'])
df_home=data.filter(regex='home').rename(columns=lambda x:x.replace('_home',''))
df_away=data.filter(regex='away').rename(columns=lambda x:x.replace('_away',''))
df_diff=df_home-df_away
cond=[(df_diff['Score']<0),(df_diff['Score']==0),(df_diff['Score']>0)]
val=['Away','Draw','Home']
df_diff['Result']=np.select(cond,val)
df_diff=df_diff.drop(columns=['Score'])
X=df_diff.iloc[:,:-1]
y=df_diff.iloc[:,-1:].values.ravel()

## Feature selection

In [57]:
forest=RandomForestClassifier()
forest.fit(X_scaled,y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("{}. feature {} ({:.3f})".format(f + 1, X.columns[indices][f], importances[indices[f]]))

# Plot the impurity-based feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=45)
plt.xlim([-1, X.shape[1]])
plt.show()

Feature ranking:
1. feature Clean sheets (0.092)
2. feature Interceptions (0.085)
3. feature Headed Clearance (0.074)
4. feature Recoveries (0.069)
5. feature Last man tackles (0.069)
6. feature Clearances (0.068)
7. feature Goals Conceded (0.064)
8. feature Successful 50/50s (0.064)
9. feature Duels won (0.062)
10. feature Blocked shots (0.061)
11. feature Tackles (0.061)
12. feature Clearances off line (0.060)
13. feature Tackle success % (0.059)
14. feature Aerial battles won (0.057)
15. feature Duels lost (0.056)


IndexError: index 15 is out of bounds for axis 0 with size 15

In [36]:
X=X.drop(columns=X.columns[indices[15:]])

In [58]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Train,Test,Validation set split
Split train,test,validation set into 0.8,0.1,0.1

In [59]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
X_test,X_val,y_test,y_val=train_test_split(X_test,y_test,test_size=0.5)

In [60]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape

((2835, 50), (2835,), (354, 50), (354,), (355, 50), (355,))

## MLP hyperparameter tuning with gridsearch

In [95]:
parameters = {'max_iter': [1000,1500,2000,2500,3000], 'alpha': 10.0 ** -np.arange(1, 10,step=5), 'hidden_layer_sizes':np.arange(10, 100,step=5), 'random_state':[0,5,9]}


clf = GridSearchCV(MLPClassifier(), parameters, n_jobs=-1)
clf.fit(X_train,y_train)



In [96]:
print(clf.best_params_)
print(clf.best_score_)

{'alpha': 0.1, 'hidden_layer_sizes': 10, 'max_iter': 1000, 'random_state': 9}
0.5477954144620811


In [97]:
clf.score(X_test,y_test)

0.556497175141243

In [98]:
clf.score(X_val,y_val)

0.5267605633802817