In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
csv_path = f'../../data/premier_league/team_fixtures_2019/'
data_list = []

for i in os.listdir(csv_path):
    data = pd.read_csv(csv_path + i)
    data_list.append(data)

data_df = pd.concat(data_list).reset_index(drop = True)
data_df = data_df.replace({'%' : ''}, regex = True)
data_df = data_df.fillna(0)
data_df['h_result'] = np.where(data_df['h_result'] == 2, 1, np.where(data_df['h_result'] == 1, 0, 0))

data_df.head()

Unnamed: 0.1,Unnamed: 0,fixture_id,h_result,event_date,venue,h_team,a_team,h_halftime_score,a_haftime_score,h_fulltime_score,...,h_red_cards,a_red_cards,h_goalkeeper_saves,a_goalkeeper_saves,h_total_passes,a_total_passes,h_accurate_passes,a_accurate_passes,h_pass_percentage,a_pass_percentage
0,0,157024,1,2019-08-11T15:30:00+00:00,Old Trafford,33,49,1,0,4,...,0.0,0.0,7.0,1.0,449,523,363,442,81,85
1,1,157034,0,2019-08-19T19:00:00+00:00,Molineux Stadium,39,33,0,1,1,...,0.0,0.0,1.0,1.0,350,668,263,579,75,87
2,2,157039,0,2019-08-24T14:00:00+00:00,Old Trafford,33,52,0,1,1,...,0.0,0.0,1.0,2.0,562,231,481,141,86,61
3,3,157053,0,2019-08-31T11:30:00+00:00,St. Mary's Stadium,41,33,0,1,1,...,1.0,0.0,7.0,1.0,375,537,276,427,74,80
4,4,157059,1,2019-09-14T14:00:00+00:00,Old Trafford,33,46,1,0,1,...,0.0,0.0,3.0,4.0,345,471,258,373,75,79


In [3]:
X = data_df.loc[: , 'h_shot_on_goal' : ]
X = X.astype('float')
y = data_df[['h_result']].values.reshape(-1,1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 14)

In [5]:
scaler = StandardScaler()

In [6]:
X_scaler = scaler.fit(X_train)

In [7]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
rf_model = RandomForestClassifier(n_estimators = 500, random_state = 14)

In [9]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [10]:
predictions = rf_model.predict(X_test_scaled)

In [11]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index = ['actual_0', 'actual_1'], columns = ['predicted_0', 'predicted_1'])

acc_score = accuracy_score(y_test, predictions)

In [12]:
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,predicted_0,predicted_1
actual_0,97,0
actual_1,10,83


Accuracy Score : 0.9473684210526315
Classification Report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        97
           1       1.00      0.89      0.94        93

    accuracy                           0.95       190
   macro avg       0.95      0.95      0.95       190
weighted avg       0.95      0.95      0.95       190



In [13]:
importances = rf_model.feature_importances_

In [14]:
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.13128510158409712, 'h_shot_on_goal'),
 (0.05362871520681309, 'a_shots_inside_box'),
 (0.04456273309753901, 'a_shot_on_goal'),
 (0.04135934507588262, 'h_accurate_passes'),
 (0.04110151079097716, 'a_total_shots'),
 (0.03958981917582995, 'h_shot_off_goal'),
 (0.03763742901884969, 'h_total_passes'),
 (0.03586232255048431, 'a_goalkeeper_saves'),
 (0.035167919648324254, 'h_total_shots'),
 (0.03487393306477662, 'a_accurate_passes')]