In [None]:
#Importing Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Load the dataset from the CSV file (path is relative to the working directory)
csv_path = 'Dataset.csv'
dataset = pd.read_csv(csv_path)

In [None]:
# Quick sanity check: report the (rows, columns) tuple, then show the first rows
shape_text = str(dataset.shape)
print('Shape of the dataset: ' + shape_text)
first_rows = dataset.head()
print(first_rows)

Shape of the dataset: (380, 17)
   FTHG  FTAG  HTHG  HTAG  HS  AS  HST  AST  HF  AF  HC  AC  HY  AY  HR  AR  \
0     3     0     2     0  23  12   11    2  15  15  16   7   1   2   0   0   
1     1     0     1     0   7  17    2   12  19  14   1   3   2   1   0   0   
2     0     0     0     0  13  12    9    7  12  13   4   8   1   3   0   0   
3     6     0     2     0  18  10   13    4  10  10   3   1   1   0   0   0   
4     2     2     1     0   6  13    2    7  13  10   3   6   3   3   1   0   

  FTR  
0   H  
1   H  
2   D  
3   H  
4   D  


In [None]:
#Creating the dependent variable class
# pd.factorize returns (codes, uniques): integer codes for every row plus the
# distinct labels in order of first appearance, so definitions[code] gives the
# original label back.
#
# The FTR column is overwritten with its codes, so encode only while it still
# holds the raw labels. Re-running this cell on the already-encoded column would
# factorize the codes themselves and replace `definitions` with 0/1/2, losing
# the labels that are needed later to decode the predictions.
# The `definitions` check keeps the first run working even if the column is
# already integer-valued when loaded.
if 'definitions' not in globals() or not pd.api.types.is_integer_dtype(dataset['FTR']):
    factor = pd.factorize(dataset['FTR'])
    # Bracket assignment always targets the column (attribute assignment can
    # silently create a plain attribute instead when the name is not a column).
    dataset['FTR'] = factor[0]
    definitions = factor[1]
print(dataset.FTR.head())
print(definitions)

0    0
1    0
2    1
3    0
4    1
Name: FTR, dtype: int64
Index(['H', 'D', 'A'], dtype='object')


In [None]:
#Splitting the data into independent and dependent variables
# Select the columns by name instead of by position, so the split does not
# depend on FTR being the last of exactly 17 columns in the CSV.
# X: every column except the target, as a 2-D NumPy array.
# y: the FTR column, as a 1-D NumPy array.
# NOTE(review): in the 17 rows printed by this cell, FTR always agrees with the
# comparison of FTHG against FTAG, so these two features may leak the target -
# confirm whether they should really be part of X.
target_column = 'FTR'
X = dataset.drop(columns=[target_column]).values
y = dataset[target_column].values
print('The independent features set: ')
print(X[:17,:])
print('The dependent variable: ')
print(y[:17])

The independent features set: 
[[ 3  0  2  0 23 12 11  2 15 15 16  7  1  2  0  0]
 [ 1  0  1  0  7 17  2 12 19 14  1  3  2  1  0  0]
 [ 0  0  0  0 13 12  9  7 12 13  4  8  1  3  0  0]
 [ 6  0  2  0 18 10 13  4 10 10  3  1  1  0  0  0]
 [ 2  2  1  0  6 13  2  7 13 10  3  6  3  3  1  0]
 [ 0  0  0  0 22 11 18  7 13 16 10  3  0  2  0  0]
 [ 0  4  0  3 11  9  6  7  8 11  6  4  1  1  0  0]
 [ 2  1  2  0 13 10  7  6 17 13  5  5  0  2  0  0]
 [ 1  1  0  0  7 14  4  7 13 15  9 11  1  3  1  1]
 [ 3  0  2  0 18  7 10  3  9  5  5  3  2  2  0  0]
 [ 6  0  3  0 26  3 16  1  9  3  8  2  0  0  0  1]
 [ 2  1  0  0 10  7  7  2 13 14  4 12  2  3  0  0]
 [ 1  1  1  0 14  9  6  4 18 15  3  3  1  2  0  0]
 [ 1  2  1  2 15  7 10  6 15  4  6  2  3  1  0  0]
 [ 1  0  0  0 11 10  5  4 16 12  4  6  2  2  0  0]
 [ 1  3  0  0 17 12 10  8 11 18  8  5  2  4  0  0]
 [ 0  6  0  1 17 10 12  8 12  7  0  0  1  2  0  0]]
The dependent variable: 
[0 0 1 0 1 1 2 0 1 0 0 0 1 2 0 2 2]


In [None]:
# Creating the Training and Test set from data
# 25% of the samples are held out for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=21,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Feature Scaling
# The scaler is fitted on the training set only; the same fitted scaler is then
# applied to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fitting Random Forest Classification to the Training set
# Hyper-parameters are collected in one place; random_state is fixed so the
# fitted forest is reproducible.
forest_settings = {
    'n_estimators': 10,
    'criterion': 'entropy',
    'random_state': 42,
}
classifier = RandomForestClassifier(**forest_settings)
# Kept as the last expression so the notebook displays the fitted estimator.
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=42)

In [None]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Reverse factorize (converting the integer codes back to the H, D and A labels).
# `definitions` holds the labels in code order, so indexing an array of labels
# with an array of integer codes decodes every element at once. This works for
# any number of classes and for empty inputs, and does not depend on the string
# width of the first decoded element.
label_lookup = np.asarray(definitions, dtype=str)
y_pred = label_lookup[y_pred]
# y_test is overwritten in place, so decode it only while it still holds integer
# codes; re-running this cell must not try to decode labels a second time.
if pd.api.types.is_integer_dtype(y_test):
    y_test = label_lookup[y_test]

In [None]:
# Making the Confusion Matrix
# Cross-tabulate actual labels (rows) against predicted labels (columns).
confusion_table = pd.crosstab(
    y_test,
    y_pred,
    rownames=['Actual Output'],
    colnames=['Predicted Output'],
)
print(confusion_table)

Predicted Output   A   D   H
Actual Output               
A                 25   3   0
D                  0  23   4
H                  0   0  40
