In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Load the match dataset (one row per fixture) from the Excel workbook.
df = pd.read_excel(io='wc_final_dataset.xlsx')

In [3]:
df.columns

Index(['Team1', 'Team2', 'Winner', 'Margin', 'Ground', 'Match Date',
       'T-20 Int Match', 'Team1 Avg Batting Ranking',
       'Team2 Avg Batting Ranking', 'Team1 Avg Bowling Ranking',
       'Team2 Avg Bowling Ranking', 'Team1 Total WCs participated',
       'Team1 Total WCs won', 'Team2 Total WCs participated',
       'Team2 Total WCs won', 'Team1 win % over Team2'],
      dtype='object')

In [4]:
# Discard the match-metadata columns; they are not used as model inputs below.
df = df.drop(['Margin', 'Match Date', 'Ground', 'T-20 Int Match'], axis='columns')

In [5]:
# Scaling the feature - Team1 win % over Team2 (divide the percentage by 100).
# Note: this cell rewrites the column from itself, so re-running it divides again.

df['Team1 win % over Team2'] = df['Team1 win % over Team2'].div(100)

In [6]:
# Output feature: remove 'Winner' and re-attach it so it becomes the last column.

winner_column = df.pop('Winner')
df['Winner'] = winner_column

In [7]:
df

Unnamed: 0,Team1,Team2,Team1 Avg Batting Ranking,Team2 Avg Batting Ranking,Team1 Avg Bowling Ranking,Team2 Avg Bowling Ranking,Team1 Total WCs participated,Team1 Total WCs won,Team2 Total WCs participated,Team2 Total WCs won,Team1 win % over Team2,Winner
0,India,Pakistan,67.13,65.87,80.67,63.40,0,0,0,0,1.0000,India
1,Australia,India,66.54,70.20,58.77,82.40,0,0,0,0,0.0000,India
2,New Zealand,Pakistan,66.07,67.20,68.60,66.60,0,0,0,0,0.0000,Pakistan
3,South Africa,India,69.53,75.67,76.93,86.33,0,0,0,0,0.0000,India
4,Bangladesh,Pakistan,75.53,69.67,70.60,68.33,0,0,0,0,0.0000,Pakistan
...,...,...,...,...,...,...,...,...,...,...,...,...
312,Afghanistan,Uganda,85.40,101.00,86.67,101.00,6,0,0,0,0.0000,Afghanistan
313,South Africa,Sri Lanka,77.73,84.60,79.87,93.20,8,0,8,1,0.6470,South Africa
314,Namibia,Oman,100.25,101.00,99.75,101.00,2,0,2,0,0.6660,tied
315,West Indies,PNG,78.33,98.80,85.13,101.00,8,2,1,0,0.0000,West Indies


In [8]:
# Row positions of the Super-Eight fixtures of the ICC Men's T-20 World Cup 2024
# (the contiguous block 265..278, i.e. 14 rows).

super_eight_matches_idx = list(range(265, 279))

In [9]:
# The test set comprises the Super-Eight fixtures of the ICC Men's T-20 World Cup 2024;
# it is used to compare the performance of the different models.
# Those fixtures are copied into df_test and removed from the training frame.
# super_eight_matches_idx holds row POSITIONS (it is used with iloc), while
# drop(index=...) expects index LABELS, so the positions are translated to labels
# first. The two only coincide when df has a default 0..n-1 index.

df_test = df.iloc[super_eight_matches_idx].copy()
df = df.drop(index=df.index[super_eight_matches_idx])

In [10]:
# Renumber the held-out rows 0..n-1, discarding the labels carried over from df.
df_test = df_test.reset_index(drop=True)

In [11]:
# Renumber the remaining training rows 0..n-1 after the held-out rows were removed.
df = df.reset_index(drop=True)

In [12]:
def transform_ranking(rank, max_rank=101):
    """Invert a ranking so that a better rank yields a larger value.

    A ranking is "better" when it is numerically smaller; this flips the scale so
    that larger means better: rank 1 maps to ``max_rank`` and rank ``max_rank``
    maps to 1.

    Args:
        rank: The ranking to transform (a number, or anything supporting
            ``number - rank`` such as a pandas Series).
        max_rank: The worst possible ranking on the scale. Defaults to 101, the
            value previously hard-coded in this function.

    Returns:
        ``max_rank + 1 - rank``.
    """
    return max_rank + 1 - rank

In [13]:
# Invert each of the four average-ranking columns of the training frame.
for ranking_column in (
    'Team1 Avg Batting Ranking',
    'Team2 Avg Batting Ranking',
    'Team1 Avg Bowling Ranking',
    'Team2 Avg Bowling Ranking',
):
    df[ranking_column] = df[ranking_column].apply(transform_ranking)

In [14]:
df

Unnamed: 0,Team1,Team2,Team1 Avg Batting Ranking,Team2 Avg Batting Ranking,Team1 Avg Bowling Ranking,Team2 Avg Bowling Ranking,Team1 Total WCs participated,Team1 Total WCs won,Team2 Total WCs participated,Team2 Total WCs won,Team1 win % over Team2,Winner
0,India,Pakistan,34.87,36.13,21.33,38.60,0,0,0,0,1.0000,India
1,Australia,India,35.46,31.80,43.23,19.60,0,0,0,0,0.0000,India
2,New Zealand,Pakistan,35.93,34.80,33.40,35.40,0,0,0,0,0.0000,Pakistan
3,South Africa,India,32.47,26.33,25.07,15.67,0,0,0,0,0.0000,India
4,Bangladesh,Pakistan,26.47,32.33,31.40,33.67,0,0,0,0,0.0000,Pakistan
...,...,...,...,...,...,...,...,...,...,...,...,...
298,Afghanistan,Uganda,16.60,1.00,15.33,1.00,6,0,0,0,0.0000,Afghanistan
299,South Africa,Sri Lanka,24.27,17.40,22.13,8.80,8,0,8,1,0.6470,South Africa
300,Namibia,Oman,1.75,1.00,2.25,1.00,2,0,2,0,0.6660,tied
301,West Indies,PNG,23.67,3.20,16.87,1.00,8,2,1,0,0.0000,West Indies


In [15]:
# Apply the same ranking inversion to the four ranking columns of the held-out frame.
for ranking_column in (
    'Team1 Avg Batting Ranking',
    'Team2 Avg Batting Ranking',
    'Team1 Avg Bowling Ranking',
    'Team2 Avg Bowling Ranking',
):
    df_test[ranking_column] = df_test[ranking_column].apply(transform_ranking)

In [16]:
df_test

Unnamed: 0,Team1,Team2,Team1 Avg Batting Ranking,Team2 Avg Batting Ranking,Team1 Avg Bowling Ranking,Team2 Avg Bowling Ranking,Team1 Total WCs participated,Team1 Total WCs won,Team2 Total WCs participated,Team2 Total WCs won,Team1 win % over Team2,Winner
0,India,South Africa,26.33,25.33,31.33,26.07,8,1,8,0,0.5384,India
1,England,India,26.0,26.73,28.73,31.27,8,2,8,1,0.4782,India
2,Afghanistan,South Africa,16.6,25.4,14.27,23.0,6,0,8,0,0.0,South Africa
3,Afghanistan,Bangladesh,16.93,9.0,14.67,22.47,6,0,8,0,0.5454,Afghanistan
4,Australia,India,36.67,25.33,32.47,30.67,8,1,8,1,0.3548,India
5,West Indies,South Africa,28.07,25.0,19.67,22.73,8,2,8,0,0.5,South Africa
6,England,USA,26.13,4.2,29.4,1.07,8,2,0,0,0.0,England
7,Afghanistan,Australia,16.13,37.07,14.33,32.67,6,0,8,1,0.0,Afghanistan
8,Bangladesh,India,9.47,22.93,21.73,30.07,8,0,8,1,0.076,India
9,West Indies,USA,26.53,4.13,19.2,1.27,8,2,0,0,0.0,West Indies


## Feature Scaling

In [17]:
# Feature scaling is performed on the 4 ranking inputs - Team1 Avg Batting Ranking, Team2 Avg Batting Ranking, Team1 Avg Bowling Ranking, Team2 Avg Bowling Ranking
# (0, 1) is MinMaxScaler's default output range, spelled out here for the reader.
scaler = MinMaxScaler(feature_range=(0, 1))

In [18]:
# Fit the scaler on the training rankings and overwrite them with the scaled values.
ranking_columns = [
    'Team1 Avg Batting Ranking',
    'Team2 Avg Batting Ranking',
    'Team1 Avg Bowling Ranking',
    'Team2 Avg Bowling Ranking',
]
df[ranking_columns] = scaler.fit_transform(df[ranking_columns])

In [19]:
df

Unnamed: 0,Team1,Team2,Team1 Avg Batting Ranking,Team2 Avg Batting Ranking,Team1 Avg Bowling Ranking,Team2 Avg Bowling Ranking,Team1 Total WCs participated,Team1 Total WCs won,Team2 Total WCs participated,Team2 Total WCs won,Team1 win % over Team2,Winner
0,India,Pakistan,0.917141,0.989577,0.481411,1.000000,0,0,0,0,1.0000,India
1,Australia,India,0.933117,0.867606,1.000000,0.494681,0,0,0,0,0.0000,India
2,New Zealand,Pakistan,0.945843,0.952113,0.767227,0.914894,0,0,0,0,0.0000,Pakistan
3,South Africa,India,0.852153,0.713521,0.569974,0.390160,0,0,0,0,0.0000,India
4,Bangladesh,Pakistan,0.689683,0.882535,0.719867,0.868883,0,0,0,0,0.0000,Pakistan
...,...,...,...,...,...,...,...,...,...,...,...,...
298,Afghanistan,Uganda,0.422421,0.000000,0.339332,0.000000,6,0,0,0,0.0000,Afghanistan
299,South Africa,Sri Lanka,0.630111,0.461972,0.500355,0.207447,8,0,8,1,0.6470,South Africa
300,Namibia,Oman,0.020309,0.000000,0.029600,0.000000,2,0,2,0,0.6660,tied
301,West Indies,PNG,0.613864,0.061972,0.375799,0.000000,8,2,1,0,0.0000,West Indies


In [20]:
# Scale the held-out rankings with the scaler fitted on the training data (no refit),
# so held-out values may fall slightly outside the training range.
ranking_columns = [
    'Team1 Avg Batting Ranking',
    'Team2 Avg Batting Ranking',
    'Team1 Avg Bowling Ranking',
    'Team2 Avg Bowling Ranking',
]
df_test[ranking_columns] = scaler.transform(df_test[ranking_columns])

In [21]:
df_test

Unnamed: 0,Team1,Team2,Team1 Avg Batting Ranking,Team2 Avg Batting Ranking,Team1 Avg Bowling Ranking,Team2 Avg Bowling Ranking,Team1 Total WCs participated,Team1 Total WCs won,Team2 Total WCs participated,Team2 Total WCs won,Team1 win % over Team2,Winner
0,India,South Africa,0.685892,0.685352,0.71821,0.666755,8,1,8,0,0.5384,India
1,England,India,0.676956,0.724789,0.656642,0.805053,8,2,8,1,0.4782,India
2,Afghanistan,South Africa,0.422421,0.687324,0.314232,0.585106,6,0,8,0,0.0,South Africa
3,Afghanistan,Bangladesh,0.431357,0.225352,0.323704,0.571011,6,0,8,0,0.5454,Afghanistan
4,Australia,India,0.965881,0.685352,0.745205,0.789096,8,1,8,1,0.3548,India
5,West Indies,South Africa,0.733008,0.676056,0.442103,0.577926,8,2,8,0,0.5,South Africa
6,England,USA,0.680477,0.090141,0.672508,0.001862,8,2,0,0,0.0,England
7,Afghanistan,Australia,0.409694,1.016056,0.315652,0.842287,6,0,8,1,0.0,Afghanistan
8,Bangladesh,India,0.229353,0.617746,0.490883,0.773138,8,0,8,1,0.076,India
9,West Indies,USA,0.691308,0.088169,0.430973,0.007181,8,2,0,0,0.0,West Indies


## Encoding the output feature

In [22]:
# Default label 0 covers no-result, tied and abandoned matches;
# wins are filled in by the encoding step below.

df = df.assign(Winner_encoded=0)

In [23]:
# Same default label (0) for the held-out fixtures.
df_test = df_test.assign(Winner_encoded=0)

In [24]:
# Encode the result relative to the two sides of each fixture:
#   1 -> Team1 won, 2 -> Team2 won, 0 -> anything else (no result, tied, abandoned).
# np.select takes the first matching condition, mirroring the if/elif order, and
# replaces the row-by-row iterrows() loop with one vectorised assignment.
df['Winner_encoded'] = np.select(
    [df['Winner'] == df['Team1'], df['Winner'] == df['Team2']],
    [1, 2],
    default=0,
)

In [25]:
# Encode the held-out results the same way as the training results:
#   1 -> Team1 won, 2 -> Team2 won, 0 -> anything else (no result, tied, abandoned).
# np.select takes the first matching condition, mirroring the if/elif order, and
# replaces the row-by-row iterrows() loop with one vectorised assignment.
df_test['Winner_encoded'] = np.select(
    [df_test['Winner'] == df_test['Team1'], df_test['Winner'] == df_test['Team2']],
    [1, 2],
    default=0,
)

In [26]:
# The textual winner is no longer needed once Winner_encoded exists.
df = df.drop('Winner', axis='columns')
df_test = df_test.drop('Winner', axis='columns')

## Encoding the input features

In [27]:
categorical_columns = ['Team1', 'Team2']
categorical_data = df[categorical_columns]

# One-hot encode the two team columns, fitting the categories on the training frame.
# handle_unknown='ignore' makes a team that was never seen in a given slot during
# fitting encode as all zeros at transform time instead of raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_data = encoder.fit_transform(categorical_data)

# Carry df's index over so the later column-wise concat with df aligns row-for-row
# by label rather than relying on both frames having a 0..n-1 index.
df_encoded = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)

In [28]:
# Numeric inputs plus the encoded target; the target is deliberately listed last.
numerical_columns = [
    'Team1 Avg Batting Ranking',
    'Team2 Avg Batting Ranking',
    'Team1 Avg Bowling Ranking',
    'Team2 Avg Bowling Ranking',
    'Team1 Total WCs participated',
    'Team1 Total WCs won',
    'Team2 Total WCs participated',
    'Team2 Total WCs won',
    'Team1 win % over Team2',
    'Winner_encoded',
]
# Training matrix: one-hot team columns followed by the numeric columns.
df_final_train = pd.concat([df_encoded, df[numerical_columns]], axis='columns')

In [29]:
df_final_train

Unnamed: 0,Team1_Afghanistan,Team1_Australia,Team1_Bangladesh,Team1_Canada,Team1_England,Team1_Hong Kong,Team1_India,Team1_Ireland,Team1_Kenya,Team1_Namibia,...,Team1 Avg Batting Ranking,Team2 Avg Batting Ranking,Team1 Avg Bowling Ranking,Team2 Avg Bowling Ranking,Team1 Total WCs participated,Team1 Total WCs won,Team2 Total WCs participated,Team2 Total WCs won,Team1 win % over Team2,Winner_encoded
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.917141,0.989577,0.481411,1.000000,0,0,0,0,1.0000,1
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.933117,0.867606,1.000000,0.494681,0,0,0,0,0.0000,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.945843,0.952113,0.767227,0.914894,0,0,0,0,0.0000,2
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.852153,0.713521,0.569974,0.390160,0,0,0,0,0.0000,2
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.689683,0.882535,0.719867,0.868883,0,0,0,0,0.0000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.422421,0.000000,0.339332,0.000000,6,0,0,0,0.0000,1
299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630111,0.461972,0.500355,0.207447,8,0,8,1,0.6470,1
300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.020309,0.000000,0.029600,0.000000,2,0,2,0,0.6660,0
301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.613864,0.061972,0.375799,0.000000,8,2,1,0,0.0000,1


In [30]:
df_final_train.columns

Index(['Team1_Afghanistan', 'Team1_Australia', 'Team1_Bangladesh',
       'Team1_Canada', 'Team1_England', 'Team1_Hong Kong', 'Team1_India',
       'Team1_Ireland', 'Team1_Kenya', 'Team1_Namibia', 'Team1_Nepal',
       'Team1_Netherlands', 'Team1_New Zealand', 'Team1_Oman', 'Team1_PNG',
       'Team1_Pakistan', 'Team1_Scotland', 'Team1_South Africa',
       'Team1_Sri Lanka', 'Team1_UAE', 'Team1_USA', 'Team1_West Indies',
       'Team2_Afghanistan', 'Team2_Australia', 'Team2_Bangladesh',
       'Team2_Canada', 'Team2_England', 'Team2_Hong Kong', 'Team2_India',
       'Team2_Ireland', 'Team2_Namibia', 'Team2_Nepal', 'Team2_Netherlands',
       'Team2_New Zealand', 'Team2_Oman', 'Team2_PNG', 'Team2_Pakistan',
       'Team2_Scotland', 'Team2_South Africa', 'Team2_Sri Lanka', 'Team2_UAE',
       'Team2_USA', 'Team2_Uganda', 'Team2_West Indies', 'Team2_Zimbabwe',
       'Team1 Avg Batting Ranking', 'Team2 Avg Batting Ranking',
       'Team1 Avg Bowling Ranking', 'Team2 Avg Bowling Ranking',

In [31]:
categorical_test_data = df_test[categorical_columns]

# Reuse the encoder fitted on the training frame (transform only, no refit) so the
# held-out frame gets exactly the same one-hot columns.
encoded_data = encoder.transform(categorical_test_data)

# Carry df_test's index over so the later column-wise concat with df_test aligns
# row-for-row by label rather than relying on a 0..n-1 index.
df_encoded_test = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df_test.index,
)

In [32]:
# Numeric inputs plus the encoded target; the target is deliberately listed last.
numerical_columns = [
    'Team1 Avg Batting Ranking',
    'Team2 Avg Batting Ranking',
    'Team1 Avg Bowling Ranking',
    'Team2 Avg Bowling Ranking',
    'Team1 Total WCs participated',
    'Team1 Total WCs won',
    'Team2 Total WCs participated',
    'Team2 Total WCs won',
    'Team1 win % over Team2',
    'Winner_encoded',
]
# Held-out matrix: one-hot team columns followed by the numeric columns.
df_final_test = pd.concat([df_encoded_test, df_test[numerical_columns]], axis='columns')

In [33]:
df_final_test

Unnamed: 0,Team1_Afghanistan,Team1_Australia,Team1_Bangladesh,Team1_Canada,Team1_England,Team1_Hong Kong,Team1_India,Team1_Ireland,Team1_Kenya,Team1_Namibia,...,Team1 Avg Batting Ranking,Team2 Avg Batting Ranking,Team1 Avg Bowling Ranking,Team2 Avg Bowling Ranking,Team1 Total WCs participated,Team1 Total WCs won,Team2 Total WCs participated,Team2 Total WCs won,Team1 win % over Team2,Winner_encoded
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.685892,0.685352,0.71821,0.666755,8,1,8,0,0.5384,1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.676956,0.724789,0.656642,0.805053,8,2,8,1,0.4782,2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.422421,0.687324,0.314232,0.585106,6,0,8,0,0.0,2
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.431357,0.225352,0.323704,0.571011,6,0,8,0,0.5454,1
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.965881,0.685352,0.745205,0.789096,8,1,8,1,0.3548,2
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.733008,0.676056,0.442103,0.577926,8,2,8,0,0.5,2
6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.680477,0.090141,0.672508,0.001862,8,2,0,0,0.0,1
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.409694,1.016056,0.315652,0.842287,6,0,8,1,0.0,1
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.229353,0.617746,0.490883,0.773138,8,0,8,1,0.076,2
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.691308,0.088169,0.430973,0.007181,8,2,0,0,0.0,1


In [34]:
df_final_test.columns

Index(['Team1_Afghanistan', 'Team1_Australia', 'Team1_Bangladesh',
       'Team1_Canada', 'Team1_England', 'Team1_Hong Kong', 'Team1_India',
       'Team1_Ireland', 'Team1_Kenya', 'Team1_Namibia', 'Team1_Nepal',
       'Team1_Netherlands', 'Team1_New Zealand', 'Team1_Oman', 'Team1_PNG',
       'Team1_Pakistan', 'Team1_Scotland', 'Team1_South Africa',
       'Team1_Sri Lanka', 'Team1_UAE', 'Team1_USA', 'Team1_West Indies',
       'Team2_Afghanistan', 'Team2_Australia', 'Team2_Bangladesh',
       'Team2_Canada', 'Team2_England', 'Team2_Hong Kong', 'Team2_India',
       'Team2_Ireland', 'Team2_Namibia', 'Team2_Nepal', 'Team2_Netherlands',
       'Team2_New Zealand', 'Team2_Oman', 'Team2_PNG', 'Team2_Pakistan',
       'Team2_Scotland', 'Team2_South Africa', 'Team2_Sri Lanka', 'Team2_UAE',
       'Team2_USA', 'Team2_Uganda', 'Team2_West Indies', 'Team2_Zimbabwe',
       'Team1 Avg Batting Ranking', 'Team2 Avg Batting Ranking',
       'Team1 Avg Bowling Ranking', 'Team2 Avg Bowling Ranking',

## Building various Machine Learning Models

In [35]:
# Features are every column except the target; the target (the last column,
# 'Winner_encoded') is kept as an (n_samples, 1) array.
X_train = df_final_train.drop(columns='Winner_encoded').values
y_train = df_final_train[['Winner_encoded']].values

In [36]:
# Same split for the held-out frame: all columns but the target as features,
# and the target 'Winner_encoded' as an (n_samples, 1) array.
X_test = df_final_test.drop(columns='Winner_encoded').values
y_test = df_final_test[['Winner_encoded']].values

In [37]:
# Empty comparison table; each model section appends one row of scores to it.
result_columns = [
    "Model",
    "Training Accuracy",
    "Testing Accuracy",
    "F1_Score",
    "Precision",
    "Recall",
]
df_resultant = pd.DataFrame(columns=result_columns)

In [38]:
df_resultant

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,F1_Score,Precision,Recall


## Logistic Regression

In [39]:
# Logistic regression with the lbfgs solver; max_iter is set to 10000.
logistic_reg_classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=10000,
    random_state=0,
)
# The target is flattened to 1-D for fitting.
logistic_reg_classifier.fit(X=X_train, y=y_train.ravel())

In [40]:
# Mean accuracy on the training data, rounded to two decimals.
training_accuracy = round(logistic_reg_classifier.score(X=X_train, y=y_train), 2)

In [41]:
# Predicted encoded result for each held-out fixture.
y_pred = logistic_reg_classifier.predict(X=X_test)

In [42]:
# Fraction of held-out fixtures predicted correctly, rounded to two decimals.
testing_accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

In [43]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1 = round(f1_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [44]:
# Macro-averaged precision (unweighted mean of the per-class precisions).
precision = round(precision_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [45]:
# Macro-averaged recall (unweighted mean of the per-class recalls).
recall = round(recall_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [46]:
# Append this model's scores as the next row of the comparison table.
df_resultant.loc[len(df_resultant)] = [
    'Logistic Regression',
    training_accuracy,
    testing_accuracy,
    f1,
    precision,
    recall,
]

In [47]:
df_resultant

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,F1_Score,Precision,Recall
0,Logistic Regression,0.75,0.71,0.71,0.73,0.73


## Random Forest

In [48]:
# Random forest of 100 entropy-criterion trees, fitted on the flattened (1-D) target.
random_forest_clf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
).fit(X=X_train, y=y_train.ravel())

In [49]:
# Mean accuracy on the training data, rounded to two decimals.
training_accuracy = round(random_forest_clf.score(X=X_train, y=y_train), 2)

In [50]:
# Predicted encoded result for each held-out fixture.
y_pred = random_forest_clf.predict(X=X_test)

In [51]:
# Fraction of held-out fixtures predicted correctly, rounded to two decimals.
testing_accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

In [52]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1 = round(f1_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [53]:
# Macro-averaged precision (unweighted mean of the per-class precisions).
precision = round(precision_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [54]:
# Macro-averaged recall (unweighted mean of the per-class recalls).
recall = round(recall_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [55]:
# Append this model's scores as the next row of the comparison table.
df_resultant.loc[len(df_resultant)] = [
    'Random Forest Classifier',
    training_accuracy,
    testing_accuracy,
    f1,
    precision,
    recall,
]

In [56]:
df_resultant

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,F1_Score,Precision,Recall
0,Logistic Regression,0.75,0.71,0.71,0.73,0.73
1,Random Forest Classifier,1.0,0.57,0.56,0.62,0.6


## SVM

In [57]:
# Support vector classifier with a linear kernel, fitted on the flattened (1-D) target.
svm_classifier = SVC(
    kernel='linear',
    random_state=0,
).fit(X=X_train, y=y_train.ravel())

In [58]:
# Mean accuracy on the training data, rounded to two decimals.
training_accuracy = round(svm_classifier.score(X=X_train, y=y_train), 2)

In [59]:
# Predicted encoded result for each held-out fixture.
y_pred = svm_classifier.predict(X=X_test)

In [60]:
# Fraction of held-out fixtures predicted correctly, rounded to two decimals.
testing_accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

In [61]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1 = round(f1_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [62]:
# Macro-averaged precision (unweighted mean of the per-class precisions).
precision = round(precision_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [63]:
# Macro-averaged recall (unweighted mean of the per-class recalls).
recall = round(recall_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [64]:
# Append this model's scores as the next row of the comparison table.
df_resultant.loc[len(df_resultant)] = [
    'SVM',
    training_accuracy,
    testing_accuracy,
    f1,
    precision,
    recall,
]

In [65]:
df_resultant

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,F1_Score,Precision,Recall
0,Logistic Regression,0.75,0.71,0.71,0.73,0.73
1,Random Forest Classifier,1.0,0.57,0.56,0.62,0.6
2,SVM,0.73,0.57,0.56,0.62,0.6


## Naive Bayes

In [66]:
# Gaussian naive Bayes with default settings, fitted on the flattened (1-D) target.
gausian_nb = GaussianNB().fit(X=X_train, y=y_train.ravel())

In [67]:
# Mean accuracy on the training data, rounded to two decimals.
training_accuracy = round(gausian_nb.score(X=X_train, y=y_train), 2)

In [68]:
# Predicted encoded result for each held-out fixture.
y_pred = gausian_nb.predict(X=X_test)

In [69]:
# Fraction of held-out fixtures predicted correctly, rounded to two decimals.
testing_accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

In [70]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1 = round(f1_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [71]:
# Macro-averaged precision (unweighted mean of the per-class precisions).
precision = round(precision_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [72]:
# Macro-averaged recall (unweighted mean of the per-class recalls).
recall = round(recall_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [73]:
# Append this model's scores as the next row of the comparison table.
df_resultant.loc[len(df_resultant)] = [
    'Naive Bayes',
    training_accuracy,
    testing_accuracy,
    f1,
    precision,
    recall,
]

In [74]:
df_resultant

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,F1_Score,Precision,Recall
0,Logistic Regression,0.75,0.71,0.71,0.73,0.73
1,Random Forest Classifier,1.0,0.57,0.56,0.62,0.6
2,SVM,0.73,0.57,0.56,0.62,0.6
3,Naive Bayes,0.52,0.57,0.56,0.62,0.6


## Kernel SVM

In [75]:
# Support vector classifier with an RBF kernel, fitted on the flattened (1-D) target.
# Note: this rebinds svm_classifier, replacing the linear-kernel model fitted earlier.
svm_classifier = SVC(
    kernel='rbf',
    random_state=0,
).fit(X=X_train, y=y_train.ravel())

In [76]:
# Mean accuracy on the training data, rounded to two decimals.
training_accuracy = round(svm_classifier.score(X=X_train, y=y_train), 2)

In [77]:
# Predicted encoded result for each held-out fixture.
y_pred = svm_classifier.predict(X=X_test)

In [78]:
# Fraction of held-out fixtures predicted correctly, rounded to two decimals.
testing_accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

In [79]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1 = round(f1_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [80]:
# Macro-averaged precision (unweighted mean of the per-class precisions).
precision = round(precision_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [81]:
# Macro-averaged recall (unweighted mean of the per-class recalls).
recall = round(recall_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [82]:
# Append this model's scores as the next row of the comparison table.
df_resultant.loc[len(df_resultant)] = [
    'Kernel SVM',
    training_accuracy,
    testing_accuracy,
    f1,
    precision,
    recall,
]

In [83]:
df_resultant

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,F1_Score,Precision,Recall
0,Logistic Regression,0.75,0.71,0.71,0.73,0.73
1,Random Forest Classifier,1.0,0.57,0.56,0.62,0.6
2,SVM,0.73,0.57,0.56,0.62,0.6
3,Naive Bayes,0.52,0.57,0.56,0.62,0.6
4,Kernel SVM,0.69,0.43,0.42,0.45,0.46


## K-Nearest Neighbours

In [84]:
# 10-nearest-neighbours classifier using the Minkowski metric with p=2,
# fitted on the flattened (1-D) target.
knn_classifier = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=2,
).fit(X=X_train, y=y_train.ravel())

In [85]:
# Mean accuracy on the training data, rounded to two decimals.
training_accuracy = round(knn_classifier.score(X=X_train, y=y_train), 2)

In [86]:
# Predicted encoded result for each held-out fixture.
y_pred = knn_classifier.predict(X=X_test)

In [87]:
# Fraction of held-out fixtures predicted correctly, rounded to two decimals.
testing_accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

In [88]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1 = round(f1_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [89]:
# Macro-averaged precision (unweighted mean of the per-class precisions).
precision = round(precision_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [90]:
# Macro-averaged recall (unweighted mean of the per-class recalls).
recall = round(recall_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [91]:
# Append this model's scores as the next row of the comparison table.
df_resultant.loc[len(df_resultant)] = [
    'K-Nearest Neighbours',
    training_accuracy,
    testing_accuracy,
    f1,
    precision,
    recall,
]

## Decision Tree Classifier

In [92]:
# Single entropy-criterion decision tree. The target is flattened with .ravel()
# so it is passed as a 1-D array, consistent with how every other model in this
# notebook is fitted (this cell previously passed the 2-D column array).
decision_tree_classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train.ravel())

In [93]:
# Mean accuracy on the training data, rounded to two decimals.
training_accuracy = round(decision_tree_classifier.score(X=X_train, y=y_train), 2)

In [94]:
# Predicted encoded result for each held-out fixture.
y_pred = decision_tree_classifier.predict(X=X_test)

In [95]:
# Fraction of held-out fixtures predicted correctly, rounded to two decimals.
testing_accuracy = round(accuracy_score(y_true=y_test, y_pred=y_pred), 2)

In [96]:
# Macro-averaged F1 (unweighted mean of the per-class F1 scores).
f1 = round(f1_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [97]:
# Macro-averaged precision (unweighted mean of the per-class precisions).
precision = round(precision_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [98]:
# Macro-averaged recall (unweighted mean of the per-class recalls).
recall = round(recall_score(y_true=y_test, y_pred=y_pred, average='macro'), 2)

In [99]:
# Append this model's scores as the next row of the comparison table.
df_resultant.loc[len(df_resultant)] = [
    'Decision Tree Classifier',
    training_accuracy,
    testing_accuracy,
    f1,
    precision,
    recall,
]

In [100]:
df_resultant

Unnamed: 0,Model,Training Accuracy,Testing Accuracy,F1_Score,Precision,Recall
0,Logistic Regression,0.75,0.71,0.71,0.73,0.73
1,Random Forest Classifier,1.0,0.57,0.56,0.62,0.6
2,SVM,0.73,0.57,0.56,0.62,0.6
3,Naive Bayes,0.52,0.57,0.56,0.62,0.6
4,Kernel SVM,0.69,0.43,0.42,0.45,0.46
5,K-Nearest Neighbours,0.7,0.5,0.5,0.52,0.52
6,Decision Tree Classifier,1.0,0.36,0.33,0.35,0.4
