## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import os

## Importing the dataset

In [4]:
file_dir = os.getcwd()[:-6] + "/Cleansed/df_all.csv"
dataset = pd.read_csv(file_dir)
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

## Encoding Categorical Data

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
le = LabelEncoder()

In [7]:
y = le.fit_transform(y)

## Splitting the dataset into Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

## Training Random Forest Classifier on the Training Set

In [7]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', random_state=0)

## Predicting the Test Data

In [8]:
y_pred = classifier.predict(X_test)

In [9]:
pred_data = pd.DataFrame({"Predicted values":y_pred, "Actual values":y_test})

In [10]:
pred_data

Unnamed: 0,Predicted values,Actual values
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
827825,0,0
827826,0,0
827827,0,0
827828,1,1


## Confusion Matrix and Accuracy Score

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score

# First row [Predict Class 0, Actual Class 0, Predict Class 0, Actual Class 1]
# Second row [Predict Class 1, Actual Class 0, Predict Class 1, Actual Class 1]
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[678736   3068]
 [  9959 136067]]


0.9842636773250547

## F1 Score

In [12]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

F1 score: 0.9543170349381577


## Using ICA

In [30]:
from sklearn.decomposition import FastICA

ica = FastICA(n_components=7)
X_ica = ica.fit_transform(X_train)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_ica, y_train)



In [31]:
y_pred = classifier.predict(ica.fit_transform(X_test))



In [32]:
pred_data = pd.DataFrame({"Predicted values":y_pred, "Actual values":y_test})

In [33]:
pred_data

Unnamed: 0,Predicted values,Actual values
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0
...,...,...
827825,0,0
827826,0,0
827827,1,0
827828,0,1


## Confusion Matrix and Accuracy Score

In [34]:
from sklearn.metrics import confusion_matrix, accuracy_score

# First row [Predict Class 0, Actual Class 0, Predict Class 0, Actual Class 1]
# Second row [Predict Class 1, Actual Class 0, Predict Class 1, Actual Class 1]
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[602964  78840]
 [133944  12082]]


0.7429617191935542

## Using LDA

In [20]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=1)

# run an LDA and use it to transform the features
X_lda = lda.fit(X_train, y_train).transform(X_train)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_lda, y_train)

In [21]:
y_pred = classifier.predict(lda.fit(X_test, y_test).transform(X_test))

In [22]:
pred_data = pd.DataFrame({"Predicted values":y_pred, "Actual values":y_test})

In [23]:
pred_data

Unnamed: 0,Predicted values,Actual values
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
827825,1,0
827826,0,0
827827,1,0
827828,0,1


In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score

# First row [Predict Class 0, Actual Class 0, Predict Class 0, Actual Class 1]
# Second row [Predict Class 1, Actual Class 0, Predict Class 1, Actual Class 1]
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[591553  90251]
 [111764  34262]]


0.7559704287112088