In [1]:
# Start (or reuse) the Spark session used by every Spark DataFrame in this notebook.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Credit Card Fraud Detection")
    .getOrCreate()
)
# Last expression of the cell: shows the session summary.
spark

In [2]:
#Import needed libraries
from pyspark.sql.functions import *
from pyspark.ml.classification import  RandomForestClassifier
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler, VectorSlicer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


In [3]:
# Load the credit card CSV: the first line is treated as the header and
# column types are inferred from the data.
credit_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/FileStore/tables/creditcard.csv")
)
type(credit_df)

In [4]:
# Preview the first five rows to see the feature columns and the target label ("Class").
display(credit_df.limit(5))

Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.3598071336738,-0.0727811733098497,2.53634673796914,1.37815522427443,-0.338320769942518,0.462387777762292,0.239598554061257,0.0986979012610507,0.363786969611213,0.0907941719789316,-0.551599533260813,-0.617800855762348,-0.991389847235408,-0.311169353699879,1.46817697209427,-0.470400525259478,0.207971241929242,0.0257905801985591,0.403992960255733,0.251412098239705,-0.018306777944153,0.277837575558899,-0.110473910188767,0.0669280749146731,0.128539358273528,-0.189114843888824,0.133558376740387,-0.0210530534538215,149.62,0
0,1.19185711131486,0.26615071205963,0.16648011335321,0.448154078460911,0.0600176492822243,-0.0823608088155687,-0.0788029833323113,0.0851016549148104,-0.255425128109186,-0.166974414004614,1.61272666105479,1.06523531137287,0.48909501589608,-0.143772296441519,0.635558093258208,0.463917041022171,-0.114804663102346,-0.183361270123994,-0.145783041325259,-0.0690831352230203,-0.225775248033138,-0.638671952771851,0.101288021253234,-0.339846475529127,0.167170404418143,0.125894532368176,-0.0089830991432281,0.0147241691924927,2.69,0
1,-1.35835406159823,-1.34016307473609,1.77320934263119,0.379779593034328,-0.503198133318193,1.80049938079263,0.791460956450422,0.247675786588991,-1.51465432260583,0.207642865216696,0.624501459424895,0.066083685268831,0.717292731410831,-0.165945922763554,2.34586494901581,-2.89008319444231,1.10996937869599,-0.121359313195888,-2.26185709530414,0.524979725224404,0.247998153469754,0.771679401917229,0.909412262347719,-0.689280956490685,-0.327641833735251,-0.139096571514147,-0.0553527940384261,-0.0597518405929204,378.66,0
1,-0.966271711572087,-0.185226008082898,1.79299333957872,-0.863291275036453,-0.0103088796030823,1.24720316752486,0.23760893977178,0.377435874652262,-1.38702406270197,-0.0549519224713749,-0.226487263835401,0.178228225877303,0.507756869957169,-0.28792374549456,-0.631418117709045,-1.0596472454325,-0.684092786345479,1.96577500349538,-1.2326219700892,-0.208037781160366,-0.108300452035545,0.0052735967825345,-0.190320518742841,-1.17557533186321,0.647376034602038,-0.221928844458407,0.0627228487293033,0.0614576285006353,123.5,0
2,-1.15823309349523,0.877736754848451,1.548717846511,0.403033933955121,-0.407193377311653,0.0959214624684256,0.592940745385545,-0.270532677192282,0.817739308235294,0.753074431976354,-0.822842877946363,0.53819555014995,1.3458515932154,-1.11966983471731,0.175121130008994,-0.451449182813529,-0.237033239362776,-0.0381947870352842,0.803486924960175,0.408542360392758,-0.0094306971323291,0.79827849458971,-0.137458079619063,0.141266983824769,-0.206009587619756,0.502292224181569,0.219422229513348,0.215153147499206,69.99,0


In [5]:
# Row count per value of "Class" (fraud vs non-fraud records).
classFreq = (
    credit_df
    .groupBy("Class")
    .count()
)
classFreq.show()

In [6]:
# Convert the Spark DataFrame to pandas and shuffle the rows.
import pandas as pd

data_pd = credit_df.toPandas()
# Seed the shuffle: the down-sampling step later takes the *first* non-fraud rows of
# this shuffled frame, so an unseeded shuffle would pick a different subsample (and
# give different downstream metrics) on every run. 42 matches the random_state used
# for the other pandas shuffle in this notebook.
data = data_pd.sample(frac=1, random_state=42)

In [7]:
# Bar chart of the number of rows per class in the full dataset.
import seaborn as sns
from matplotlib import pyplot as plt

sns.countplot(data=data, x='Class')
plt.title('Imbalanced Distubited Classes', fontsize=14)
display(plt.show())

In [8]:
# Add a sequential row number column ("idx") to the full dataset, ordered by Time.
from pyspark.sql.window import Window

# The window has no partitionBy, so it spans the whole DataFrame.
win = Window.orderBy('Time')
dfff = credit_df.withColumn("idx", row_number().over(win))

In [9]:
# Build the (index, features, label) DataFrame for the baseline models and split it.
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import DenseVector

# Positional row layout: 0 = Time, 1-28 = V1..V28, 29 = Amount, 30 = Class, 31 = idx.
# The slice [0:29] packs Time and V1..V28 into the dense feature vector.
# NOTE(review): Amount (position 29) is not part of the feature vector - confirm this is intended.
training_df = spark.createDataFrame(
    dfff.rdd.map(lambda row: (DenseVector(row[0:29]), row[30], row[31])),
    ["features", "label", "index"],
).select("index", "features", "label")

# 80/20 train/test split with a fixed seed.
train_data_before, test_data_before = training_df.randomSplit([0.8, 0.2], seed=1234)

In [10]:
# Rows per label in the training split of the full dataset.
(
    train_data_before
    .groupBy("label")
    .count()
    .show()
)


In [11]:
# Rows per label in the test split of the full dataset.
(
    test_data_before
    .groupBy("label")
    .count()
    .show()
)

In [12]:
# First classifier before handling the imbalanced dataset:
# fit Gradient-Boosted Trees on the training split and score the test split.
gbt_before = GBTClassifier(
    featuresCol="features",
    maxIter=100,
    maxDepth=8,
)
model_before = gbt_before.fit(train_data_before)
predictions_gbt_before = model_before.transform(test_data_before)

# Number of test rows assigned to each predicted class.
(
    predictions_gbt_before
    .groupBy("prediction")
    .count()
    .show()
)

In [13]:
# Evaluate the GBT predictions with BinaryClassificationEvaluator.
# The arguments below spell out the evaluator's defaults (area under the ROC curve).
evaluator_gbt_before = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator_gbt_before.evaluate(predictions_gbt_before)

In [14]:
# Flag correctly detected fraud: 1 when the row is fraud (label 1) AND was
# predicted as fraud (prediction 1), otherwise 0.
predictions_gbt_before = predictions_gbt_before.withColumn(
    "fraudPrediction",
    when((col("label") == 1) & (col("prediction") == 1), 1).otherwise(0),
)
(
    predictions_gbt_before
    .groupBy("fraudPrediction")
    .count()
    .show()
)

In [15]:
# Recall (in percent): correctly detected fraud rows / all fraud rows in the test split.
from pyspark.sql.functions import col

# Count matching rows directly. The previous groupBy(...).where(...).head()[1] form
# raised TypeError when no row matched, because head() returns None on an empty result.
accurateFraud_gbt_before = predictions_gbt_before.where(col("fraudPrediction") == 1).count()
totalFraud_gbt_before = predictions_gbt_before.where(col("label") == 1).count()

# Recall is undefined (NaN) when the test split holds no fraud rows.
FraudPredictionAccuracy_gbt_before = (
    (accurateFraud_gbt_before / totalFraud_gbt_before) * 100
    if totalFraud_gbt_before
    else float("nan")
)
print(FraudPredictionAccuracy_gbt_before)

In [16]:
# Confusion matrix, recall and precision for the GBT model trained before rebalancing.

# One grouped count returns every (label, prediction) combination in a single Spark job;
# combinations that never occur are absent from the result and default to 0 below.
confusion_gbt_before = {
    (int(row["label"]), int(row["prediction"])): row["count"]
    for row in predictions_gbt_before.groupBy("label", "prediction").count().collect()
}
tp_gbt_before = confusion_gbt_before.get((1, 1), 0)
tn_gbt_before = confusion_gbt_before.get((0, 0), 0)
fp_gbt_before = confusion_gbt_before.get((0, 1), 0)
fn_gbt_before = confusion_gbt_before.get((1, 0), 0)


print("True Positive: ",tp_gbt_before,"\nTrue Negative: ",tn_gbt_before,"\nFalse Positive: ",fp_gbt_before,"\nFalse Negative: ",fn_gbt_before)

# Guard both ratios: a model that predicts no fraud at all (tp + fp == 0), or a test
# split without fraud rows (tp + fn == 0), would otherwise raise ZeroDivisionError.
actual_fraud_gbt_before = tp_gbt_before + fn_gbt_before
predicted_fraud_gbt_before = tp_gbt_before + fp_gbt_before
print("Recall: ", tp_gbt_before/actual_fraud_gbt_before if actual_fraud_gbt_before else float("nan"))
print("Precision: ", tp_gbt_before/predicted_fraud_gbt_before if predicted_fraud_gbt_before else float("nan"))

In [17]:
# Second classifier before handling the imbalanced dataset:
# fit a Random Forest on the training split and score the test split.
from pyspark.ml.classification import RandomForestClassifier

rf_before = RandomForestClassifier(labelCol='label', featuresCol='features')
rfModel_before = rf_before.fit(train_data_before)
predictions_rf_before = rfModel_before.transform(test_data_before)

# Number of test rows assigned to each predicted class.
(
    predictions_rf_before
    .groupBy("prediction")
    .count()
    .show()
)

In [18]:
# Evaluate the Random Forest predictions with BinaryClassificationEvaluator.
# The arguments below spell out the evaluator's defaults (area under the ROC curve).
evaluator_rf_before = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator_rf_before.evaluate(predictions_rf_before)

In [19]:
# Flag correctly detected fraud: 1 when the row is fraud (label 1) AND was
# predicted as fraud (prediction 1), otherwise 0.
predictions_rf_before = predictions_rf_before.withColumn(
    "fraudPrediction",
    when((col("label") == 1) & (col("prediction") == 1), 1).otherwise(0),
)
(
    predictions_rf_before
    .groupBy("fraudPrediction")
    .count()
    .show()
)

In [20]:
# Recall (in percent): correctly detected fraud rows / all fraud rows in the test split.
from pyspark.sql.functions import col

# Count matching rows directly. The previous groupBy(...).where(...).head()[1] form
# raised TypeError when no row matched, because head() returns None on an empty result.
accurateFraud_rf_before = predictions_rf_before.where(col("fraudPrediction") == 1).count()
totalFraud_rf_before = predictions_rf_before.where(col("label") == 1).count()

# Recall is undefined (NaN) when the test split holds no fraud rows.
FraudPredictionAccuracy_rf_before = (
    (accurateFraud_rf_before / totalFraud_rf_before) * 100
    if totalFraud_rf_before
    else float("nan")
)
print(FraudPredictionAccuracy_rf_before)

In [21]:
# Confusion matrix, recall and precision for the Random Forest trained before rebalancing.

# One grouped count returns every (label, prediction) combination in a single Spark job;
# combinations that never occur are absent from the result and default to 0 below.
confusion_rf_before = {
    (int(row["label"]), int(row["prediction"])): row["count"]
    for row in predictions_rf_before.groupBy("label", "prediction").count().collect()
}
tp_rf_before = confusion_rf_before.get((1, 1), 0)
tn_rf_before = confusion_rf_before.get((0, 0), 0)
fp_rf_before = confusion_rf_before.get((0, 1), 0)
fn_rf_before = confusion_rf_before.get((1, 0), 0)


print("True Positive: ",tp_rf_before,"\nTrue Negative: ",tn_rf_before,"\nFalse Positive: ",fp_rf_before,"\nFalse Negative: ",fn_rf_before)

# Guard both ratios: a model that predicts no fraud at all (tp + fp == 0), or a test
# split without fraud rows (tp + fn == 0), would otherwise raise ZeroDivisionError.
actual_fraud_rf_before = tp_rf_before + fn_rf_before
predicted_fraud_rf_before = tp_rf_before + fp_rf_before
print("Recall: ", tp_rf_before/actual_fraud_rf_before if actual_fraud_rf_before else float("nan"))
print("Precision: ", tp_rf_before/predicted_fraud_rf_before if predicted_fraud_rf_before else float("nan"))

In [22]:
# Print the per-class row counts and the total size of the full (imbalanced) pandas frame.
import seaborn as sns
from matplotlib import pyplot as plt

# `data` is the shuffled full dataset, not a subsample, so the header says so
# (the previous header text read "subsample dataset").
print("Distribution of Classes in full dataset")
print(data['Class'].value_counts())
print(len(data))


In [23]:
# Down-sampling to handle the imbalanced dataset: keep every fraud row and an equal
# number of non-fraud rows so both classes have the same size.

fraud_df = data.loc[data['Class'] == 1]
# Take as many non-fraud rows as there are fraud rows instead of a hard-coded 492,
# so the classes stay balanced if the input data changes. `data` is already shuffled,
# so the leading rows are a random subset.
non_fraud_df = data.loc[data['Class'] == 0][:len(fraud_df)]

normal_distribution_df = pd.concat([fraud_df, non_fraud_df])

# Shuffle dataframe rows (seeded, so the order is repeatable).
new_df = normal_distribution_df.sample(frac=1, random_state=42)
new_df.shape

In [24]:
# Print the per-class row counts and the total size of the down-sampled frame.

print("Distribution of Classes in subsample dataset")
print(new_df['Class'].value_counts(), len(new_df), sep="\n")


In [25]:
# Bar chart of the number of rows per class after under-sampling.
sns.countplot(data=new_df, x='Class')
plt.title('Sampled Equally Distubited Classes', fontsize=14)
display(plt.show())


In [26]:
# Correlation heatmaps: full (imbalanced) dataset on top, balanced subsample below.
f, (ax1,ax2) = plt.subplots(2,1, figsize=(24,20))

corr = data.corr()
sub_sample_corr = new_df.corr()

heatmap_specs = (
    (ax1, corr, "Imbalanced dataset Correlation Matrix"),
    (ax2, sub_sample_corr, 'Subsample Correlation Matrix'),
)
for axis, matrix, title in heatmap_specs:
    sns.heatmap(matrix, cmap='coolwarm_r', annot_kws={'size':20}, ax=axis)
    axis.set_title(title, fontsize=14)
display(plt.show())


In [27]:
# Box plots per class on the balanced subsample, used to look for outliers.
# First row of plots: features described as negatively correlated with Class
# (the lower the value, the more likely the transaction is fraud).
# Second row: features described as positively correlated with Class
# (the higher the value, the more likely the transaction is fraud).
boxplot_groups = (
    ("Negative", ("V17", "V14", "V12", "V10")),
    ("Positive", ("V11", "V4", "V2", "V19")),
)

for direction, feature_names in boxplot_groups:
    # One figure with four side-by-side panels per group.
    f, axes = plt.subplots(ncols=4, figsize=(20,4))
    for axis, feature_name in zip(axes, feature_names):
        sns.boxplot(x="Class", y=feature_name, data=new_df, ax=axis)
        axis.set_title('{} vs Class {} Correlation'.format(feature_name, direction))
    display(plt.show())

In [28]:
# Distribution of V14, V12 and V10 among the fraud rows of the balanced subsample,
# each overlaid with a fitted normal curve (fit=norm).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the
# installed version before upgrading.
from scipy.stats import norm
f, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20,6))

v14_fraud_dist = new_df['V14'].loc[new_df['Class'] == 1].values
sns.distplot(v14_fraud_dist, ax=ax1, fit=norm, color='#FB8861')
ax1.set_title('V14 Distribution \n (Fraud Transactions)', fontsize=14)

v12_fraud_dist = new_df['V12'].loc[new_df['Class'] == 1].values
sns.distplot(v12_fraud_dist, ax=ax2, fit=norm, color='#FB8861')
# Title fixed: it previously read '12 Distribution', dropping the feature's "V" prefix.
ax2.set_title('V12 Distribution \n (Fraud Transactions)', fontsize=14)

v10_fraud_dist = new_df['V10'].loc[new_df['Class'] == 1].values
sns.distplot(v10_fraud_dist, ax=ax3, fit=norm, color='#FB8861')
ax3.set_title('V10 Distribution \n (Fraud Transactions)', fontsize=14)

display(plt.show())

In [29]:
#Removing Outliers
import numpy as np


def remove_fraud_iqr_outliers(df, feature, label_col='Class', fraud_label=1, whisker=1.5):
    """Drop rows whose `feature` value lies outside the IQR fence of the fraud rows.

    The 25th and 75th percentiles are computed only on rows where
    df[label_col] == fraud_label. The fence [q25 - whisker * IQR, q75 + whisker * IQR]
    is then applied to every row of `df`, whatever its class.

    Args:
        df: pandas DataFrame containing `feature` and `label_col`.
        feature: name of the column to trim.
        label_col: name of the class column.
        fraud_label: value of `label_col` that marks fraud rows.
        whisker: multiple of the IQR used as cut-off distance.

    Returns:
        A new DataFrame without the out-of-fence rows; `df` itself is not modified.

    Prints the quartiles, IQR, cut-off, fence bounds, the fraud-row outliers and the
    number of rows left.
    """
    fraud_values = df[feature].loc[df[label_col] == fraud_label].values
    q25, q75 = np.percentile(fraud_values, 25), np.percentile(fraud_values, 75)
    print('Quartile 25: {} | Quartile 75: {}'.format(q25, q75))
    iqr = q75 - q25
    print('iqr:{}'.format(iqr))

    cut_off = iqr * whisker
    lower, upper = q25 - cut_off, q75 + cut_off
    print('Cut Off: {}'.format(cut_off))
    print('{} Lower: {}'.format(feature, lower))
    print('{} Upper: {}'.format(feature, upper))

    # Reported for information only: outliers among the fraud rows.
    outliers = [x for x in fraud_values if x < lower or x > upper]
    print('Feature {} Outliers for Fraud Cases {}'.format(feature, len(outliers)))
    print('{} outliers: {}'.format(feature, outliers))

    # The drop applies the fence to all rows, fraud and non-fraud alike.
    trimmed = df.drop(df[(df[feature] > upper) | (df[feature] < lower)].index)
    print('Number of Instances after outliers removal {}'.format(len(trimmed)))
    return trimmed


# Trim one feature after another. Order matters: each step computes its quartiles on
# the rows that survived the previous step.
# NOTE(review): this runs before the train/test split, so rows that later land in the
# test split are also filtered - confirm that is intended.
OUTLIER_FEATURES = ['V14', 'V12', 'V10']
for position, feature_name in enumerate(OUTLIER_FEATURES):
    if position:
        # Separator line between the reports of two features.
        print('---' * 44)
    new_df = remove_fraud_iqr_outliers(new_df, feature_name)

In [30]:
# Box plots of V14, V12 and V10 per class after the outlier removal step.
f, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20,6))

colors = ['#B3F9C5', '#f9c5b3']

# Same plot, title pattern and annotation for each feature; only the column changes.
for axis, feature_name in zip((ax1, ax2, ax3), ("V14", "V12", "V10")):
    sns.boxplot(x="Class", y=feature_name, data=new_df, ax=axis, palette=colors)
    axis.set_title("{} Feature \n Reduction of outliers".format(feature_name), fontsize=14)
    axis.annotate(
        'Fewer extreme \n ouliers',
        xy=(0.98, -17.5),
        xytext=(0, -12),
        arrowprops=dict(facecolor='black'),
        fontsize=14,
    )

display(plt.show())

In [31]:
# Convert the balanced, trimmed pandas frame back to a Spark DataFrame and add a
# sequential row number column ("idx") ordered by Time.
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# The window has no partitionBy, so it spans the whole DataFrame.
win = Window.orderBy('Time')
dfff = spark.createDataFrame(new_df).withColumn("idx", row_number().over(win))

In [32]:
# Build the (index, features, label) DataFrame from the balanced data and split it.
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import DenseVector

# The row is indexed by position: the slice [0:29] becomes the dense feature vector,
# position 30 the label and position 31 the index.
# NOTE(review): position 29 is left out of the feature vector - confirm which column
# that is in this frame and whether dropping it is intended.
training_df = spark.createDataFrame(
    dfff.rdd.map(lambda row: (DenseVector(row[0:29]), row[30], row[31])),
    ["features", "label", "index"],
).select("index", "features", "label")

# 80/20 train/test split with a fixed seed.
train_data_after, test_data_after = training_df.randomSplit([0.8, 0.2], seed=1234)

In [33]:
# Rows per label in the training split of the balanced dataset.
(
    train_data_after
    .groupBy("label")
    .count()
    .show()
)


In [34]:
# Rows per label in the test split of the balanced dataset.
(
    test_data_after
    .groupBy("label")
    .count()
    .show()
)


In [35]:
# First classifier after handling the imbalanced dataset:
# fit Gradient-Boosted Trees on the balanced training split and score the balanced test split.
gbt = GBTClassifier(
    featuresCol="features",
    maxIter=100,
    maxDepth=8,
)
model = gbt.fit(train_data_after)
predictions_gbt_after = model.transform(test_data_after)

# Number of test rows assigned to each predicted class.
(
    predictions_gbt_after
    .groupBy("prediction")
    .count()
    .show()
)

In [36]:
# Evaluate the GBT predictions with BinaryClassificationEvaluator.
# The arguments below spell out the evaluator's defaults (area under the ROC curve).
evaluator_gbt_after = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluator_gbt_after.evaluate(predictions_gbt_after)

In [37]:
# Flag correctly detected fraud: 1 when the row is fraud (label 1) AND was
# predicted as fraud (prediction 1), otherwise 0.
predictions_gbt_after = predictions_gbt_after.withColumn(
    "fraudPrediction",
    when((col("label") == 1) & (col("prediction") == 1), 1).otherwise(0),
)
(
    predictions_gbt_after
    .groupBy("fraudPrediction")
    .count()
    .show()
)

In [38]:
# Recall (in percent): correctly detected fraud rows / all fraud rows in the test split.
from pyspark.sql.functions import col

# Count matching rows directly. The previous groupBy(...).where(...).head()[1] form
# raised TypeError when no row matched, because head() returns None on an empty result.
accurateFraud_gbt_after = predictions_gbt_after.where(col("fraudPrediction") == 1).count()
totalFraud_gbt_after = predictions_gbt_after.where(col("label") == 1).count()

# Recall is undefined (NaN) when the test split holds no fraud rows.
FraudPredictionAccuracy_gbt_after = (
    (accurateFraud_gbt_after / totalFraud_gbt_after) * 100
    if totalFraud_gbt_after
    else float("nan")
)
print(FraudPredictionAccuracy_gbt_after)

In [39]:
# Confusion matrix, recall and precision for the GBT model trained on the balanced data.

# One grouped count returns every (label, prediction) combination in a single Spark job;
# combinations that never occur are absent from the result and default to 0 below.
confusion_gbt_after = {
    (int(row["label"]), int(row["prediction"])): row["count"]
    for row in predictions_gbt_after.groupBy("label", "prediction").count().collect()
}
tp_gbt_after = confusion_gbt_after.get((1, 1), 0)
tn_gbt_after = confusion_gbt_after.get((0, 0), 0)
fp_gbt_after = confusion_gbt_after.get((0, 1), 0)
fn_gbt_after = confusion_gbt_after.get((1, 0), 0)


print("True Positive: ",tp_gbt_after,"\nTrue Negative: ",tn_gbt_after,"\nFalse Positive: ",fp_gbt_after,"\nFalse Negative: ",fn_gbt_after)

# Guard both ratios: a model that predicts no fraud at all (tp + fp == 0), or a test
# split without fraud rows (tp + fn == 0), would otherwise raise ZeroDivisionError.
actual_fraud_gbt_after = tp_gbt_after + fn_gbt_after
predicted_fraud_gbt_after = tp_gbt_after + fp_gbt_after
print("Recall: ", tp_gbt_after/actual_fraud_gbt_after if actual_fraud_gbt_after else float("nan"))
print("Precision: ", tp_gbt_after/predicted_fraud_gbt_after if predicted_fraud_gbt_after else float("nan"))

In [40]:
# Second classifier after handling the imbalanced dataset:
# fit a Random Forest on the balanced training split and score the balanced test split.
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='label', featuresCol='features')
rfModel = rf.fit(train_data_after)
predictions_rf_after = rfModel.transform(test_data_after)

# Number of test rows assigned to each predicted class.
(
    predictions_rf_after
    .groupBy("prediction")
    .count()
    .show()
)

In [41]:
# Evaluate the Random Forest predictions with BinaryClassificationEvaluator.
# The arguments below spell out the evaluator's defaults (area under the ROC curve).
evaluatorrf_after = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
evaluatorrf_after.evaluate(predictions_rf_after)

In [42]:
# Flag correctly detected fraud: 1 when the row is fraud (label 1) AND was
# predicted as fraud (prediction 1), otherwise 0.
predictions_rf_after = predictions_rf_after.withColumn(
    "fraudPrediction",
    when((col("label") == 1) & (col("prediction") == 1), 1).otherwise(0),
)
(
    predictions_rf_after
    .groupBy("fraudPrediction")
    .count()
    .show()
)

In [43]:
# Recall (in percent): correctly detected fraud rows / all fraud rows in the test split.
from pyspark.sql.functions import col

# Count matching rows directly. The previous groupBy(...).where(...).head()[1] form
# raised TypeError when no row matched, because head() returns None on an empty result.
accurateFraud_rf_after = predictions_rf_after.where(col("fraudPrediction") == 1).count()
totalFraud_rf_after = predictions_rf_after.where(col("label") == 1).count()

# Recall is undefined (NaN) when the test split holds no fraud rows.
FraudPredictionAccuracy_rf_after = (
    (accurateFraud_rf_after / totalFraud_rf_after) * 100
    if totalFraud_rf_after
    else float("nan")
)
print(FraudPredictionAccuracy_rf_after)

In [44]:
# Confusion matrix, recall and precision for the Random Forest trained on the balanced data.

# One grouped count returns every (label, prediction) combination in a single Spark job;
# combinations that never occur are absent from the result and default to 0 below.
confusion_rf_after = {
    (int(row["label"]), int(row["prediction"])): row["count"]
    for row in predictions_rf_after.groupBy("label", "prediction").count().collect()
}
tp_rf_after = confusion_rf_after.get((1, 1), 0)
tn_rf_after = confusion_rf_after.get((0, 0), 0)
fp_rf_after = confusion_rf_after.get((0, 1), 0)
fn_rf_after = confusion_rf_after.get((1, 0), 0)


print("True Positive: ",tp_rf_after,"\nTrue Negative: ",tn_rf_after,"\nFalse Positive: ",fp_rf_after,"\nFalse Negative: ",fn_rf_after)

# Guard both ratios: a model that predicts no fraud at all (tp + fp == 0), or a test
# split without fraud rows (tp + fn == 0), would otherwise raise ZeroDivisionError.
actual_fraud_rf_after = tp_rf_after + fn_rf_after
predicted_fraud_rf_after = tp_rf_after + fp_rf_after
print("Recall: ", tp_rf_after/actual_fraud_rf_after if actual_fraud_rf_after else float("nan"))
print("Precision: ", tp_rf_after/predicted_fraud_rf_after if predicted_fraud_rf_after else float("nan"))

In [45]:
# Plot the ROC curves of both classifiers, before and after handling the imbalanced
# dataset, on one figure.
from sklearn import metrics
import numpy as np


def roc_from_predictions(predictions_df, score_col="probability"):
    """Compute ROC curve points and AUC from a Spark predictions DataFrame.

    The "label" column and `score_col` are fetched with one collect(), so every score
    stays paired with the label of the same row.

    Args:
        predictions_df: Spark DataFrame returned by a fitted model's transform().
        score_col: column holding the score. A vector-valued column is read at
            position 1 (the class-1 entry); a scalar column is used as is.
            NOTE(review): assumes both models emit the default "probability"
            column - confirm for the Spark version in use.

    Returns:
        (labels, scores, fpr, tpr, thresholds, auc); the first five are numpy arrays.
    """
    rows = predictions_df.select("label", score_col).collect()
    labels = np.array([int(row["label"]) for row in rows])
    scores = np.array([
        float(row[score_col][1]) if hasattr(row[score_col], "__len__") else float(row[score_col])
        for row in rows
    ])
    fpr, tpr, thresholds = metrics.roc_curve(labels, scores)
    auc = metrics.roc_auc_score(labels, scores)
    return labels, scores, fpr, tpr, thresholds, auc


# The predictions_* DataFrames are left untouched. The earlier version called
# .flatten('F') on them (DataFrames have no such method) and computed the GBT
# "before" AUC from the Random Forest variables, which were not yet defined.

#GBT Classifier plot before handling imbalanced dataset
(label_array_gbt_before, scores_gbt_before, fpr_gbt_before, tpr_gbt_before,
 thresh_gbt_before, auc_gbt_before) = roc_from_predictions(predictions_gbt_before)
plt.plot(fpr_gbt_before,tpr_gbt_before,label="GBT Classifier before handle imbalance, auc="+str(auc_gbt_before))

#Random Forest plot before handling imbalanced dataset
(label_array_rf_before, scores_rf_before, fpr_rf_before, tpr_rf_before,
 thresh_rf_before, auc_rf_before) = roc_from_predictions(predictions_rf_before)
plt.plot(fpr_rf_before,tpr_rf_before,label="Random Forest before handle imbalance, auc="+str(auc_rf_before))

#GBT Classifier plot after handling imbalanced dataset
(label_array_gbt_after, scores_gbt_after, fpr_gbt_after, tpr_gbt_after,
 thresh_gbt_after, auc_gbt_after) = roc_from_predictions(predictions_gbt_after)
plt.plot(fpr_gbt_after,tpr_gbt_after,label="GBT Classifier after handle imbalance, auc="+str(auc_gbt_after))

#Random Forest plot after handling imbalanced dataset
(label_array_rf_after, scores_rf_after, fpr_rf_after, tpr_rf_after,
 thresh_rf_after, auc_rf_after) = roc_from_predictions(predictions_rf_after)
plt.plot(fpr_rf_after,tpr_rf_after,label="Random Forest after handle imbalance, auc="+str(auc_rf_after))

plt.legend(loc=0)
display(plt.show())

In [46]:
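# After handling the imbalanced dataset, the GBT and Random Forest classifiers reach about the same accuracy (around 94.5%),
# which is higher than the roughly 89.6% that both reach before handling the imbalanced dataset.
# Note: the "before" models are evaluated on a test split of the full dataset and the "after" models on a test split
# of the down-sampled dataset, so the two figures come from different test sets.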