In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import sklearn 
from sklearn.metrics import classification_report, confusion_matrix

In [0]:
# Get the active Spark session, or create one, under the application name 'CCDefault'.
spark = (
    SparkSession.builder
    .appName('CCDefault')
    .getOrCreate()
)

In [0]:
# Part 1 - Data Preprocessing
# Read the credit-card CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    '/FileStore/tables/UCI_Credit_Card.csv',
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
df.printSchema()

In [0]:
# 1.1 Cleaning Null Values
# For each column, count the rows whose value is NaN or null and show the
# counts as a single row (the original run found no null values).
df.select(
    *(
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df.columns
    )
).show()

In [0]:
# 1.2 Creating Features Column

# Column 0 is the ID (not relevant) and the last column is the label, so the
# columns in between (index 1 up to the second-to-last) are the features.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=df.columns[1:-1],
)

# Apply the assembler to the whole dataset; this adds the 'features' column.
output = assembler.transform(df)

# Peek at one row: 'features' is a single vector combining all selected columns.
output.select('features').head(1)

In [0]:
# 1.3 Creating Features and Label Dataframe
# Row count per label value (the original run showed 2 categories for the label).
(
    output
    .groupBy('default_payment')
    .count()
    .show()
)

In [0]:
# The modelling data consists of just the assembled feature vector and the label.
df_final = output.select(['features', 'default_payment'])

# Preview the first three rows.
df_final.show(n=3)

In [0]:
# 1.4 Train Test Split
# Randomly split the data: roughly 75% of rows go to train and 25% to test
# (randomSplit assigns rows probabilistically, so the sizes are approximate).
# A fixed seed keeps the split repeatable across runs on the same input, so the
# metrics reported below can be reproduced.
train, test = df_final.randomSplit([0.75, 0.25], seed=42)

# Summary statistics of the training split.
train.describe().show()

In [0]:
# Summary statistics of the test split, for comparison with the training split.
(
    test
    .describe()
    .show()
)

In [0]:
# Part 2 - Creating our Model
# 2.1 Logistic Regression Model
# Estimator configuration: read 'features', learn 'default_payment', write
# 'prediction', with at most 200 optimizer iterations.
logreg = LogisticRegression(
    maxIter=200,
    featuresCol='features',
    labelCol='default_payment',
    predictionCol='prediction',
)

# Fit the estimator on the training split.
logreg_model = logreg.fit(train)

In [0]:
# Evaluate the fitted logistic regression on the test split; the returned
# summary exposes the scored rows as `.predictions`.
pred_logreg = logreg_model.evaluate(test)

# Collect label and prediction together in ONE pass. Two separate collect()
# calls run two Spark jobs and give no guarantee that the rows come back in
# the same order, which could silently misalign y_true and y_pred.
rows_logreg = pred_logreg.predictions.select('default_payment', 'prediction').collect()

# Unwrap each Row into a plain scalar so sklearn receives flat 1-D label lists.
y_true_logreg = [row['default_payment'] for row in rows_logreg]
y_pred_logreg = [row['prediction'] for row in rows_logreg]

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true_logreg, y_pred_logreg))

In [0]:
# Confusion matrix for logistic regression on the test split
# (rows are true labels, columns are predicted labels).
print(
    confusion_matrix(y_true_logreg, y_pred_logreg)
)

In [0]:
# 2.2 Random Forest Model
# A 10-tree random forest on the same features/label columns. The seed is fixed
# because tree bagging and feature sub-sampling are random; without it the
# metrics and feature importances below can change from run to run.
rf = RandomForestClassifier(
    labelCol="default_payment",
    featuresCol="features",
    numTrees=10,
    seed=42,
)

# Fit the forest on the training split.
rf_model = rf.fit(train)

In [0]:
# Evaluate the fitted random forest on the test split; the returned summary
# exposes the scored rows as `.predictions`.
pred_rf = rf_model.evaluate(test)

# Collect label and prediction together in ONE pass. Two separate collect()
# calls run two Spark jobs and give no guarantee that the rows come back in
# the same order, which could silently misalign y_true and y_pred.
rows_rf = pred_rf.predictions.select('default_payment', 'prediction').collect()

# Unwrap each Row into a plain scalar so sklearn receives flat 1-D label lists.
y_true_rf = [row['default_payment'] for row in rows_rf]
y_pred_rf = [row['prediction'] for row in rows_rf]

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true_rf, y_pred_rf))

In [0]:
# Confusion matrix for the random forest on the test split
# (rows are true labels, columns are predicted labels).
print(
    confusion_matrix(y_true_rf, y_pred_rf)
)

In [0]:
# Display the fitted random forest's feature importances. Indices follow the order of
# the assembler's input columns (df.columns[1:-1]), so index 0 is the first column after the ID.
rf_model.featureImportances # in the original run, index 5 (column PAY_0) looked like the largest determining factor

In [0]:
# 2.3 Decision Tree Model
# A single decision tree on the same features/label columns, default hyperparameters.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="default_payment",
)

# Fit the tree on the training split.
dt_model = dt.fit(train)

In [0]:
# Score the test split with the fitted decision tree (adds a 'prediction' column).
pred_dt = dt_model.transform(test)

# Accuracy evaluator comparing 'prediction' against the 'default_payment' label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="default_payment",
)

# Report the test error, i.e. the complement of accuracy.
accuracy = evaluator.evaluate(pred_dt)
print("Test Error = %g " % (1.0 - accuracy,))

In [0]:
# Display the fitted decision tree's feature importances. Indices follow the order of
# the assembler's input columns (df.columns[1:-1]), so index 0 is the first column after the ID.
dt_model.featureImportances # in the original run, index 5 (column PAY_0) again looked like the largest determining factor