In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import sklearn 
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Obtain the Spark session for this notebook (an already-running session is
# reused if one exists), registered under the application name 'CCDefault'.
spark = (
    SparkSession.builder
    .appName('CCDefault')
    .getOrCreate()
)

# **Part 1 - Data Preprocessing**

In [None]:
# Read the credit-card CSV, treating the first line as a header and letting
# Spark infer each column's type from the data.
df = spark.read.csv(
    '/FileStore/tables/UCI_Credit_Card.csv',
    header=True,
    inferSchema=True,
)

# Show the inferred column names and types.
df.printSchema()

**1.1 Cleaning Null Values**

In [None]:
# For every column, count the rows whose value is NaN or NULL and show the
# counts as a single-row table.
# Original author's note on the result: no null values.
df.select(
    [
        count(when(isnan(name) | col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
).show()

**1.2 Creating Features Column**



In [None]:
# Pack the predictor columns into a single vector column named 'features'.
# The slice [1:-1] skips column 0 (the ID, which is not a predictor) and the
# last column (the label), keeping everything in between.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=df.columns[1:-1],
)

# Append the assembled 'features' column to every row of the dataset.
output = assembler.transform(df)

# Peek at the first assembled vector to confirm it holds the combined features.
output.select('features').head(1)

**1.3 Creating Features and Label Dataframe**

In [None]:
# Row count per label value; the original author observed two categories here.
(
    output
    .groupBy('default_payment')
    .count()
    .show()
)

In [None]:
# Keep only what the models consume: the feature vector and the label.
df_final = output.select(
    'features',
    'default_payment',
)

# Preview the first three rows.
df_final.show(n=3)

**1.4 Train Test Split**

In [None]:
# Randomly split the modelling data: roughly 75% of rows go to train and 25% to test
# (randomSplit weights are approximate, not exact row counts).
# A fixed seed keeps the split, and every metric computed from it, stable across re-runs.
train, test = df_final.randomSplit([0.75, 0.25], seed=42)

# Summary statistics of the training split.
train.describe().show()

In [None]:
# Summary statistics of the test split, for comparison with the training split.
(
    test
    .describe()
    .show()
)

# **Part 2 - Creating our Model**
**2.1 Logistic Regression Model**

In [None]:
# Configure a logistic regression on the assembled features, allowing up to
# 200 optimisation iterations.
logreg = LogisticRegression(
    maxIter=200,
    labelCol='default_payment',
    featuresCol='features',
    predictionCol='prediction',
)

# Train on the training split.
logreg_model = logreg.fit(train)

In [None]:
# Score the test split with the fitted logistic regression.
pred_logreg = logreg_model.evaluate(test)

# Collect label and prediction together in ONE action so that each pair is
# guaranteed to come from the same row. Two separate collect() calls would run
# the scoring job twice and rely on Spark returning rows in the same order
# both times.
rows_logreg = pred_logreg.predictions.select('default_payment', 'prediction').collect()

# Unpack each Row into plain scalars so scikit-learn receives flat 1-D
# sequences rather than lists of single-field Row objects.
y_true_logreg = [row['default_payment'] for row in rows_logreg]
y_pred_logreg = [row['prediction'] for row in rows_logreg]

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true_logreg, y_pred_logreg))

In [None]:
# Confusion matrix for the logistic regression on the test split
# (scikit-learn layout: rows are true labels, columns are predicted labels).
print(
    confusion_matrix(
        y_true=y_true_logreg,
        y_pred=y_pred_logreg,
    )
)

**2.2 Random Forest Model**

In [None]:
# Configure a random forest of 10 trees on the same features and label.
rf = RandomForestClassifier(
    numTrees=10,
    featuresCol="features",
    labelCol="default_payment",
)

# Train on the training split.
rf_model = rf.fit(train)

In [None]:
# Score the test split with the fitted random forest.
pred_rf = rf_model.evaluate(test)

# Collect label and prediction together in ONE action so that each pair is
# guaranteed to come from the same row. Two separate collect() calls would run
# the scoring job twice and rely on Spark returning rows in the same order
# both times.
rows_rf = pred_rf.predictions.select('default_payment', 'prediction').collect()

# Unpack each Row into plain scalars so scikit-learn receives flat 1-D
# sequences rather than lists of single-field Row objects.
y_true_rf = [row['default_payment'] for row in rows_rf]
y_pred_rf = [row['prediction'] for row in rows_rf]

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_true_rf, y_pred_rf))

In [None]:
# Confusion matrix for the random forest on the test split
# (scikit-learn layout: rows are true labels, columns are predicted labels).
print(
    confusion_matrix(
        y_true=y_true_rf,
        y_pred=y_pred_rf,
    )
)

In [None]:
# Display the fitted random forest's per-feature importance vector.
# Positions follow the VectorAssembler inputCols order (df.columns[1:-1]),
# so vector index i corresponds to df.columns[i + 1].
rf_model.featureImportances #author's observation from their run: index 5 (column PAY_0) was the largest determining factor

**2.3 Decision Tree Model**

In [None]:
# Configure a single decision tree on the same features and label.
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="default_payment",
)

# Train on the training split.
dt_model = dt.fit(train)

In [None]:
# Generate predictions for the test split with the fitted decision tree.
pred_dt = dt_model.transform(test)

# Evaluator that compares the 'prediction' column against the label and
# reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="default_payment",
)

accuracy = evaluator.evaluate(pred_dt)

# Report the misclassification rate (1 - accuracy).
print("Test Error = %g " % (1.0 - accuracy))

In [None]:
# Display the fitted decision tree's per-feature importance vector.
# Positions follow the VectorAssembler inputCols order (df.columns[1:-1]),
# so vector index i corresponds to df.columns[i + 1].
dt_model.featureImportances #author's observation from their run: index 5 (column PAY_0) was the largest determining factor once again