In [None]:
#Read the CSV as an RDD of text lines. cache() hands back the same RDD,
#so the lines stay in memory for the repeated passes that follow.
bankData = sc.textFile("data/bank.csv").cache()
bankData.count()

In [None]:
#Strip the header: any line identical to the file's first line is filtered out
firstLine = bankData.first()
dataLines = bankData.filter(lambda line: line != firstLine)
dataLines.count()

In [None]:
import math
from pyspark.mllib.linalg import Vectors

def transformToNumeric(inputStr):
    """Turn one semicolon-delimited record into a dense numeric vector.

    Double quotes are removed before the record is split on ";". Fields are
    read by position: 0 (age), 2 (marital status), 3 (education),
    4 (default), 5 (balance), 7 (loan) and 16 (outcome); every other field
    is ignored.

    Returns a dense vector laid out as
    [outcome, age, single, married, divorced, primary, secondary,
     tertiary, default, balance, loan].

    Raises IndexError when the record has fewer than 17 fields and
    ValueError when age or balance is not numeric.
    """
    fields = inputStr.replace("\"", "").split(";")

    age = float(fields[0])
    # Yes/no style fields: exactly "no" maps to 0.0, anything else to 1.0.
    outcome = float(fields[16] != "no")

    # One indicator per marital status; an unlisted status leaves all three at 0.0.
    marital = fields[2]
    single, married, divorced = (
        float(marital == status)
        for status in ("single", "married", "divorced")
    )

    # One indicator per education level; an unlisted level leaves all three at 0.0.
    schooling = fields[3]
    primary, secondary, tertiary = (
        float(schooling == level)
        for level in ("primary", "secondary", "tertiary")
    )

    default = float(fields[4] != "no")
    balance = float(fields[5])
    loan = float(fields[7] != "no")

    # Only the selected columns are carried forward.
    return Vectors.dense([
        outcome, age,
        single, married, divorced,
        primary, secondary, tertiary,
        default, balance, loan,
    ])

In [None]:
#Parse every data line into its numeric vector
bankVectors = dataLines.map(transformToNumeric)
#Preview the first 15 vectors. take(15) ships only those rows to the driver,
#whereas collect()[:15] materialises the whole RDD there before slicing.
bankVectors.take(15)

In [None]:
#Perform statistical Analysis
#colStats computes column-wise summary statistics over the RDD of vectors;
#columns follow the order built in transformToNumeric
#(outcome, age, single, married, divorced, primary, secondary, tertiary,
# default, balance, loan).
from pyspark.mllib.stat import Statistics
bankStats=Statistics.colStats(bankVectors)
#Per-column mean
bankStats.mean()

In [None]:
#Per-column variance from the summary computed by Statistics.colStats
bankStats.variance()

In [None]:
#Per-column minimum from the summary computed by Statistics.colStats
bankStats.min()

In [None]:
#Per-column maximum from the summary computed by Statistics.colStats
bankStats.max()

In [None]:
#Correlation matrix between all vector columns. No method argument is passed,
#so the library default applies (Pearson per the MLlib docs — confirm for the
#Spark version in use).
Statistics.corr(bankVectors)

In [None]:
#Transform to a Data Frame for input to Machine Learning
#Drop columns that are not required (low correlation)
#NOTE(review): no column is actually dropped in this cell — confirm where
#(or whether) the low-correlation columns are meant to be removed.

from pyspark.sql import SQLContext
#Build a SQLContext on top of the existing SparkContext `sc`
sqlContext = SQLContext(sc)

In [None]:
def transformToLabeledPoint(inStr):
    """Split a numeric record into a (label, features) pair.

    ``inStr`` is indexed positionally: element 0 is converted to a float
    label, and elements 1 through 10 are copied, in order, into a new dense
    feature vector. The result is a plain two-element tuple.
    """
    label = float(inStr[0])
    features = Vectors.dense([inStr[position] for position in range(1, 11)])
    return (label, features)

In [None]:
#Split each vector into a (label, features) pair
bankLp = bankVectors.map(transformToLabeledPoint)
#Preview a few pairs; a bare collect() would pull the entire RDD to the
#driver and dump all of it into the cell output.
bankLp.take(10)

In [None]:
#Build a DataFrame with columns "label" and "features" from the pairs,
#then print its first 10 rows
bankDF = sqlContext.createDataFrame(bankLp, schema=["label", "features"])
bankDF.select("label", "features").show(n=10)

In [None]:
#Reduce the feature vectors to 3 principal components
from pyspark.ml.feature import PCA
bankPCA = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
pcaModel = bankPCA.fit(bankDF)
#Keep only the label and the projected features
pcaResult = (
    pcaModel
    .transform(bankDF)
    .select("label", "pcaFeatures")
)
pcaResult.show(truncate=False)

In [None]:
#Indexing needed as pre-req for Decision Trees
from pyspark.ml.feature import StringIndexer
#Map each distinct label value to an index stored in the "indexed" column
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)
#Preview a few rows rather than collecting the whole DataFrame to the driver
td.take(10)

In [None]:
#Split into training and testing data (70% / 30%).
#A fixed seed makes the split, and everything derived from it, repeatable
#across Restart & Run All.
(trainingData, testData) = td.randomSplit([0.7, 0.3], seed=42)
trainingData.count()

In [None]:
#Number of rows that landed in the test split
testData.count()

In [None]:
#Preview a few test rows; collect() would bring the whole split to the driver
#and dump all of it into the cell output.
testData.take(10)

In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
#Fit a random forest that predicts the indexed label from the PCA features
rmClassifer = RandomForestClassifier(
    labelCol="indexed",
    featuresCol="pcaFeatures",
)
rmModel = rmClassifer.fit(trainingData)

In [None]:
#Predict on the test data
predictions = rmModel.transform(testData)
#Preview a few predictions next to the true labels; collect() would pull
#every scored row to the driver and dump all of it into the cell output.
predictions.select("prediction","indexed","label","pcaFeatures").take(10)

In [None]:
#Score the predictions against the indexed labels.
#NOTE(review): "precision" is a metric name from older Spark releases; newer
#releases expect "accuracy" instead — confirm against the cluster's version.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="indexed",
    metricName="precision",
)
evaluator.evaluate(predictions)

In [None]:
#Draw a confusion matrix
#labelList holds the distinct (indexed, label) pairs as a pandas DataFrame,
#i.e. the mapping from index back to the original label value. It is not
#used further in this cell.
labelList=predictions.select("indexed","label").distinct().toPandas()
#Row counts per (actual index, predicted index) pair, printed as a table
predictions.groupBy("indexed","prediction").count().show()