In [1]:
#Load the CSV file into a RDD
bankData = sc.textFile("data/bank.csv")
bankData.cache()
bankData.count()

542

In [2]:
#Remove the first line (contains headers)
firstLine=bankData.first()
dataLines = bankData.filter(lambda x: x != firstLine)
dataLines.count()

541

In [3]:
import math
from pyspark.mllib.linalg import Vectors

def transformToNumeric( inputStr) :
    
    attList=inputStr.replace("\"","").split(";")
    
    age=float(attList[0])
    #convert outcome to float    
    outcome = 0.0 if attList[16] == "no" else 1.0
    
    #create indicator variables for single/married    
    single= 1.0 if attList[2] == "single" else 0.0
    married = 1.0 if attList[2] == "married" else 0.0
    divorced = 1.0 if attList[2] == "divorced" else 0.0
    
    #create indicator variables for education
    primary = 1.0 if attList[3] == "primary" else 0.0
    secondary = 1.0 if attList[3] == "secondary" else 0.0
    tertiary = 1.0 if attList[3] == "tertiary" else 0.0
    
    #convert default to float
    default= 0.0 if attList[4] == "no" else 1.0
    #convert balance amount to float
    balance=float(attList[5])
    #convert loan to float
    loan= 0.0 if attList[7] == "no" else 1.0
    
    #Filter out columns not wanted at this stage
    values= Vectors.dense([ outcome, age, single, married, \
                divorced, primary, secondary, tertiary,\
                default, balance, loan \
                     ])
    return values

In [4]:
#Change to a Vector
bankVectors = dataLines.map(transformToNumeric)
bankVectors.collect()[:15]

[DenseVector([0.0, 30.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1787.0, 0.0]),
 DenseVector([1.0, 33.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 4789.0, 1.0]),
 DenseVector([1.0, 35.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1350.0, 0.0]),
 DenseVector([1.0, 30.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1476.0, 1.0]),
 DenseVector([0.0, 59.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]),
 DenseVector([1.0, 35.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 747.0, 0.0]),
 DenseVector([1.0, 36.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 307.0, 0.0]),
 DenseVector([0.0, 39.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 147.0, 0.0]),
 DenseVector([0.0, 41.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 221.0, 0.0]),
 DenseVector([1.0, 43.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, -88.0, 1.0]),
 DenseVector([0.0, 39.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 9374.0, 0.0]),
 DenseVector([0.0, 43.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 264.0, 0.0]),
 DenseVector([0.0, 36.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1109.0, 0.0]),
 DenseVector([1.0, 20.0, 1.0, 0.0,

In [5]:
#Perform statistical Analysis
from pyspark.mllib.stat import Statistics
bankStats=Statistics.colStats(bankVectors)
bankStats.mean()

array([  3.97412200e-01,   4.12698706e+01,   2.75415896e-01,
         6.15526802e-01,   1.09057301e-01,   1.53419593e-01,
         4.95378928e-01,   3.14232902e-01,   2.21811460e-02,
         1.44478189e+03,   1.62661738e-01])

In [6]:
bankStats.variance()

array([  2.39919217e-01,   1.11415924e+02,   1.99931540e-01,
         2.37091805e-01,   9.73437393e-02,   1.30122544e-01,
         2.50441569e-01,   2.15889642e-01,   2.17293079e-02,
         5.87224851e+06,   1.36455124e-01])

In [7]:
bankStats.min()

array([    0.,    19.,     0.,     0.,     0.,     0.,     0.,     0.,
           0., -1206.,     0.])

In [8]:
bankStats.max()

array([  1.00000000e+00,   7.80000000e+01,   1.00000000e+00,
         1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         1.00000000e+00,   1.00000000e+00,   1.00000000e+00,
         1.68730000e+04,   1.00000000e+00])

In [9]:
Statistics.corr(bankVectors)

array([[ 1.        , -0.18232104,  0.46323285, -0.37532413, -0.0781266 ,
        -0.12561549,  0.02639277,  0.08494841, -0.04536965,  0.03657487,
        -0.03042059],
       [-0.18232104,  1.        , -0.40971334,  0.24253548,  0.208662  ,
         0.18705376, -0.1049356 , -0.08566612,  0.02589999,  0.14746211,
        -0.0108042 ],
       [ 0.46323285, -0.40971334,  1.        , -0.78008253, -0.21570121,
        -0.10171839,  0.02638786,  0.06399288, -0.03666486,  0.00224317,
         0.01977069],
       [-0.37532413,  0.24253548, -0.78008253,  1.        , -0.44268309,
         0.06232365,  0.00789467, -0.0625317 , -0.06156785, -0.00746014,
         0.02917413],
       [-0.0781266 ,  0.208662  , -0.21570121, -0.44268309,  1.        ,
         0.04851091, -0.05013811,  0.00587947,  0.14863123,  0.00842788,
        -0.07386451],
       [-0.12561549,  0.18705376, -0.10171839,  0.06232365,  0.04851091,
         1.        , -0.42178621, -0.28816671,  0.04036243, -0.01358146,
         0.048

In [10]:
#Transform to a Data Frame for input to Machine Learing
#Drop columns that are not required (low correlation)

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [11]:
def transformToLabeledPoint(inStr) :
    lp = ( float(inStr[0]), \
    Vectors.dense([inStr[1],inStr[2],inStr[3], \
        inStr[4],inStr[5],inStr[6],inStr[7], \
        inStr[8],inStr[9],inStr[10]
        ]))
    return lp

In [12]:
bankLp = bankVectors.map(transformToLabeledPoint)
bankLp.collect()

[(0.0, DenseVector([30.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1787.0, 0.0])),
 (1.0, DenseVector([33.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 4789.0, 1.0])),
 (1.0, DenseVector([35.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1350.0, 0.0])),
 (1.0, DenseVector([30.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1476.0, 1.0])),
 (0.0, DenseVector([59.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0])),
 (1.0, DenseVector([35.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 747.0, 0.0])),
 (1.0, DenseVector([36.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 307.0, 0.0])),
 (0.0, DenseVector([39.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 147.0, 0.0])),
 (0.0, DenseVector([41.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 221.0, 0.0])),
 (1.0, DenseVector([43.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, -88.0, 1.0])),
 (0.0, DenseVector([39.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 9374.0, 0.0])),
 (0.0, DenseVector([43.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 264.0, 0.0])),
 (0.0, DenseVector([36.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1109.0, 0.0])),
 (1.0, D

In [13]:
bankDF = sqlContext.createDataFrame(bankLp,["label", "features"])
bankDF.select("label","features").show(10)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[30.0,0.0,1.0,0.0...|
|  1.0|[33.0,0.0,1.0,0.0...|
|  1.0|[35.0,1.0,0.0,0.0...|
|  1.0|[30.0,0.0,1.0,0.0...|
|  0.0|[59.0,0.0,1.0,0.0...|
|  1.0|[35.0,1.0,0.0,0.0...|
|  1.0|[36.0,0.0,1.0,0.0...|
|  0.0|[39.0,0.0,1.0,0.0...|
|  0.0|[41.0,0.0,1.0,0.0...|
|  1.0|[43.0,0.0,1.0,0.0...|
+-----+--------------------+
only showing top 10 rows



In [14]:
#Perform PCA
from pyspark.ml.feature import PCA
bankPCA = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
pcaModel = bankPCA.fit(bankDF)
pcaResult = pcaModel.transform(bankDF).select("label","pcaFeatures")
pcaResult.show(truncate=False)

+-----+--------------------------------------------------------------+
|label|pcaFeatures                                                   |
+-----+--------------------------------------------------------------+
|0.0  |[-1787.0188971973807,28.862096837754688,-0.06459982604832634] |
|1.0  |[-4789.02017713849,29.922562636340345,-0.9830243513099471]    |
|1.0  |[-1350.0222131632618,34.10110809796642,0.895142716828155]     |
|1.0  |[-1476.0189517184554,29.051333993596206,0.3952723868025552]   |
|0.0  |[-0.037889185366442445,58.989718200017684,-0.7290792383674507]|
|1.0  |[-747.0223377634921,34.48829198181747,0.9045654956949845]     |
|1.0  |[-307.0230691022592,35.79985053965512,0.5170631523787516]     |
|0.0  |[-147.02501216176339,38.90107856650324,-0.8069627548805194]   |
|0.0  |[-221.02629853487863,40.853633675694866,0.5373036365803046]   |
|1.0  |[87.9723868768871,43.06265944115107,-0.06701642871177398]     |
|0.0  |[-9374.023105550941,32.976458837989746,-0.9511484606918547]   |
|0.0  

In [15]:
#Indexing needed as pre-req for Decision Trees
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
td = si_model.transform(pcaResult)
td.collect()

[Row(label=0.0, pcaFeatures=DenseVector([-1787.0189, 28.8621, -0.0646]), indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-4789.0202, 29.9226, -0.983]), indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1350.0222, 34.1011, 0.8951]), indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-1476.019, 29.0513, 0.3953]), indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-0.0379, 58.9897, -0.7291]), indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([-747.0223, 34.4883, 0.9046]), indexed=1.0),
 Row(label=1.0, pcaFeatures=DenseVector([-307.0231, 35.7999, 0.5171]), indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-147.025, 38.9011, -0.807]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-221.0263, 40.8536, 0.5373]), indexed=0.0),
 Row(label=1.0, pcaFeatures=DenseVector([87.9724, 43.0627, -0.067]), indexed=1.0),
 Row(label=0.0, pcaFeatures=DenseVector([-9374.0231, 32.9765, -0.9511]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-264.0276, 42.8248, -0

In [16]:
#Split into training and testing data
(trainingData, testData) = td.randomSplit([0.7, 0.3])
trainingData.count()

370

In [17]:
testData.count()

171

In [18]:
testData.collect()

[Row(label=0.0, pcaFeatures=DenseVector([-14093.0337, 47.9412, -0.9569]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-9374.0231, 32.9765, -0.9511]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-8104.0336, 49.7873, -0.8708]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-7190.0255, 37.3733, 0.7344]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-6313.0372, 55.9407, -0.1054]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-5181.0272, 40.6677, -0.9476]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-3571.025, 37.7029, 0.4812]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-2693.02, 30.2683, -0.8732]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-1981.0227, 34.7243, -0.847]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-1972.0214, 32.7313, 0.4835]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-1877.0356, 54.785, 0.2511]), indexed=0.0),
 Row(label=0.0, pcaFeatures=DenseVector([-1641.0215

In [19]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
#Create the model
rmClassifer = RandomForestClassifier(labelCol="indexed", \
                featuresCol="pcaFeatures")
rmModel = rmClassifer.fit(trainingData)

In [21]:
#Predict on the test data
predictions = rmModel.transform(testData)
predictions.select("prediction","indexed","label","pcaFeatures").collect()

[Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-14093.0337, 47.9412, -0.9569])),
 Row(prediction=1.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-9374.0231, 32.9765, -0.9511])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-8104.0336, 49.7873, -0.8708])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-7190.0255, 37.3733, 0.7344])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-6313.0372, 55.9407, -0.1054])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-5181.0272, 40.6677, -0.9476])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-3571.025, 37.7029, 0.4812])),
 Row(prediction=1.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-2693.02, 30.2683, -0.8732])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1981.0227, 34.7243, -0.847])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1972.02

In [22]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="precision")
evaluator.evaluate(predictions) 

0.6666666666666666

In [23]:
#Draw a confusion matrix
labelList=predictions.select("indexed","label").distinct().toPandas()
predictions.groupBy("indexed","prediction").count().show()

+-------+----------+-----+
|indexed|prediction|count|
+-------+----------+-----+
|    1.0|       1.0|   31|
|    0.0|       0.0|   83|
|    0.0|       1.0|   19|
|    1.0|       0.0|   38|
+-------+----------+-----+

