# Random Forest Example

## 1. Import spark modules

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [2]:
spark = SparkSession\
        .builder\
        .appName("PythonPi")\
        .getOrCreate()

**Setup spark configuration and create a spark context**

In [3]:
sc = spark.sparkContext

## 2. Load the data

In [4]:
bankData = sc.textFile("./input/bank.csv")

In [5]:
bankData.cache()

./input/bank.csv MapPartitionsRDD[1] at textFile at <unknown>:0

In [6]:
bankData.count()

542

## 3. Cleanup Data

### Remove the header line

In [7]:
firstLine = bankData.first()

In [8]:
dataLines = bankData.filter(lambda x: x != firstLine)

In [9]:
dataLines.count()

541

**A map function to convert string to numberic**

In [10]:
import math
from pyspark.ml.linalg import Vectors

In [11]:
def transformToNumeric(inputStr):
    
    '''a function to convert input string to numbers'''
    attList = inputStr.replace("\"","").split(";")
    
    # convert the age to float
    age = float(attList[0])
    
    # convert the outcome to float
    outcome = 0.0 if attList[16] == "no" else 1.0
    
    # create indicator variable for single/married
    single = 1.0 if attList[2] == "single" else 0.0
    married = 1.0 if attList[2] == "married" else 0.0
    divorced = 1.0 if attList[2] == "divorced" else 0.0
    
    # create indicator variables for education
    primary = 1.0 if attList[3] == "primary" else 0.0
    secondary = 1.0 if attList[3] == "secondary" else 0.0
    tertiary = 1.0 if attList[3] == "tertiary" else 0.0
    
    # convert default to float
    default = 0.0 if attList[4] == "no" else 1.0
    # convert balance amount to float
    balance = float(attList[5])
    # convert loan to float
    loan = 0.0 if attList[7] == "no" else 1.0
    
    # create a Row using the attributes
    values = Row(OUTCOME = outcome,
                AGE = age,
                SINGLE = single,
                MARRIED = married,
                DIVORCED = divorced,
                PRIMARY = primary,
                SECONDARY = secondary,
                TERTIARY = tertiary,
                DEFAULT = default,
                BALANCE = balance,
                LOAN = loan)
    
    return values

**Transform the data using the map function**

In [12]:
bankRows = dataLines.map(transformToNumeric)

**Create a dataframe**

In [13]:
bankData = spark.createDataFrame(bankRows)

In [14]:
bankData.show()

+----+-------+-------+--------+----+-------+-------+-------+---------+------+--------+
| AGE|BALANCE|DEFAULT|DIVORCED|LOAN|MARRIED|OUTCOME|PRIMARY|SECONDARY|SINGLE|TERTIARY|
+----+-------+-------+--------+----+-------+-------+-------+---------+------+--------+
|30.0| 1787.0|    0.0|     0.0| 0.0|    1.0|    0.0|    1.0|      0.0|   0.0|     0.0|
|33.0| 4789.0|    0.0|     0.0| 1.0|    1.0|    1.0|    0.0|      1.0|   0.0|     0.0|
|35.0| 1350.0|    0.0|     0.0| 0.0|    0.0|    1.0|    0.0|      0.0|   1.0|     1.0|
|30.0| 1476.0|    0.0|     0.0| 1.0|    1.0|    1.0|    0.0|      0.0|   0.0|     1.0|
|59.0|    0.0|    0.0|     0.0| 0.0|    1.0|    0.0|    0.0|      1.0|   0.0|     0.0|
|35.0|  747.0|    0.0|     0.0| 0.0|    0.0|    1.0|    0.0|      0.0|   1.0|     1.0|
|36.0|  307.0|    0.0|     0.0| 0.0|    1.0|    1.0|    0.0|      0.0|   0.0|     1.0|
|39.0|  147.0|    0.0|     0.0| 0.0|    1.0|    0.0|    0.0|      1.0|   0.0|     0.0|
|41.0|  221.0|    0.0|     0.0| 0.0|    1.0

## 4. Perform Data Analytics

**Describe the data**

In [15]:
bankData.describe().show()

+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|summary|               AGE|           BALANCE|             DEFAULT|           DIVORCED|               LOAN|           MARRIED|            OUTCOME|           PRIMARY|         SECONDARY|            SINGLE|          TERTIARY|
+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+
|  count|               541|               541|                 541|                541|                541|               541|                541|               541|               541|               541|               541|
|   mean| 41.26987060998152|1444.7818853974122|0.022181146025878003|0.10905730129390019|0.16266173752310

**Correlation between the target variables and the feature variables**

In [16]:
# iterate through each column in the dataframe
for i in bankData.columns:
    # if data is not an instance of string
    if not(isinstance(bankData.select(i).take(1)[0][0], str)):
        print("Correlation to OUTCOME for", i, bankData.stat.corr("OUTCOME", i))

Correlation to OUTCOME for AGE -0.1823210432736525
Correlation to OUTCOME for BALANCE 0.036574866119976804
Correlation to OUTCOME for DEFAULT -0.04536965206737378
Correlation to OUTCOME for DIVORCED -0.07812659940926987
Correlation to OUTCOME for LOAN -0.030420586112717318
Correlation to OUTCOME for MARRIED -0.3753241299133561
Correlation to OUTCOME for OUTCOME 1.0
Correlation to OUTCOME for PRIMARY -0.12561548832677982
Correlation to OUTCOME for SECONDARY 0.026392774894072973
Correlation to OUTCOME for SINGLE 0.46323284934360515
Correlation to OUTCOME for TERTIARY 0.08494840766635618


## 5. Prepare data for machine learning

**A function to transform data to labelled points**

In [17]:
def transformToLabeledPoint(row):
    
    '''a function to transform data to labelled points'''
    lp = (row["OUTCOME"],
          Vectors.dense([
              row["AGE"],
              row["BALANCE"],
              row["DEFAULT"],
              row["DIVORCED"],
              row["LOAN"],
              row["MARRIED"],
              row["PRIMARY"],
              row["SECONDARY"],
              row["SINGLE"],
              row["TERTIARY"]]))
    
    return lp

**Convert the data to labelled points**

In [18]:
bankLp = bankData.rdd.map(transformToLabeledPoint)

**Create a dataframe using the rdd**

In [19]:
bankDF = spark.createDataFrame(bankLp, ["label", "features"])

In [20]:
bankDF.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[30.0,1787.0,0.0,...|
|  1.0|[33.0,4789.0,0.0,...|
|  1.0|[35.0,1350.0,0.0,...|
|  1.0|[30.0,1476.0,0.0,...|
|  0.0|[59.0,0.0,0.0,0.0...|
|  1.0|[35.0,747.0,0.0,0...|
|  1.0|[36.0,307.0,0.0,0...|
|  0.0|[39.0,147.0,0.0,0...|
|  0.0|[41.0,221.0,0.0,0...|
|  1.0|[43.0,-88.0,0.0,0...|
|  0.0|[39.0,9374.0,0.0,...|
|  0.0|[43.0,264.0,0.0,0...|
|  0.0|[36.0,1109.0,0.0,...|
|  1.0|[20.0,502.0,0.0,0...|
|  1.0|[31.0,360.0,0.0,0...|
|  0.0|[40.0,194.0,0.0,0...|
|  0.0|[56.0,4073.0,0.0,...|
|  1.0|[37.0,2317.0,0.0,...|
|  0.0|[25.0,-221.0,0.0,...|
|  1.0|[31.0,132.0,0.0,0...|
+-----+--------------------+
only showing top 20 rows



## 6. Perform PCA

In [21]:
from pyspark.ml.feature import PCA

**Create and fit pca model**

In [22]:
bankPCA = PCA(k =3, inputCol = "features", outputCol="pcaFeatures")
pcaModel = bankPCA.fit(bankDF)

**Store and show the pca results**

In [23]:
pcaResult = pcaModel.transform(bankDF).select("label", "pcaFeatures")

In [24]:
pcaResult.show(truncate=False)

+-----+------------------------------------------------------------+
|label|pcaFeatures                                                 |
+-----+------------------------------------------------------------+
|0.0  |[-1787.018897197381,28.86209683775489,-0.06459982604876296] |
|1.0  |[-4789.020177138492,29.922562636340885,-0.9830243513096447] |
|1.0  |[-1350.022213163262,34.10110809796657,0.8951427168301616]   |
|1.0  |[-1476.0189517184556,29.051333993596376,0.3952723868021922] |
|0.0  |[-0.037889185366455545,58.9897182000177,-0.729079238366194] |
|1.0  |[-747.0223377634923,34.488291981817554,0.9045654956970024]  |
|1.0  |[-307.0230691022592,35.799850539655154,0.5170631523785959]  |
|0.0  |[-147.0250121617634,38.90107856650326,-0.8069627548799431]  |
|0.0  |[-221.0262985348787,40.853633675694894,0.53730363658032]    |
|1.0  |[87.9723868768871,43.06265944115107,-0.06701642871171626]   |
|0.0  |[-9374.023105550941,32.9764588379908,-0.9511484606914545]   |
|0.0  |[-264.02755731528384,42.824

## 7. Perform Random Forest Classification

In [25]:
from pyspark.ml.feature import StringIndexer

**Convert the label into numeric values**

In [26]:
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")

In [27]:
si_model = stringIndexer.fit(pcaResult)

In [28]:
td = si_model.transform(pcaResult)

In [29]:
td.show()

+-----+--------------------+-------+
|label|         pcaFeatures|indexed|
+-----+--------------------+-------+
|  0.0|[-1787.0188971973...|    0.0|
|  1.0|[-4789.0201771384...|    1.0|
|  1.0|[-1350.0222131632...|    1.0|
|  1.0|[-1476.0189517184...|    1.0|
|  0.0|[-0.0378891853664...|    0.0|
|  1.0|[-747.02233776349...|    1.0|
|  1.0|[-307.02306910225...|    1.0|
|  0.0|[-147.02501216176...|    0.0|
|  0.0|[-221.02629853487...|    0.0|
|  1.0|[87.9723868768871...|    1.0|
|  0.0|[-9374.0231055509...|    0.0|
|  0.0|[-264.02755731528...|    0.0|
|  0.0|[-1109.0229033818...|    0.0|
|  1.0|[-502.01273640329...|    1.0|
|  1.0|[-360.01980765510...|    1.0|
|  0.0|[-194.02563994730...|    0.0|
|  0.0|[-4073.0351205683...|    0.0|
|  1.0|[-2317.0232980131...|    1.0|
|  0.0|[220.983897631329...|    0.0|
|  1.0|[-132.01987660190...|    1.0|
+-----+--------------------+-------+
only showing top 20 rows



**Import classifier**

In [31]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

**Train test split**

In [33]:
(trainingData, testData) = td.randomSplit([0.75, 0.25])

In [34]:
trainingData.count()

378

In [35]:
testData.count()

163

**Create and fit model**

In [32]:
rmClassifier = RandomForestClassifier(labelCol="indexed", featuresCol="pcaFeatures")

In [36]:
rmModel = rmClassifier.fit(trainingData)

**Compute predictions using test set**

In [37]:
predictions = rmModel.transform(testData)

In [38]:
predictions.select("prediction", "indexed", "label", "pcaFeatures").collect()

[Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-5996.0302, 45.1426, -0.8606])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-5883.0251, 37.2181, 0.4488])),
 Row(prediction=1.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-4030.0229, 34.4084, -0.8752])),
 Row(prediction=1.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-3096.0186, 27.9808, -0.4889])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-2693.02, 30.2683, -0.8732])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1877.0356, 54.785, 0.2511])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1811.0266, 40.8265, -0.5194])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1808.0189, 28.8371, -0.8669])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1516.0253, 39.0221, -0.9054])),
 Row(prediction=0.0, indexed=0.0, label=0.0, pcaFeatures=DenseVector([-1269.02

**Evaluate accuracy**

In [39]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="indexed", metricName="accuracy")

In [40]:
evaluator.evaluate(predictions)

0.6993865030674846

**Confusion Matrix**

In [41]:
predictions.groupby("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   29|
|  0.0|       1.0|   22|
|  1.0|       0.0|   27|
|  0.0|       0.0|   85|
+-----+----------+-----+

