In [1]:
! wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip'
! unzip 'bank.zip'

--2021-05-03 09:59:22--  https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 579043 (565K) [application/x-httpd-php]
Saving to: ‘bank.zip’


2021-05-03 09:59:25 (441 KB/s) - ‘bank.zip’ saved [579043/579043]

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                


In [142]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Create (or reuse) the Spark session for this notebook, running with master 'local[*]'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('term-deposit-prediction')
    .getOrCreate()
)

# Read the semicolon-separated CSV with a header row, letting Spark infer column types.
csv_options = {'sep': ';', 'header': True, 'inferSchema': True}
df = spark.read.csv('bank.csv', **csv_options)

In [143]:
# Preview the first eight columns for the top four rows, without truncating values.
preview_columns = df.columns[:8]
df.select(preview_columns).show(4, truncate=False)

+---+----------+-------+---------+-------+-------+-------+----+
|age|job       |marital|education|default|balance|housing|loan|
+---+----------+-------+---------+-------+-------+-------+----+
|30 |unemployed|married|primary  |no     |1787   |no     |no  |
|33 |services  |married|secondary|no     |4789   |yes    |yes |
|35 |management|single |tertiary |no     |1350   |yes    |no  |
|30 |management|married|tertiary |no     |1476   |yes    |yes |
+---+----------+-------+---------+-------+-------+-------+----+
only showing top 4 rows



In [144]:
# Rename the target column "y" to "deposit", then show the number of rows in each class.
df = df.withColumnRenamed("y", "deposit")
class_counts = df.groupby('deposit').count()
class_counts.show()

+-------+-----+
|deposit|count|
+-------+-----+
|     no| 4000|
|    yes|  521|
+-------+-----+



In [145]:
# Summary statistics (count, mean, stddev, min, max) for the first eight columns.
leading_columns = df.columns[:8]
df.select(leading_columns).describe().show()

+-------+------------------+-------+--------+---------+-------+------------------+-------+----+
|summary|               age|    job| marital|education|default|           balance|housing|loan|
+-------+------------------+-------+--------+---------+-------+------------------+-------+----+
|  count|              4521|   4521|    4521|     4521|   4521|              4521|   4521|4521|
|   mean| 41.17009511170095|   null|    null|     null|   null|1422.6578190665782|   null|null|
| stddev|10.576210958711263|   null|    null|     null|   null|3009.6381424673395|   null|null|
|    min|                19| admin.|divorced|  primary|     no|             -3313|     no|  no|
|    max|                87|unknown|  single|  unknown|    yes|             71188|    yes| yes|
+-------+------------------+-------+--------+---------+-------+------------------+-------+----+



In [146]:
# Gather the names of all integer-typed columns, then display their summary
# statistics as a transposed pandas frame (one row per feature).
numeric_features = [name for name, dtype in df.dtypes if dtype == 'int']
numeric_summary = df.select(numeric_features).describe().toPandas()
numeric_summary.transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
age,4521,41.17009511170095,10.576210958711263,19,87
balance,4521,1422.6578190665782,3009.6381424673395,-3313,71188
day,4521,15.915284229152842,8.247667327229934,1,31
duration,4521,263.96129174961294,259.85663262468216,4,3025
campaign,4521,2.793629727936297,3.1098066601885823,1,50
pdays,4521,39.766644547666445,100.12112444301656,-1,871
previous,4521,0.5425790754257908,1.6935623506071211,0,25


In [147]:
# Keep every column except "day" and "month", which are considered not relevant here.
kept_columns = [
    'age', 'job', 'marital', 'education', 'default',
    'balance', 'housing', 'loan', 'contact', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome', 'deposit',
]
df = df.select(*kept_columns)

In [148]:
# Display the (column name, Spark type) pairs that remain after the column selection.
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('deposit', 'string')]

In [149]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Build the feature-engineering stages of the ML pipeline:
#   * every string column except the target is indexed and one-hot encoded,
#   * the target column "deposit" is indexed into "label",
#   * all encoded and numeric inputs are assembled into one "features" vector.
stages = []

# Use the logical `and` here: both operands are plain Python bools, so the
# bitwise `&` only worked by accident and did not short-circuit.
numericCols = [field for (field, dataType) in df.dtypes
               if dataType == "int" and field != "deposit"]
categoricalColumns = [field for (field, dataType) in df.dtypes
                      if dataType == "string"]
# "deposit" is the target, not an input feature, so it is not one-hot encoded.
categoricalColumns.remove("deposit")

for categoricalCol in categoricalColumns:
    # Index the category strings, then turn the index into a one-hot vector.
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + 'Index')
    encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()],
                            outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

# Index the target column into the numeric "label" column used by the models.
label_stringIdx = StringIndexer(inputCol='deposit', outputCol='label')
stages += [label_stringIdx]

# Encoded categorical vectors come first, followed by the raw numeric columns.
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [150]:
# Randomly split the data 70/30 into training and test sets (seeded with 2018)
# and report how many rows landed in each.
train, test = df.randomSplit([0.7, 0.3], seed=2018)
train_rows = train.count()
test_rows = test.count()
print("Training Dataset Count: " + str(train_rows))
print("Test Dataset Count: " + str(test_rows))

Training Dataset Count: 3204
Test Dataset Count: 1317


In [151]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Logistic regression fitted on the assembled "features" column against "label".
lr = LogisticRegression(labelCol="label", featuresCol="features")

# Rebuild the stage list from the feature stages (everything up to and
# including the assembler) plus this classifier, instead of appending in
# place. Appending made the cell non-idempotent: re-running it stacked a second
# classifier onto the shared `stages` list and broke the pipeline fit.
stages = stages[:stages.index(assembler) + 1] + [lr]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train)

# Score the held-out test split and preview a few predictions.
pred = pipelineModel.transform(test)
pred.select("features", "label", "prediction").show(5)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(30,[10,12,15,16,...|  1.0|       0.0|
|(30,[4,12,13,16,1...|  0.0|       0.0|
|(30,[10,12,13,16,...|  1.0|       1.0|
|(30,[10,12,13,16,...|  0.0|       0.0|
|(30,[10,12,13,16,...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 5 rows



In [152]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Score the test-set predictions by area under the ROC curve.
# NOTE: despite its name, `accuracy` holds the AUC value, not classification accuracy.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="label")
accuracy = evaluator.evaluate(pred)
print("Area Under ROC = %g " % accuracy)

Area Under ROC = 0.862333 


## Random Forest Model

In [153]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
# Random forest fitted on the same "features" / "label" columns.
rf = RandomForestClassifier(featuresCol='features', labelCol='label')

# Keep only the feature stages (everything up to and including the assembler)
# and attach the random forest. The previous `stages.remove(stages[-1])`
# blindly dropped whatever happened to be last: it removed the assembler if no
# classifier had been appended yet, and left stale classifiers behind if an
# earlier cell had been run more than once.
stages = stages[:stages.index(assembler) + 1] + [rf]

pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(train)

# Score the held-out test split and preview a few predictions.
pred = pipelineModel.transform(test)
pred.select("features", "label", "prediction").show(5)

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(30,[10,12,15,16,...|  1.0|       0.0|
|(30,[4,12,13,16,1...|  0.0|       0.0|
|(30,[10,12,13,16,...|  1.0|       0.0|
|(30,[10,12,13,16,...|  0.0|       0.0|
|(30,[10,12,13,16,...|  0.0|       0.0|
+--------------------+-----+----------+
only showing top 5 rows



In [154]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve for the current predictions in `pred`.
# NOTE: `accuracy` is a historical name; the value stored is the AUC.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC", labelCol="label")
accuracy = evaluator.evaluate(pred)
print("Area Under ROC = %g " % accuracy)

Area Under ROC = 0.86742 


In [155]:
from pyspark.ml.tuning import ParamGridBuilder
# Hyper-parameter grid for the random forest: 3 tree depths x 2 forest sizes.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(rf.maxDepth, [2, 4, 6])
grid_builder = grid_builder.addGrid(rf.numTrees, [10, 100])
paramGrid = grid_builder.build()

In [156]:
from pyspark.ml.tuning import CrossValidator
# 3-fold cross-validation of the current pipeline over every combination in
# paramGrid, scored with `evaluator`; the seed is fixed at 42.
cv_settings = dict(
    estimator=pipeline,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    numFolds=3,
    seed=42,
)
cv = CrossValidator(**cv_settings)
cvModel = cv.fit(train)

In [157]:
# Pair each evaluated parameter map with its average cross-validation metric.
param_maps = cvModel.getEstimatorParamMaps()
mean_scores = cvModel.avgMetrics
list(zip(param_maps, mean_scores))

[({Param(parent='RandomForestClassifier_b77223118c4d', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
   Param(parent='RandomForestClassifier_b77223118c4d', name='numTrees', doc='Number of trees to train (>= 1).'): 10},
  0.6456131717826524),
 ({Param(parent='RandomForestClassifier_b77223118c4d', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 2,
   Param(parent='RandomForestClassifier_b77223118c4d', name='numTrees', doc='Number of trees to train (>= 1).'): 100},
  0.8390147890914366),
 ({Param(parent='RandomForestClassifier_b77223118c4d', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 4,
   Param(parent='RandomForestClassifier_b77223118c4d', name='numTrees', doc='Number of trees to train (>= 1).'): 10},
  0.8340064