In [1]:
import findspark
findspark.init('directory_to_spark_installation')

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

data = spark.read.csv('bank.csv', 
                      header = True, 
                      inferSchema = True,
                      sep = ';')

In [2]:
data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [3]:
data.columns

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [4]:
data.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

##### All feature 
age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous, poutcome, y

##### Features being Considered
age, job, marital,  default, balance, housing, loan, duration, campaign, pdays, previous, poutcome, y

In [5]:
data.groupBy('poutcome').count().show()

+--------+-----+
|poutcome|count|
+--------+-----+
| success| 1511|
| unknown|36959|
|   other| 1840|
| failure| 4901|
+--------+-----+



'poutcome' seems to be important feature. However, the previous outcome was unknown for majority of the customers. So, It is better to drop this feature.

##### Final Features being Considered
age, job, marital,  default, balance, housing, loan, duration, campaign, pdays

In [6]:
final_data = data.select('age', 'job', 'marital', 'default', 'balance', 'housing', 'loan', 'duration', 'campaign', 'pdays', 'y')
final_data.describe().show()

+-------+------------------+-------+--------+-------+------------------+-------+-----+-----------------+-----------------+------------------+-----+
|summary|               age|    job| marital|default|           balance|housing| loan|         duration|         campaign|             pdays|    y|
+-------+------------------+-------+--------+-------+------------------+-------+-----+-----------------+-----------------+------------------+-----+
|  count|             45211|  45211|   45211|  45211|             45211|  45211|45211|            45211|            45211|             45211|45211|
|   mean| 40.93621021432837|   null|    null|   null|1362.2720576850766|   null| null|258.1630797814691|2.763840658246887| 40.19782796222158| null|
| stddev|10.618762040975405|   null|    null|   null|3044.7658291685257|   null| null|257.5278122651706|3.098020883279192|100.12874599059828| null|
|    min|                18|  admin|divorced|     no|             -8019|     no|   no|                0|        

In [7]:
final_data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- y: string (nullable = true)



There is no null values in the columns being considered. Good to go

In [8]:
# Converting string values to numerical column
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

JobIndexer = StringIndexer(inputCol = 'job', outputCol = 'JobIndex')
MaritalIndexer = StringIndexer(inputCol = 'marital', outputCol = 'MaritalIndex')
DefaultIndexer = StringIndexer(inputCol = 'default', outputCol = 'DefaultIndex')
HousingIndexer = StringIndexer(inputCol = 'housing', outputCol = 'HousingIndex')
LoanIndexer = StringIndexer(inputCol = 'loan', outputCol = 'LoanIndex')
LabelIndexer = StringIndexer(inputCol = 'y', outputCol = 'label')

# Using OneHotEncoder to avoid hierarchy in numerical value obtaied in above step
JobEncoder = OneHotEncoder(inputCol = 'JobIndex', outputCol = 'JobVec')
MaritalEncoder = OneHotEncoder(inputCol = 'MaritalIndex', outputCol = 'MaritalkVec')

# All the other columns have binary values(either Yes or No). So no need to hotencode

In [9]:
# Assemble everything together to be ("label","features") format
assembler = VectorAssembler(inputCols = ['age', 'JobVec', 'MaritalkVec', 'DefaultIndex', 'balance', 
                                    'HousingIndex', 'LoanIndex', 'duration', 'campaign', 'pdays'],
                           outputCol = 'features')

In [10]:
# Scaling the features
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures', withStd = True, withMean = False)

In [11]:
# Spliting the data into Training set and Test set
train_data, test_data =  final_data.randomSplit([0.8, 0.2])

In [12]:
# Defining Classifier Model
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier

lr = LogisticRegression()
rcf = RandomForestClassifier(maxDepth = 10, numTrees = 300)

In [13]:
# Set Up the Pipeline
from pyspark.ml import Pipeline

# *************************** For Logistic Regressor ***************************
# pipeline = Pipeline(stages=[JobIndexer, MaritalIndexer, DefaultIndexer, HousingIndexer, 
#                             LoanIndexer, LabelIndexer, JobEncoder, MaritalEncoder, assembler, scaler, lr])

# *************************** For Random Forest Classifier ***************************
pipeline = Pipeline(stages=[JobIndexer, MaritalIndexer, DefaultIndexer, HousingIndexer, 
                            LoanIndexer, LabelIndexer, JobEncoder, MaritalEncoder, assembler, scaler, rcf])

In [14]:
# Fitting the model
model = pipeline.fit(train_data)

# Geting results on Test set
results = model.transform(test_data)

In [15]:
results.select('label', 'prediction').show()

+-----+----------+
|label|prediction|
+-----+----------+
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       1.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  0.0|       0.0|
|  1.0|       0.0|
|  0.0|       1.0|
|  0.0|       1.0|
|  0.0|       0.0|
|  1.0|       1.0|
+-----+----------+
only showing top 20 rows



In [16]:
# Model Evaluation

# Model selection is still in RDD phase. So We will use rdd here instead of spark dataframe. 
# This will be update in future versions of Spark
from pyspark.mllib.evaluation import MulticlassMetrics

predictionAndLabels = results.select('prediction', 'label').rdd

In [17]:
metrics = MulticlassMetrics(predictionAndLabels)

# Confusion Matrix
print('Confusion Matrix')
print(metrics.confusionMatrix().toArray())
print('Accuracy: %.2f'%metrics.accuracy)
print('Precision: %.2f'%metrics.weightedPrecision)

Confusion Matrix
[[ 7657.   164.]
 [  845.   251.]]
Accuracy: 0.89
Precision: 0.86
