# Customer Churn Prediction + Clustering

Alternative #3 tried: 12 Features, MinMax scaling, no ClusterChurn, Cross validation with PySpark

## 1. Get ready

__Imports__

In [1]:
# Useful imports - PySpark
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

from pyspark.sql import Row

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import ChiSqSelector
from pyspark.mllib.evaluation import BinaryClassificationMetrics

import numpy as np

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
155,,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

__Load the data__

Let's use a table which has been pre-populated in Db2 local. It is called SAMPLES.TRAINING. 

In [2]:
# Reuse the active Spark session if there is one, otherwise create it.
sparkSession = SparkSession.builder.getOrCreate()

# Load the SAMPLES.TRAINING table through the
# "com.ibm.idax.spark.idaxsource" data source.
df = (
    sparkSession.read
    .format("com.ibm.idax.spark.idaxsource")
    .options(dbtable="SAMPLES.TRAINING")
    .load()
)

# Preview the first five rows.
df.show(n=5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+----+-----+----------+--------+---------+----------+--------+---------+----------+----------+-----------+------------+---------+----------+-----------+---------+
|CHURN|AREA|VMAIL|VMAIL_MSGS|DAY_MINS|DAY_CALLS|DAY_CHARGE|EVE_MINS|EVE_CALLS|EVE_CHARGE|NIGHT_MINS|NIGHT_CALLS|NIGHT_CHARGE|INTL_MINS|INTL_CALLS|INTL_CHARGE|SVC_CALLS|
+-----+----+-----+----------+--------+---------+----------+--------+---------+----------+----------+-----------+------------+---------+----------+-----------+---------+
|    0| 415|    1|         0|   246.5|      108|     41.91|   216.3|       89|     18.39|     179.6|         99|        8.08|     12.7|         3|       3.43|        2|
|    1| 408|    1|         0|   298.1|      112|     50.68|   201.3|      100|     17.11|     214.7|         88|        9.66|      9.7|         4|       2.62|        2|
|    0| 510|    1|         0|   119.3|       82|     20.28|   185.1|      111|     15.73|     157.0|         74|        7.07|     10.9|         4|       2.

How many records do we have? How many features?

In [3]:
# count() runs a Spark job; the column list comes from the schema.
# Note: df.columns includes the CHURN target column as well as the predictors.
record_total = str(df.count())
column_total = str(len(df.columns))

print('Number of records: ' + record_total)
print('Number of features: ' + column_total)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Number of records: 3333
Number of features: 17

__Split the data__

We split the data into two distinct sets: one for cross-validation (training and validation) and one for testing. We use the proportions 85% and 15% so that we keep a relatively high number of examples for training; the validation folds are taken from the 85% by the cross-validator. Of course these proportions are subjective, you can change them if you want. We have defined a seed so that results can be reproduced.

In [4]:
# Rename the CHURN target to "label", then randomly split the rows with
# weights 85% / 15%: cross_train feeds cross-validation, test is held out.
# The fixed seed keeps the split repeatable.
labeled_df = df.withColumnRenamed("CHURN", "label")
cross_train, test = labeled_df.randomSplit([0.85, 0.15], seed=1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## 2.  Cross-validation

### a. Evaluation criterion : Area under Precision-Recall Curve

__Define the pipeline__

In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import MinMaxScaler

# Prepare the labeled training data.
# input : cross_train

# Configure an ML pipeline, which consists of three stages:
# assembler_12, scaler (MinMaxScaler) and lr.

# Stage 1: gather the 12 selected columns into a single vector column.
assembler_12 = VectorAssembler(
    inputCols=["SVC_CALLS", "DAY_MINS", "DAY_CHARGE", "VMAIL_MSGS", "VMAIL",
               "INTL_CALLS", "INTL_CHARGE", "INTL_MINS", "EVE_CHARGE", "EVE_MINS",
               "NIGHT_MINS", "NIGHT_CHARGE"],
    outputCol="features12")

# Stage 2: min-max scaling. The scaler is a pipeline stage, so it is fitted as
# part of pipeline.fit() on whatever data the pipeline is trained on; there is
# no need to fit it separately beforehand.
scaler = MinMaxScaler(inputCol="features12", outputCol="features")

# Stage 3: logistic regression on the scaled "features" column.
lr = LogisticRegression(maxIter=100)
pipeline = Pipeline(stages=[assembler_12, scaler, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# The grid holds 4 x 4 = 16 (regParam, elasticNetParam) combinations.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.001, 0.002, 0.005, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.001, 0.002, 0.005, 0.01]) \
    .build()

# Model selection uses the area under the precision-recall curve.
# seed pins the random fold assignment so that a re-run selects the same model
# (same seed value as the train/test split).
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator().setMetricName("areaUnderPR"),
                          numFolds=8,
                          seed=1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Run cross-validation on cross_train over every combination in paramGrid and
# choose the best set of parameters (selection metric: areaUnderPR).
cvModel = crossval.fit(cross_train)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

__Evaluation on test set__

In [10]:
# Make predictions on the test set. cvModel uses the best model found.
# No separate assembling or scaling is needed here: the cross-validated
# estimator is the whole pipeline (assembler_12, scaler, lr).
pred_test = cvModel.transform(test)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Metric name a BinaryClassificationEvaluator reports when none is set explicitly.
BinaryClassificationEvaluator().getMetricName()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'areaUnderROC'

In [11]:
# Area under the ROC curve on the held-out test predictions.
binEval_AUROC = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
print("Area under ROC: %.3f" % binEval_AUROC.evaluate(pred_test))

# Area under the precision-recall curve on the same predictions.
binEval_AUPRC = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderPR",
)
print("Area under PR curve: %.3f" % binEval_AUPRC.evaluate(pred_test))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Area under ROC: 0.759
Area under PR curve: 0.439

In [12]:
# Metrics (1/2): scores computed from the "prediction" column of pred_test.

# Accuracy
evaluator = (MulticlassClassificationEvaluator()
             .setLabelCol("label")
             .setPredictionCol("prediction")
             .setMetricName("accuracy"))
accuracy = evaluator.evaluate(pred_test)
print("Accuracy = %.3f" % accuracy)

# F1 score
evaluatorf1 = (MulticlassClassificationEvaluator()
               .setLabelCol("label")
               .setPredictionCol("prediction")
               .setMetricName("f1"))
f1 = evaluatorf1.evaluate(pred_test)
print("f1 = %.3f" % f1)

# Weighted precision
evaluatorwp = (MulticlassClassificationEvaluator()
               .setLabelCol("label")
               .setPredictionCol("prediction")
               .setMetricName("weightedPrecision"))
wp = evaluatorwp.evaluate(pred_test)
print("weightedPrecision = %.3f" % wp)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy = 0.878
f1 = 0.835
weightedPrecision = 0.893

* Confusion matrix 

<img src="confusionMatrix.png" width=800/>

Sensitivity = TP / (TP + FN), also called Recall

Specificity = TN / (TN + FP)

Precision = TP / (TP + FP)

Negative Predictive Value = TN / (TN + FN)

F1-score: 2 x Precision x Recall / (Precision + Recall)

Recall = 1 - FNR. 

In [14]:
# Metrics (2/2)

def confusion_matrix(pred_DF):
    """
    Count the four cells of a binary confusion matrix.

    Input :
    pred_DF : Spark DataFrame obtained after a model.transform() transformation.
              It must contain a "label" and a "prediction" column holding 0/1 values.
    Output :
    (tn, fp, fn, tp) tuple of integers
    """

    # Collect only the two needed columns, and only once: every toPandas()
    # call runs the Spark job again and ships all selected columns to the driver.
    collected = pred_DF.select("label", "prediction").toPandas()
    label = collected["label"]
    prediction = collected["prediction"]

    actual_pos = label == 1
    actual_neg = label == 0
    predicted_pos = prediction == 1
    predicted_neg = prediction == 0

    # tp : label = 1 and prediction = 1
    tp = int((actual_pos & predicted_pos).sum())

    # fn : label = 1 but prediction = 0 (actual positives = tp + fn)
    fn = int((actual_pos & predicted_neg).sum())

    # tn : label = 0 and prediction = 0
    tn = int((actual_neg & predicted_neg).sum())

    # fp : label = 0 but prediction = 1 (actual negatives = tn + fp)
    fp = int((actual_neg & predicted_pos).sum())

    return (tn, fp, fn, tp)


def FNR(fn, tp):
    """
    False negative rate: fn / (fn + tp), the share of actual positives that were missed.

    Input :
    fn : number of false negatives
    tp : number of true positives
    Output :
    the rate as a float, or NaN when there is no actual positive (fn + tp == 0)
    """
    actual_positives = fn + tp
    if actual_positives == 0:
        # Undefined without any positive example; avoid a ZeroDivisionError.
        return float("nan")
    return fn / actual_positives

def recall(tp, fn):
    """
    Recall (also called TPR or sensitivity): tp / (tp + fn).

    Input :
    tp : number of true positives
    fn : number of false negatives
    Output :
    the rate as a float, or NaN when there is no actual positive (tp + fn == 0)
    """
    actual_positives = tp + fn
    if actual_positives == 0:
        # Undefined without any positive example; avoid a ZeroDivisionError.
        return float("nan")
    return tp / actual_positives

def precision(tp, fp):
    """
    Precision: tp / (tp + fp), the share of predicted positives that are correct.

    Input :
    tp : number of true positives
    fp : number of false positives
    Output :
    the rate as a float, or NaN when nothing was predicted positive (tp + fp == 0)
    """
    predicted_positives = tp + fp
    if predicted_positives == 0:
        # Undefined when the model predicts no positive; avoid a ZeroDivisionError.
        return float("nan")
    return tp / predicted_positives

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Confusion matrix of the test-set predictions
tn, fp, fn, tp = confusion_matrix(pred_test)

# Confusion matrix in format [[tp, fn], [fp, tn]]
print("Confusion matrix: ")
actual_positive_row = ["tp: " + str(tp), "fn: " + str(fn)]
actual_negative_row = ["fp: " + str(fp), "tn: " + str(tn)]
print(np.array([actual_positive_row, actual_negative_row]))

# Metrics derived from the four counts
print("FNR: %.3f" % FNR(fn, tp))
print("Recall: %.3f" % recall(tp, fn))
print("Precision: %.3f" % precision(tp, fp))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Confusion matrix: 
[['tp: 9' 'fn: 63']
 ['fp: 0' 'tn: 445']]
FNR: 0.875
Recall: 0.125
Precision: 1.000

### b. Evaluation criterion: Accuracy

Recall: only `weightedRecall` can be set as the metric name! For a binary classification it is the same as accuracy.

__Define the pipeline__

In [16]:
### With MulticlassClassificationEvaluator as evaluator

# Prepare the labeled training data.
# input : cross_train

# Configure an ML pipeline, which consists of three stages:
# assembler_12, scaler (MinMaxScaler) and lr.

# Stage 1: gather the 12 selected columns into a single vector column.
assembler_12 = VectorAssembler(
    inputCols=["SVC_CALLS", "DAY_MINS", "DAY_CHARGE", "VMAIL_MSGS", "VMAIL",
               "INTL_CALLS", "INTL_CHARGE", "INTL_MINS", "EVE_CHARGE", "EVE_MINS",
               "NIGHT_MINS", "NIGHT_CHARGE"],
    outputCol="features12")

# Stage 2: min-max scaling into the "features" column.
scaler = MinMaxScaler(inputCol="features12", outputCol="features")

# Stage 3: logistic regression.
lr = LogisticRegression(maxIter=100)

pipeline = Pipeline(stages=[assembler_12, scaler, lr])

# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# Reduced grid: 1 x 2 combinations.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.001]) \
    .addGrid(lr.elasticNetParam, [0.001, 0.002]) \
    .build()

# full range : [0.001, 0.002, 0.005, 0.01]

# Evaluator
# NOTE(review): this section is titled "Accuracy", but model selection here
# uses the f1 metric - confirm which one is intended.
multiEval = MulticlassClassificationEvaluator().setMetricName("f1")

# seed pins the random fold assignment so that a re-run selects the same model
# (same seed value as the train/test split).
crossvalR = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=multiEval,
                          numFolds=8,
                          seed=1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# Run cross-validation on cross_train with crossvalR (evaluator: multiEval)
# and choose the best set of parameters.
cvModelR = crossvalR.fit(cross_train)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

__Evaluation on test set__

In [18]:
# Make predictions on the test set. cvModelR uses the best model found.
pred_testR = cvModelR.transform(test)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [19]:
# Metrics (1/2): scores computed from the "prediction" column of pred_testR.

# Accuracy
evaluator = (MulticlassClassificationEvaluator()
             .setLabelCol("label")
             .setPredictionCol("prediction")
             .setMetricName("accuracy"))
accuracy = evaluator.evaluate(pred_testR)
print("Accuracy = %.3f" % accuracy)

# F1 score
evaluatorf1 = (MulticlassClassificationEvaluator()
               .setLabelCol("label")
               .setPredictionCol("prediction")
               .setMetricName("f1"))
f1 = evaluatorf1.evaluate(pred_testR)
print("f1 = %.3f" % f1)

# Weighted precision
evaluatorwp = (MulticlassClassificationEvaluator()
               .setLabelCol("label")
               .setPredictionCol("prediction")
               .setMetricName("weightedPrecision"))
wp = evaluatorwp.evaluate(pred_testR)
print("weightedPrecision = %.3f" % wp)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy = 0.876
f1 = 0.836
weightedPrecision = 0.871

...

In [20]:
# Confusion matrix of the predictions made by the second model
tn, fp, fn, tp = confusion_matrix(pred_testR)

# Confusion matrix in format [[tp, fn], [fp, tn]]
print("Confusion matrix: ")
actual_positive_row = ["tp: " + str(tp), "fn: " + str(fn)]
actual_negative_row = ["fp: " + str(fp), "tn: " + str(tn)]
print(np.array([actual_positive_row, actual_negative_row]))

# Metrics derived from the four counts
print("FNR: %.3f" % FNR(fn, tp))
print("Recall: %.3f" % recall(tp, fn))
print("Precision: %.3f" % precision(tp, fp))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Confusion matrix: 
[['tp: 10' 'fn: 62']
 ['fp: 2' 'tn: 443']]
FNR: 0.861
Recall: 0.139
Precision: 0.833

...