In [3]:
# Import required libraries

from pyspark.sql import SparkSession
import pandas
spark = SparkSession.builder.appName('Churn').getOrCreate()


# save the file location 
file_path = "./churn.csv"


In [8]:
# Read the input file
df = spark.read.csv(file_path, inferSchema = True, 
                    header = True, sep = "," ,
                    nanValue = ' ', nullValue = ' ')

In [9]:
display(df)

DataFrame[churn: string, accountlength: int, internationalplan: string, voicemailplan: string, numbervmailmessages: int, totaldayminutes: double, totaldaycalls: int, totaldaycharge: double, totaleveminutes: double, totalevecalls: int, totalevecharge: double, totalnightminutes: double, totalnightcalls: int, totalnightcharge: double, totalintlminutes: double, totalintlcalls: int, totalintlcharge: double, numbercustomerservicecalls: int]

In [10]:
# Lets look at the schema from the file
df.printSchema()

root
 |-- churn: string (nullable = true)
 |-- accountlength: integer (nullable = true)
 |-- internationalplan: string (nullable = true)
 |-- voicemailplan: string (nullable = true)
 |-- numbervmailmessages: integer (nullable = true)
 |-- totaldayminutes: double (nullable = true)
 |-- totaldaycalls: integer (nullable = true)
 |-- totaldaycharge: double (nullable = true)
 |-- totaleveminutes: double (nullable = true)
 |-- totalevecalls: integer (nullable = true)
 |-- totalevecharge: double (nullable = true)
 |-- totalnightminutes: double (nullable = true)
 |-- totalnightcalls: integer (nullable = true)
 |-- totalnightcharge: double (nullable = true)
 |-- totalintlminutes: double (nullable = true)
 |-- totalintlcalls: integer (nullable = true)
 |-- totalintlcharge: double (nullable = true)
 |-- numbercustomerservicecalls: integer (nullable = true)



In [None]:
# In some situations, spark may not be able to infer the data schema as easily as above
# especially in cases where read from json formats.
# we can force it to enforce the schema which we need, while it reads the dataset.
# Lets explore that a bit before we move on with our analysis

from pyspark.sql.types import StructField, StringType, IntegerType, StructType
data_schema = [StructField('age',IntegerType(), True),
               StructField('name', StringType(), True)]
final_struc = StructType(fields=data_schema)

# once we defined the final structure, we can just use it while reading the dataset
df = spark.read.json('data.json', schema=final_struc)

In [11]:
# Check for NUll Values
from pyspark.sql.functions import isnan, when, count, col
df.select([count(when(isnan(c) | col(c).isNull(),c)).alias(c) for c in df.columns]).show()

+-----+-------------+-----------------+-------------+-------------------+---------------+-------------+--------------+---------------+-------------+--------------+-----------------+---------------+----------------+----------------+--------------+---------------+--------------------------+
|churn|accountlength|internationalplan|voicemailplan|numbervmailmessages|totaldayminutes|totaldaycalls|totaldaycharge|totaleveminutes|totalevecalls|totalevecharge|totalnightminutes|totalnightcalls|totalnightcharge|totalintlminutes|totalintlcalls|totalintlcharge|numbercustomerservicecalls|
+-----+-------------+-----------------+-------------+-------------------+---------------+-------------+--------------+---------------+-------------+--------------+-----------------+---------------+----------------+----------------+--------------+---------------+--------------------------+
|    0|            0|                0|            0|                  0|              0|            0|             0|            

In [None]:
# We do see that some of the fields have null values so we will need to impute them 
# before feeding them into the model

In [12]:
# Check the count of output variable in the dataset
df.groupBy('churn').count().show()

+-----+-----+
|churn|count|
+-----+-----+
|   No| 4293|
|  Yes|  707|
+-----+-----+



In [13]:
# Check the distribution of data for the AccountLength, totaldaycharge, totalevecharge, totalNightcharge
df.select('accountlength', 'totaldaycharge', 'totalnightcharge', 'totalevecharge').describe().show()

+-------+-----------------+------------------+-----------------+------------------+
|summary|    accountlength|    totaldaycharge| totalnightcharge|    totalevecharge|
+-------+-----------------+------------------+-----------------+------------------+
|  count|             5000|              5000|             4994|              4993|
|   mean|         100.2586|30.649668000000023|9.017503003604375|17.053647105948343|
| stddev|39.69455954726711| 9.162068691639355|2.273414183416672| 4.292381665441585|
|    min|                1|               0.0|              0.0|               0.0|
|    max|              243|             59.76|            17.77|             30.91|
+-------+-----------------+------------------+-----------------+------------------+



In [14]:
# Lets create a sql view for the data frame so that we can run some queries interactively.
df.registerTempTable("Churn_data")
spark.sql("select count(*) from Churn_data").show()

+--------+
|count(1)|
+--------+
|    5000|
+--------+



In [21]:
# Get some insights on the effect of certain predictor variables on the dependent variable
# First, lets look at churn count by TotalCustomerservicecalls
query = "select numbercustomerservicecalls, churn, count(*) as churn_count \
        from Churn_data \
        group by numbercustomerservicecalls, churn \
        order by numbercustomerservicecalls,churn desc, churn_count"

spark.sql(query).show()

+--------------------------+-----+-----------+
|numbercustomerservicecalls|churn|churn_count|
+--------------------------+-----+-----------+
|                         0|  Yes|        121|
|                         0|   No|        902|
|                         1|  Yes|        190|
|                         1|   No|       1596|
|                         2|  Yes|        122|
|                         2|   No|       1005|
|                         3|  Yes|         73|
|                         3|   No|        592|
|                         4|  Yes|        111|
|                         4|   No|        141|
|                         5|  Yes|         58|
|                         5|   No|         38|
|                         6|  Yes|         22|
|                         6|   No|         12|
|                         7|  Yes|          7|
|                         7|   No|          6|
|                         8|  Yes|          1|
|                         8|   No|          1|
|            

In [22]:
# Next lets look at Voicemail plan predictor varible with the churn variable

query = "select voicemailplan, churn, count(*) as plan_count \
         from Churn_data \
         group by voicemailplan, churn"
spark.sql(query).show()

+-------------+-----+----------+
|voicemailplan|churn|plan_count|
+-------------+-----+----------+
|          yes|   No|      1221|
|           no|   No|      3072|
|          yes|  Yes|       102|
|           no|  Yes|       605|
+-------------+-----+----------+



Looks like Customers who are not on voice mail plan seem to churning more

In [31]:
# Let us look closely at International plan Variable and understand how it is  
# impacting churn

df.stat.crosstab('internationalplan', 'churn').show()

+-----------------------+----+---+
|internationalplan_churn|  No|Yes|
+-----------------------+----+---+
|                     no|4019|508|
|                    yes| 274|199|
+-----------------------+----+---+



In [26]:
# This tells us that customers who have international plan tend to churn less.

In [29]:
# Lets start building our Model - We will use the pipeline functionality in Spark ML to transform our dataset
# As we have less data overall, I am choosing a train/test split of 0.8,0.2

In [28]:
churn_df = df
(train_data, test_data) = churn_df.randomSplit([0.8, 0.2], 24)

print("Records for training: " + str(train_data.count()))
print("Records for validation: " + str(test_data.count()))

Records for training: 4016
Records for validation: 984


In [32]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

catColumns = ["internationalplan", "voicemailplan"]

In [50]:
# Let us set the stages for our pipeline and then transform the variables accordingly
# The first stage for our Pipeline is the transformer which performs StringIndexing and categorical Encoding
stages = []

for catCol in catColumns:
    
    stringIndexer = StringIndexer(inputCol= catCol, outputCol=catCol + "Index")
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols = [catCol + "catVec"])
    
    stages += [stringIndexer, encoder]

In [51]:
# The Second stage for our Transformer pipeline is the Imputer to impute missing values

from pyspark.ml.feature import Imputer
imputer = Imputer(inputCols = ["totalevecharge", "totalnightcharge"], \
                  outputCols=["totalevecharge", "totalnightcharge"])

stages += [imputer]

In [52]:
# Create the label_Idx for the Output Column

label_Idx = StringIndexer(inputCol= "churn", outputCol = "label")
stages += [label_Idx]

In [38]:
# Let's Look at Account Length variable and see if it can show any insights in current form

df.stat.crosstab('accountlength', 'churn').show()

+-------------------+---+---+
|accountlength_churn| No|Yes|
+-------------------+---+---+
|                 69| 31|  5|
|                138| 34|  5|
|                101| 48|  7|
|                 88| 40| 10|
|                170|  8|  2|
|                115| 39|  9|
|                217|  3|  0|
|                  5|  2|  0|
|                120| 47|  5|
|                202|  2|  0|
|                 10|  3|  0|
|                 56| 22|  2|
|                142| 26|  1|
|                153| 11|  1|
|                174| 10|  2|
|                185|  9|  1|
|                 42| 24|  2|
|                 24|  7|  3|
|                 37| 13|  2|
|                 25| 10|  2|
+-------------------+---+---+
only showing top 20 rows



In [39]:
# Lets try a different view - Understand description statistics for this variable

df.select('accountlength').describe().show()


+-------+-----------------+
|summary|    accountlength|
+-------+-----------------+
|  count|             5000|
|   mean|         100.2586|
| stddev|39.69455954726711|
|    min|                1|
|    max|              243|
+-------+-----------------+



In [53]:
# Create bins for Account Length variable as using it as a numeric variable might not add much value

from pyspark.ml.feature import QuantileDiscretizer
acctlen_bin = QuantileDiscretizer(numBuckets=3, inputCol = "accountlength", outputCol="acctlen_bin")
stages += [acctlen_bin]

In [54]:
# Let us create a vector assembler which will convert the input values into array format which 
# can be fed to the model

numericCols = ["numbervmailmessages", "totaldayminutes","totaldaycalls", \
               "totaldaycharge", "totaleveminutes", "totalevecalls", \
               "totalevecharge", "totalnightminutes", "totalnightcalls", \
               "totalnightcharge", "totalintlminutes", "totalintlcalls", \
               "totalintlcharge", "numbercustomerservicecalls"]
assembleInputs = assemblerInputs = [c + "catVec" for c in catColumns] + numericCols
assembler = VectorAssembler(inputCols=assembleInputs, outputCol="features")
stages += [assembler]

In [55]:
# Create our stages into a pipeline
pipeline = Pipeline().setStages(stages)
pipelineModel = pipeline.fit(train_data)

In [56]:
# Run our data through the pipeline

trainfinalDF = pipelineModel.transform(train_data)
testfinalDF = pipelineModel.transform(test_data)

In [57]:
# Lets fit the Logistic Regression Model 

from pyspark.ml.classification import LogisticRegression

logistic = LogisticRegression(labelCol="label", featuresCol="features", maxIter = 25)

#Train the above model with our training data
logistic_model = logistic.fit(trainfinalDF)

In [60]:
# Lets predict with the test dataset and look at model metrics

from pyspark.mllib.evaluation import BinaryClassificationMetrics

predictions = logistic_model.transform(testfinalDF)

results = (predictions.select(['prediction', 'label'])).collect()
results_list = [(float(i[0]), float(i[1])) for i in results]
preds_labels = spark.sparkContext.parallelize(results_list)

metrics = BinaryClassificationMetrics(preds_labels)

print("Area under PR = ", metrics.areaUnderPR)
print("Area under ROC = ", metrics.areaUnderROC)

Area under PR =  0.37008286429018133
Area under ROC =  0.5643364928909952


This is a decent initial model. However, I am sure we can further improve the metrics for this dataset. Also, we can try out multiple additional tree based models and also look at cross validation. I will be expanding this notebook in future. 