In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
'''
 This is the problem of CUSTOMER CHURN;
 
 * You need to help out a marketing agency predict Customer Churn.

 * A marketing agency has many customers that use their service to produce ads 
    for clients/customer websites.

 * They have noticed that they have quite a bit of churn in clients.

 * They currently randomly assign account managers, but want you to create a 
   machine learning model that will help predict which customers will churn
   (stop buying their service) so that they can correctly assign the customers
   most at risk to churn an account manager.

 * Luckily they have some historical data, they want you to create a classification
   algorithm that will help classify whether or not a customer churned.

 * Then the company can test this against incoming data for future customers
   who will churn and assign them an account manager.
 

 * Attributes in the dataset:
     
     $ Name: Name of the latest contact at company.
     $ Age: Customer age.
     $ Total_Purchase: Total Ads purchased.
     $ Account_Manager: Binary; 0 = No manager , 1 = Account manager assigned
     $ Years: Total Years as a customer.
     $ Num_sites: Number of websites that use the service.
     $ Onboard_date: Date when the name of the latest contact was onboarded.
     $ Location: Client HQ address.
     $ Company: Name of client company.
     
     $ Churn: Target Column; 0 or 1 indicating whether a customer has churned or not. 
     
 * Remember that currently the account manager is randomly assigned,
   So it won't be much helpful in the dataset to use in the algorithm.
   
'''

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("logRegConsult").getOrCreate()

In [3]:
## Lets Import the .csv file

data = spark.read.csv("/Users/jaskiratsinghp/Desktop/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/customer_churn.csv" , 
                     inferSchema = True , header = True)

data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [4]:
data.describe().show()

+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|        Names|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|            Location|             Company|              Churn|
+-------+-------------+-----------------+-----------------+------------------+-----------------+------------------+--------------------+--------------------+-------------------+
|  count|          900|              900|              900|               900|              900|               900|                 900|                 900|                900|
|   mean|         null|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|                null|                null|0.16666666666666666|
| stddev|         null|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.764835592035

In [5]:
data.head(1)

[Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date=datetime.datetime(2013, 8, 30, 7, 0, 40), Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)]

In [6]:
data.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [7]:
## Lets convert the data in a way that is accepted by MLlib library.

from pyspark.ml.feature import VectorAssembler

In [8]:
assembler = VectorAssembler(inputCols = ["Age",
                                        "Total_Purchase",
                                        "Account_Manager",
                                        "Years",
                                        "Num_Sites"] , outputCol = "features")

output = assembler.transform(data)

In [9]:
finalData = output.select("features" , "churn")

In [10]:
train_churn , test_churn = finalData.randomSplit([0.7 , 0.3])

## Lets import Logistic Regression library
from pyspark.ml.classification import LogisticRegression

logisticReg_churn = LogisticRegression(featuresCol = "features" , 
                                       labelCol = "churn")

In [11]:
fittedChurnModel = logisticReg_churn.fit(train_churn)

In [13]:
trainSummary = fittedChurnModel.summary

trainSummary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|              churn|         prediction|
+-------+-------------------+-------------------+
|  count|                622|                622|
|   mean|0.14790996784565916|0.11093247588424437|
| stddev| 0.3552964400597924|0.31430125748604026|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [15]:
## Lets analyse some Evaluation matrices

from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictionAndLabels = fittedChurnModel.evaluate(test_churn)

predictionAndLabels.predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|churn|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[26.0,8787.39,1.0...|    1|[0.43345902972804...|[0.60669934958693...|       0.0|
|[28.0,9090.43,1.0...|    0|[1.33139100417859...|[0.79107063010370...|       0.0|
|[28.0,11245.38,0....|    0|[3.45562572274596...|[0.96939846956511...|       0.0|
|[29.0,5900.78,1.0...|    0|[3.64287878407945...|[0.97449087152586...|       0.0|
|[30.0,6744.87,0.0...|    0|[2.98895226894464...|[0.95207252436678...|       0.0|
|[30.0,8403.78,1.0...|    0|[5.32240203723663...|[0.99514269333554...|       0.0|
|[30.0,8677.28,1.0...|    0|[3.93240948234531...|[0.98078023963537...|       0.0|
|[30.0,10960.52,1....|    0|[2.25735630862558...|[0.90528318906589...|       0.0|
|[30.0,12788.37,0....|    0|[2.17027240169814...|[0.89754801812296...|       0.0|
|[30.0,13473.35,

In [16]:
churnEval = BinaryClassificationEvaluator(rawPredictionCol = "prediction",
                                         labelCol = "churn")

In [17]:
AUC = churnEval.evaluate(predictionAndLabels.predictions)

In [18]:
AUC

0.7536050156739812

In [20]:
## Lets try to predict on New Dataset

final_lr_model = logisticReg_churn.fit(finalData)

In [23]:
newCustomerData = spark.read.csv("/Users/jaskiratsinghp/Desktop/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Logistic_Regression/new_customers.csv" , 
                                inferSchema = True , header = True)

newCustomerData.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [25]:
testNewCustomers = assembler.transform(newCustomerData)

In [26]:
testNewCustomers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- features: vector (nullable = true)



In [27]:
finalResults = final_lr_model.transform(testNewCustomers)

In [30]:
finalResults.select("prediction" , "Company").show()

+----------+----------------+
|prediction|         Company|
+----------+----------------+
|       0.0|        King Ltd|
|       1.0|   Cannon-Benson|
|       1.0|Barron-Robertson|
|       1.0|   Sexton-Golden|
|       0.0|        Wood LLC|
|       1.0|   Parks-Robbins|
+----------+----------------+

