In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

First, we need to load our dataset.

In [None]:
# Read the customer-churn CSV: the first row is the header and column types are inferred.
# NOTE(review): the relative path presumably requires Google Drive to be mounted
# under the working directory — no mount cell is visible here; confirm.
csv_path = "drive/MyDrive/Colab Notebooks/customer_churn.csv"
df = spark.read.csv(csv_path, header=True, inferSchema=True)

We check that our dataset was loaded correctly.

In [None]:
# Convert the Spark DataFrame to pandas so the notebook renders it as a table.
# Note: toPandas() collects every row to the driver.
df.toPandas()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn
0,Cameron Williams,42.0,11066.80,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1
1,Kevin Mueller,41.0,11916.22,0,6.50,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1
...,...,...,...,...,...,...,...,...,...,...
895,Paul Miller,42.0,12800.82,1,3.62,8.0,2007-12-01 13:29:34,"9316 Julian Fort Suite 328 North Leslie, ME 43961",Evans-Lucero,0
896,Natalie Hodges,52.0,9893.92,0,6.91,7.0,2008-12-28 15:23:58,"8419 William Square Apt. 695 Martinville, RI 3...",Perry and Sons,0
897,Ana Smith,45.0,12056.18,0,5.46,4.0,2014-06-20 05:10:09,Unit 8633 Box 8738 DPO AA 14126-5026,Schneider-Smith,0
898,Justin Leonard,51.0,6517.93,1,5.47,10.0,2012-05-30 00:15:43,"49800 Torres Ways Suite 886 West Bradleybury, ...",Robles-Abbott,0


Once that is done, we need to select which columns are the features. But before that, we need to convert the Company column into numeric form (a string index followed by a one-hot binary vector). For that, we first need to import the feature transformers from pyspark.

In [None]:
from pyspark.ml.feature import (VectorAssembler, OneHotEncoder, VectorIndexer, StringIndexer)

In [None]:
# Make the cell safe to re-run: `df` is rebound below, so on a second run the
# output columns would already exist and the transformers would raise.
# DataFrame.drop silently ignores column names that are not present.
df = df.drop('company_index', 'company_vec')

# Map each distinct company name to a numeric index.
company_indexer = StringIndexer(inputCol='Company', outputCol='company_index')
df = company_indexer.fit(df).transform(df)

# One-hot encode the index so a model does not read the category index as an ordered quantity.
company_encoder = OneHotEncoder(inputCol='company_index', outputCol='company_vec')
df = company_encoder.fit(df).transform(df)

# Display the frame with the two new columns.
df.toPandas()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn,company_index,company_vec
0,Cameron Williams,42.0,11066.80,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1,343.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Kevin Mueller,41.0,11916.22,0,6.50,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1,515.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1,14.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1,474.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
895,Paul Miller,42.0,12800.82,1,3.62,8.0,2007-12-01 13:29:34,"9316 Julian Fort Suite 328 North Leslie, ME 43961",Evans-Lucero,0,240.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
896,Natalie Hodges,52.0,9893.92,0,6.91,7.0,2008-12-28 15:23:58,"8419 William Square Apt. 695 Martinville, RI 3...",Perry and Sons,0,11.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
897,Ana Smith,45.0,12056.18,0,5.46,4.0,2014-06-20 05:10:09,Unit 8633 Box 8738 DPO AA 14126-5026,Schneider-Smith,0,692.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
898,Justin Leonard,51.0,6517.93,1,5.47,10.0,2012-05-30 00:15:43,"49800 Torres Ways Suite 886 West Bradleybury, ...",Robles-Abbott,0,660.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Pack the numeric predictor columns into a single 'features' vector column.
# handleInvalid='skip' drops rows that contain invalid (e.g. null) values.
assembler = VectorAssembler(
    outputCol='features',
    handleInvalid='skip',
    inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'],
)
output = assembler.transform(df)
# Display the result to check that the 'features' column was added.
output.toPandas()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn,company_index,company_vec,features
0,Cameron Williams,42.0,11066.80,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1,343.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[42.0, 11066.8, 7.22, 8.0]"
1,Kevin Mueller,41.0,11916.22,0,6.50,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[41.0, 11916.22, 6.5, 11.0]"
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1,515.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[38.0, 12884.75, 6.67, 12.0]"
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1,14.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[42.0, 8010.76, 6.71, 10.0]"
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1,474.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[37.0, 9191.58, 5.56, 9.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,Paul Miller,42.0,12800.82,1,3.62,8.0,2007-12-01 13:29:34,"9316 Julian Fort Suite 328 North Leslie, ME 43961",Evans-Lucero,0,240.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[42.0, 12800.82, 3.62, 8.0]"
896,Natalie Hodges,52.0,9893.92,0,6.91,7.0,2008-12-28 15:23:58,"8419 William Square Apt. 695 Martinville, RI 3...",Perry and Sons,0,11.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[52.0, 9893.92, 6.91, 7.0]"
897,Ana Smith,45.0,12056.18,0,5.46,4.0,2014-06-20 05:10:09,Unit 8633 Box 8738 DPO AA 14126-5026,Schneider-Smith,0,692.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[45.0, 12056.18, 5.46, 4.0]"
898,Justin Leonard,51.0,6517.93,1,5.47,10.0,2012-05-30 00:15:43,"49800 Torres Ways Suite 886 West Bradleybury, ...",Robles-Abbott,0,660.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[51.0, 6517.93, 5.47, 10.0]"


Now, time to create our model.

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic-regression estimator: learns to predict 'Churn' from the 'features' vector.
lr = LogisticRegression(
    labelCol='Churn',
    featuresCol='features',
)

After creating our model, we need to split our data into training and test sets.

In [None]:
# Keep only the two columns the model needs: the feature vector and the label.
final_data = output.select('features', 'Churn')
final_data.toPandas()

Unnamed: 0,features,Churn
0,"[42.0, 11066.8, 7.22, 8.0]",1
1,"[41.0, 11916.22, 6.5, 11.0]",1
2,"[38.0, 12884.75, 6.67, 12.0]",1
3,"[42.0, 8010.76, 6.71, 10.0]",1
4,"[37.0, 9191.58, 5.56, 9.0]",1
...,...,...
895,"[42.0, 12800.82, 3.62, 8.0]",0
896,"[52.0, 9893.92, 6.91, 7.0]",0
897,"[45.0, 12056.18, 5.46, 4.0]",0
898,"[51.0, 6517.93, 5.47, 10.0]",0


In [None]:
# Random 70/30 train/test split; the fixed seed keeps the split reproducible.
train_data, test_data = final_data.randomSplit(weights=[0.7, 0.3], seed=45)

In [None]:
# Summary statistics of the training split: the row count shows the split size
# and the mean of 'Churn' shows the share of churned customers.
train_data.describe().show() # Check the split

+-------+-------------------+
|summary|              Churn|
+-------+-------------------+
|  count|                647|
|   mean|0.18547140649149924|
| stddev|0.38898023225502154|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [None]:
# Display the training rows as a pandas table.
train_data.toPandas()

Unnamed: 0,features,Churn
0,"[25.0, 9672.03, 5.49, 8.0]",0
1,"[26.0, 8939.61, 4.54, 7.0]",0
2,"[27.0, 8628.8, 5.3, 7.0]",0
3,"[28.0, 8670.98, 3.99, 6.0]",0
4,"[28.0, 11128.95, 5.12, 8.0]",0
...,...,...
642,"[56.0, 12502.81, 5.47, 8.0]",0
643,"[58.0, 9703.93, 5.16, 11.0]",1
644,"[58.0, 12376.37, 6.01, 12.0]",1
645,"[60.0, 9621.04, 7.65, 8.0]",0


In [None]:
# Summary statistics of the test split, for comparison with the training split.
test_data.describe().show() # Check the split

+-------+-------------------+
|summary|              Churn|
+-------+-------------------+
|  count|                253|
|   mean|0.11857707509881422|
| stddev| 0.3239310120742911|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



Let's start the training

In [None]:
# Fit the logistic-regression estimator on the training split; returns the fitted model.
lrmodel = lr.fit(train_data)

And make the predictions:

In [None]:
# Score the held-out test split; this adds the rawPrediction, probability and
# prediction columns shown in the output below.
predictions = lrmodel.transform(test_data)

In [None]:
# Display the scored test rows as a pandas table.
predictions.toPandas()

Unnamed: 0,features,Churn,rawPrediction,probability,prediction
0,"[22.0, 11254.38, 4.96, 8.0]",0,"[4.450565490958583, -4.450565490958583]","[0.9884626982654338, 0.011537301734566241]",0.0
1,"[26.0, 8787.39, 5.42, 11.0]",1,"[0.4154603046325036, -0.4154603046325036]","[0.6023964319339755, 0.3976035680660245]",0.0
2,"[28.0, 9090.43, 5.74, 10.0]",0,"[1.3320506209776397, -1.3320506209776397]","[0.7911796292440814, 0.20882037075591864]",0.0
3,"[28.0, 11204.23, 3.67, 11.0]",0,"[1.301390798121389, -1.301390798121389]","[0.7860689590366721, 0.21393104096332793]",0.0
4,"[30.0, 8677.28, 7.31, 7.0]",0,"[3.948242921894394, -3.948242921894394]","[0.9810764446687602, 0.018923555331239794]",0.0
...,...,...,...,...,...
248,"[55.0, 9743.57, 5.89, 8.0]",0,"[2.3746058510638974, -2.3746058510638974]","[0.9148702626425914, 0.08512973735740859]",0.0
249,"[55.0, 10056.55, 4.98, 8.0]",0,"[2.9214138220587778, -2.9214138220587778]","[0.9488949036017311, 0.05110509639826888]",0.0
250,"[55.0, 11158.5, 4.86, 10.0]",1,"[0.5125748945773481, -0.5125748945773481]","[0.6254098956145859, 0.37459010438541407]",0.0
251,"[56.0, 10074.4, 5.17, 7.0]",0,"[3.976382001650311, -3.976382001650311]","[0.9815918484169487, 0.018408151583051313]",0.0


Once we have made the predictions, we can evaluate how good our model is by comparing its predictions with the actual results.

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# Evaluate the model on the test predictions.
# BinaryClassificationEvaluator ranks rows by a continuous score, so it must read
# the 'rawPrediction' column. Feeding it the hard 0/1 'prediction' column collapses
# the ROC curve to a single threshold and distorts the metric.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='Churn',
                                          metricName='areaUnderROC')
predictionAndLabels = predictions.select('Churn', 'rawPrediction')

# NOTE: despite the variable name, this is the area under the ROC curve, not
# accuracy. The name `acc` is kept because a later cell displays it.
acc = evaluator.evaluate(predictionAndLabels)

In [None]:
# Display the evaluation metric computed in the previous cell.
# NOTE(review): the evaluator was built without metricName, and
# BinaryClassificationEvaluator's documented default is areaUnderROC, so this is
# presumably an AUC value rather than accuracy — confirm.
acc

0.8230941704035875

**BONUS TRACK**

Let's do this with a pipeline

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer

In [None]:
# Display the frame the pipeline will start from (it still carries the company_index / company_vec columns).
df.toPandas()

Unnamed: 0,Names,Age,Total_Purchase,Account_Manager,Years,Num_Sites,Onboard_date,Location,Company,Churn,company_index,company_vec
0,Cameron Williams,42.0,11066.80,0,7.22,8.0,2013-08-30 07:00:40,"10265 Elizabeth Mission Barkerburgh, AK 89518",Harvey LLC,1,343.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Kevin Mueller,41.0,11916.22,0,6.50,11.0,2013-08-13 00:38:46,"6157 Frank Gardens Suite 019 Carloshaven, RI 1...",Wilson PLC,1,2.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Eric Lozano,38.0,12884.75,0,6.67,12.0,2016-06-29 06:20:07,"1331 Keith Court Alyssahaven, DE 90114","Miller, Johnson and Wallace",1,515.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Phillip White,42.0,8010.76,0,6.71,10.0,2014-04-22 12:43:12,"13120 Daniel Mount Angelabury, WY 30645-4695",Smith Inc,1,14.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Cynthia Norton,37.0,9191.58,0,5.56,9.0,2016-01-19 15:31:15,"765 Tricia Row Karenshire, MH 71730",Love-Jones,1,474.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
895,Paul Miller,42.0,12800.82,1,3.62,8.0,2007-12-01 13:29:34,"9316 Julian Fort Suite 328 North Leslie, ME 43961",Evans-Lucero,0,240.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
896,Natalie Hodges,52.0,9893.92,0,6.91,7.0,2008-12-28 15:23:58,"8419 William Square Apt. 695 Martinville, RI 3...",Perry and Sons,0,11.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
897,Ana Smith,45.0,12056.18,0,5.46,4.0,2014-06-20 05:10:09,Unit 8633 Box 8738 DPO AA 14126-5026,Schneider-Smith,0,692.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
898,Justin Leonard,51.0,6517.93,1,5.47,10.0,2012-05-30 00:15:43,"49800 Torres Ways Suite 886 West Bradleybury, ...",Robles-Abbott,0,660.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Assembler for the pipeline version: packs the three predictor columns into 'features'.
# handleInvalid='skip' drops rows that contain invalid (e.g. null) values.
assembler = VectorAssembler(
    outputCol='features',
    handleInvalid='skip',
    inputCols=['Total_Purchase', 'Years', 'Num_Sites'],
)
output = assembler.transform(df)

In [None]:
# Fresh logistic-regression estimator to use as the final pipeline stage.
lr = LogisticRegression(
    labelCol='Churn',
    featuresCol='features',
)

In [None]:
# Random 70/30 split of the un-assembled frame; the pipeline builds the features itself.
training, test = df.randomSplit(weights=[0.7, 0.3], seed=55)

In [None]:
# Chain feature assembly and the classifier so both are fitted and applied as one unit.
pipeline = Pipeline(stages=[assembler, lr]) # Create the pipeline

In [None]:
# Fit every pipeline stage on the training split, then run the fitted pipeline
# on the held-out test split to obtain predictions.
model = pipeline.fit(training) # Train the model
predicted = model.transform(test) # Test the model

In [None]:
# Evaluate the pipeline's test predictions.
# The evaluator needs the continuous 'rawPrediction' scores; the hard 0/1
# 'prediction' column would collapse the ROC curve to a single threshold.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                          labelCol='Churn',
                                          metricName='areaUnderROC')
# NOTE: despite the variable name, this is the area under the ROC curve, not accuracy.
acc = evaluator.evaluate(predicted)
acc

0.7675438596491228