In [1]:
import findspark
findspark.init("/Users/jean/spark-2.4.4-bin-hadoop2.7")
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('churn').getOrCreate()

In [2]:
df = spark.read.csv("customer_churn.csv", inferSchema=True, header=True)

In [3]:
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [4]:
df.columns

['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [5]:
important_cols = df.select(['Age', 'Total_Purchase', 'Years', 'Num_Sites', 'Company', 'Churn'])

In [6]:
final_data = important_cols.na.drop()

In [7]:
from pyspark.ml.feature import (VectorAssembler, VectorIndexer, OneHotEncoder, StringIndexer)

In [8]:
company_indexer = StringIndexer(inputCol="Company", outputCol="Company_index")
company_enconder = OneHotEncoder(inputCol="Company_index", outputCol="Company_vect")

In [9]:
assembler = (VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Years', 'Num_Sites'],
                             outputCol="features"))

In [10]:
from pyspark.ml.classification import LogisticRegression

In [11]:
log = LogisticRegression(featuresCol="features", labelCol="Churn")

In [12]:
train_data, test_data = final_data.randomSplit([0.7,0.3])

In [13]:
test_data

DataFrame[Age: double, Total_Purchase: double, Years: double, Num_Sites: double, Company: string, Churn: int]

In [14]:
from pyspark.ml.pipeline import Pipeline

In [15]:
pipeline = Pipeline(stages=[assembler, log])

In [16]:
fit_model = pipeline.fit(train_data)

In [17]:
results = fit_model.transform(test_data)

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [19]:
my_eval = BinaryClassificationEvaluator(labelCol="Churn", rawPredictionCol="prediction")

In [20]:
my_eval.evaluate(results)

0.7515299877600978

In [21]:
results.show()

+----+--------------+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+----------+
| Age|Total_Purchase|Years|Num_Sites|             Company|Churn|            features|       rawPrediction|         probability|prediction|
+----+--------------+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|29.0|       9378.24| 4.93|      8.0|White, Jones and ...|    0|[29.0,9378.24,4.9...|[4.30306597283259...|[0.98665351628998...|       0.0|
|29.0|       9617.59| 5.49|      8.0|         Mendoza Inc|    0|[29.0,9617.59,5.4...|[3.98183238312348...|[0.98169007487498...|       0.0|
|29.0|      11274.46| 4.43|      8.0|Crawford, Scott a...|    0|[29.0,11274.46,4....|[4.59204965622425...|[0.98996955930937...|       0.0|
|29.0|      12711.15| 5.74|      7.0|       Velasquez PLC|    0|[29.0,12711.15,5....|[4.99879822080133...|[0.99329915484110...|       0.0|
|30.0|      11575.37| 5.22|

In [22]:
results.show()

+----+--------------+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+----------+
| Age|Total_Purchase|Years|Num_Sites|             Company|Churn|            features|       rawPrediction|         probability|prediction|
+----+--------------+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|29.0|       9378.24| 4.93|      8.0|White, Jones and ...|    0|[29.0,9378.24,4.9...|[4.30306597283259...|[0.98665351628998...|       0.0|
|29.0|       9617.59| 5.49|      8.0|         Mendoza Inc|    0|[29.0,9617.59,5.4...|[3.98183238312348...|[0.98169007487498...|       0.0|
|29.0|      11274.46| 4.43|      8.0|Crawford, Scott a...|    0|[29.0,11274.46,4....|[4.59204965622425...|[0.98996955930937...|       0.0|
|29.0|      12711.15| 5.74|      7.0|       Velasquez PLC|    0|[29.0,12711.15,5....|[4.99879822080133...|[0.99329915484110...|       0.0|
|30.0|      11575.37| 5.22|

In [23]:
df_new_customers = spark.read.csv('new_customers.csv', inferSchema=True, header=True)

In [25]:
df_new_customers.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)



In [27]:
new_results = fit_model.transform(df_new_customers)

In [31]:
new_results.select(['Names', 'Age', 'probability', 'prediction']).show()

+--------------+----+--------------------+----------+
|         Names| Age|         probability|prediction|
+--------------+----+--------------------+----------+
| Andrew Mccall|37.0|[0.90739050465566...|       0.0|
|Michele Wright|23.0|[0.00252743153376...|       1.0|
|  Jeremy Chang|65.0|[0.03028237632827...|       1.0|
|Megan Ferguson|32.0|[0.00463530789584...|       1.0|
|  Taylor Young|32.0|[0.77501059334140...|       0.0|
| Jessica Drake|22.0|[0.19380248483589...|       1.0|
+--------------+----+--------------------+----------+

