In [0]:
from pyspark.sql import SparkSession

In [0]:
spark = SparkSession.builder.appName('log_reg_test').getOrCreate()

In [0]:
data = spark.read.csv('/FileStore/tables/customer_churn.csv',inferSchema=True, header=True)

In [0]:
data.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [0]:
data.head()

Out[6]: Row(Names='Cameron Williams', Age=42.0, Total_Purchase=11066.8, Account_Manager=0, Years=7.22, Num_Sites=8.0, Onboard_date=datetime.datetime(2013, 8, 30, 7, 0, 40), Location='10265 Elizabeth Mission Barkerburgh, AK 89518', Company='Harvey LLC', Churn=1)

In [0]:
data.columns

Out[7]: ['Names',
 'Age',
 'Total_Purchase',
 'Account_Manager',
 'Years',
 'Num_Sites',
 'Onboard_date',
 'Location',
 'Company',
 'Churn']

In [0]:
my_cols = data.select(['Age', 'Total_Purchase','Years','Num_Sites','Company','Churn'])

In [0]:
final_data = my_cols.na.drop()

In [0]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,StringIndexer,OneHotEncoder)

In [0]:
company_indexer = StringIndexer(inputCol = 'Company',outputCol = 'CompanyIndex')
company_encoder = OneHotEncoder(inputCol= 'CompanyIndex', outputCol = 'CompanyVec')

In [0]:
assembler = VectorAssembler(inputCols = ['Age', 'Total_Purchase','Years','Num_Sites'],outputCol='features')

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

In [0]:
log_reg_churn = LogisticRegression(featuresCol='features', labelCol='Churn')

In [0]:
pipeline = Pipeline(stages = [assembler, log_reg_churn])

In [0]:
train_data, test_data  = final_data.randomSplit([0.8,0.2])

In [0]:
train_data.count()

Out[32]: 723

In [0]:
fit_model = pipeline.fit(train_data)

In [0]:
results = fit_model.transform(test_data)

In [0]:
results.show()

+----+--------------+---------------+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+----------+
| Age|Total_Purchase|Account_Manager|Years|Num_Sites|             Company|Churn|            features|       rawPrediction|         probability|prediction|
+----+--------------+---------------+-----+---------+--------------------+-----+--------------------+--------------------+--------------------+----------+
|26.0|       8939.61|              0| 4.54|      7.0|Brown, Johnson an...|    0|[26.0,8939.61,4.5...|[6.75366078774672...|[0.99883475825716...|       0.0|
|28.0|       8670.98|              0| 3.99|      6.0|      Johnson-Conley|    0|[28.0,8670.98,3.9...|[8.20174792936416...|[0.99972590123647...|       0.0|
|28.0|      11204.23|              0| 3.67|     11.0|Hernandez-Montgomery|    0|[28.0,11204.23,3....|[2.15253494711757...|[0.89590542062424...|       0.0|
|29.0|       8688.17|              1|  5.7|      9.0|         Fleming 

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
eval_model = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Churn')

In [0]:
results.select('Churn','prediction').show()

+-----+----------+
|Churn|prediction|
+-----+----------+
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    1|       0.0|
|    0|       0.0|
|    1|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    0|       0.0|
|    1|       0.0|
|    1|       1.0|
|    0|       0.0|
+-----+----------+
only showing top 20 rows



In [0]:
AUV = eval_model.evaluate(results)

In [0]:
AUV

Out[40]: 0.7668439716312058