In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('log_reg').getOrCreate()

In [3]:
df = spark.read.csv('/FileStore/tables/customer_churn.csv', header=True, inferSchema=True)

In [4]:
df.printSchema()

In [5]:
df.head(2)

In [6]:
df.columns

In [7]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [8]:
vector_assembler = VectorAssembler(inputCols=['Age', 'Total_Purchase', 'Account_Manager', 'Years', 'Num_Sites'], outputCol='features')

In [9]:
output = vector_assembler.transform(df)

In [10]:
final_df = output.select(['features', 'Churn'])

In [11]:
# Randomly split the rows roughly 70/30 into training and held-out test sets.
# The fixed seed keeps the split (and every metric derived from it)
# reproducible across kernel restarts and re-runs.
train_df, test_df = final_df.randomSplit([.7, .3], seed=42)

In [12]:
from pyspark.ml.classification import LogisticRegression

In [13]:
lr = LogisticRegression(labelCol='Churn')

In [14]:
model = lr.fit(train_df)

In [15]:
model_sum = model.summary

In [16]:
model_sum.predictions.describe().show()

In [17]:
test_df.columns

In [18]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

results = model.evaluate(test_df)

In [19]:
results.predictions.show()

In [20]:
# The evaluator's ROC metric needs a ranking score, so read the model's
# 'rawPrediction' column. Passing the hard 0/1 'prediction' column collapses
# the ROC curve to a single threshold and misreports the model's AUC.
lr_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Churn')

In [21]:
auc = lr_eval.evaluate(results.predictions)

In [22]:
auc

In [23]:
new_cust = spark.read.csv('/FileStore/tables/new_customers.csv', header=True, inferSchema=True)

In [24]:
final_lr = lr.fit(final_df)

In [25]:
pred_new_cust = vector_assembler.transform(new_cust)

In [26]:
pred_new_cust.printSchema()

In [27]:
pred = final_lr.transform(pred_new_cust)

In [28]:
pred.show()

In [29]:
pred.select(['Company', 'prediction']).show()