In [1]:
from pyspark.ml.classification import LogisticRegression

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('CustomerPrediction').getOrCreate()
chemin_fichier = 'C:/Users/khoul/OneDrive/Bureau/churn-bigml-80.csv'

In [3]:
input_data = spark.read.csv(chemin_fichier, header=True, inferSchema=True)

In [4]:
input_data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account length: integer (nullable = true)
 |-- Area code: integer (nullable = true)
 |-- International plan: string (nullable = true)
 |-- Voice mail plan: string (nullable = true)
 |-- Number vmail messages: integer (nullable = true)
 |-- Total day minutes: double (nullable = true)
 |-- Total day calls: integer (nullable = true)
 |-- Total day charge: double (nullable = true)
 |-- Total eve minutes: double (nullable = true)
 |-- Total eve calls: integer (nullable = true)
 |-- Total eve charge: double (nullable = true)
 |-- Total night minutes: double (nullable = true)
 |-- Total night calls: integer (nullable = true)
 |-- Total night charge: double (nullable = true)
 |-- Total intl minutes: double (nullable = true)
 |-- Total intl calls: integer (nullable = true)
 |-- Total intl charge: double (nullable = true)
 |-- Customer service calls: integer (nullable = true)
 |-- Churn: boolean (nullable = true)



In [5]:
input_data.count()

2666

In [6]:
from pyspark.sql.functions import when
input_data = input_data.withColumn('churn_integer', when(input_data['churn'] == True, 1).otherwise(0))

In [7]:
from pyspark.ml.feature import VectorAssembler

In [8]:
input_data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account length: integer (nullable = true)
 |-- Area code: integer (nullable = true)
 |-- International plan: string (nullable = true)
 |-- Voice mail plan: string (nullable = true)
 |-- Number vmail messages: integer (nullable = true)
 |-- Total day minutes: double (nullable = true)
 |-- Total day calls: integer (nullable = true)
 |-- Total day charge: double (nullable = true)
 |-- Total eve minutes: double (nullable = true)
 |-- Total eve calls: integer (nullable = true)
 |-- Total eve charge: double (nullable = true)
 |-- Total night minutes: double (nullable = true)
 |-- Total night calls: integer (nullable = true)
 |-- Total night charge: double (nullable = true)
 |-- Total intl minutes: double (nullable = true)
 |-- Total intl calls: integer (nullable = true)
 |-- Total intl charge: double (nullable = true)
 |-- Customer service calls: integer (nullable = true)
 |-- Churn: boolean (nullable = true)
 |-- churn_integer: integer (nullabl

In [9]:
assembler=VectorAssembler(inputCols=['Account length', 'Area code', 'Total eve calls', 'Total day minutes', 'Total day calls', 'Total day charge','Customer service calls'], outputCol='features')

In [10]:
output_data=assembler.transform(input_data)

In [11]:
output_data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Account length: integer (nullable = true)
 |-- Area code: integer (nullable = true)
 |-- International plan: string (nullable = true)
 |-- Voice mail plan: string (nullable = true)
 |-- Number vmail messages: integer (nullable = true)
 |-- Total day minutes: double (nullable = true)
 |-- Total day calls: integer (nullable = true)
 |-- Total day charge: double (nullable = true)
 |-- Total eve minutes: double (nullable = true)
 |-- Total eve calls: integer (nullable = true)
 |-- Total eve charge: double (nullable = true)
 |-- Total night minutes: double (nullable = true)
 |-- Total night calls: integer (nullable = true)
 |-- Total night charge: double (nullable = true)
 |-- Total intl minutes: double (nullable = true)
 |-- Total intl calls: integer (nullable = true)
 |-- Total intl charge: double (nullable = true)
 |-- Customer service calls: integer (nullable = true)
 |-- Churn: boolean (nullable = true)
 |-- churn_integer: integer (nullabl

In [12]:
output_data.head(1)

[Row(State='KS', Account length=128, Area code=415, International plan='No', Voice mail plan='Yes', Number vmail messages=25, Total day minutes=265.1, Total day calls=110, Total day charge=45.07, Total eve minutes=197.4, Total eve calls=99, Total eve charge=16.78, Total night minutes=244.7, Total night calls=91, Total night charge=11.01, Total intl minutes=10.0, Total intl calls=3, Total intl charge=2.7, Customer service calls=1, Churn=False, churn_integer=0, features=DenseVector([128.0, 415.0, 99.0, 265.1, 110.0, 45.07, 1.0]))]

In [13]:
final_data=output_data.select('features','churn_integer')

In [14]:
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- churn_integer: integer (nullable = false)



In [15]:
train,test=final_data.randomSplit([0.7,0.3])

In [16]:
model=LogisticRegression(labelCol='churn_integer')

In [17]:
model=model.fit(train)


In [18]:
summary=model.summary

In [19]:
summary.predictions.describe().show()

+-------+-------------------+--------------------+
|summary|      churn_integer|          prediction|
+-------+-------------------+--------------------+
|  count|               1865|                1865|
|   mean|0.14691689008042896|0.011260053619302948|
| stddev| 0.3541179970818712| 0.10554258663376014|
|    min|                0.0|                 0.0|
|    max|                1.0|                 1.0|
+-------+-------------------+--------------------+



In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
prediction=model.evaluate(test)

In [22]:
prediction.predictions.show()

+--------------------+-------------+--------------------+--------------------+----------+
|            features|churn_integer|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|[1.0,408.0,77.0,1...|            0|[3.31443568483195...|[0.96492073303467...|       0.0|
|[1.0,415.0,110.0,...|            0|[2.74955210668282...|[0.93988804950883...|       0.0|
|[3.0,415.0,129.0,...|            0|[1.50969749360530...|[0.81901637108929...|       0.0|
|[5.0,415.0,12.0,1...|            0|[1.29604187474552...|[0.78516808340377...|       0.0|
|[6.0,408.0,122.0,...|            0|[0.92977521396829...|[0.71702967905520...|       0.0|
|[7.0,415.0,83.0,2...|            0|[1.16319182087698...|[0.76191220147497...|       0.0|
|[8.0,415.0,59.0,2...|            0|[1.71346713555751...|[0.84728544705659...|       0.0|
|[11.0,408.0,111.0...|            0|[2.32171431199905...|[0.91065951319684...|       0.0|
|[13.0,415

In [23]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='churn_integer')

In [24]:
evaluator.evaluate(prediction.predictions)

0.5102658392706658

In [25]:
model=LogisticRegression(labelCol='churn_integer')
model=model.fit(final_data)

In [26]:
test_data=assembler.transform(input_data)

In [27]:
result=model.transform(test_data)

In [28]:
result.show()

+-----+--------------+---------+------------------+---------------+---------------------+-----------------+---------------+----------------+-----------------+---------------+----------------+-------------------+-----------------+------------------+------------------+----------------+-----------------+----------------------+-----+-------------+--------------------+--------------------+--------------------+----------+
|State|Account length|Area code|International plan|Voice mail plan|Number vmail messages|Total day minutes|Total day calls|Total day charge|Total eve minutes|Total eve calls|Total eve charge|Total night minutes|Total night calls|Total night charge|Total intl minutes|Total intl calls|Total intl charge|Customer service calls|Churn|churn_integer|            features|       rawPrediction|         probability|prediction|
+-----+--------------+---------+------------------+---------------+---------------------+-----------------+---------------+----------------+-----------------+--

In [29]:
result.select('State','prediction').show()

+-----+----------+
|State|prediction|
+-----+----------+
|   KS|       0.0|
|   OH|       0.0|
|   NJ|       0.0|
|   OH|       0.0|
|   OK|       0.0|
|   AL|       0.0|
|   MA|       0.0|
|   MO|       0.0|
|   WV|       0.0|
|   RI|       0.0|
|   IA|       0.0|
|   MT|       0.0|
|   IA|       0.0|
|   ID|       0.0|
|   VT|       0.0|
|   VA|       0.0|
|   TX|       0.0|
|   FL|       0.0|
|   CO|       0.0|
|   AZ|       0.0|
+-----+----------+
only showing top 20 rows



In [30]:
# Save the model

model.write().overwrite().save("modelnew.pkl")  # Replace "model99" with your desired filename
