#

SVM

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when

import os
os.environ['PYSPARK_PYTHON'] = r'C:\Users\asus\anaconda3\python.exe'
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk1.8.0_321'  # Update this to your Java installation path
os.environ['SPARK_HOME'] = r'C:\Users\asus\Documents\BDT\BigData\BigData\spark-3.1.2-bin-hadoop3.2' 

# Create a Spark session
spark = SparkSession.builder.appName("SVM Classifier").getOrCreate()

# Load the dataset
data = spark.read.csv(r"C:\Users\asus\Downloads\Ecommerce_Customers.csv", header=True, inferSchema=True)

# Create a binary label column ('Churn') based on 'Yearly Amount Spent'
data = data.withColumn("Churn", when(data["Yearly Amount Spent"] < 500, 1).otherwise(0))

# Assemble features
assembler = VectorAssembler(
    inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'],
    outputCol='features'
)

# Prepare the features and label (target column for classification)
final_data = assembler.transform(data).select('features', 'Churn')

# Split the data into training and test sets
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# Initialize Linear SVM model
svm = LinearSVC(labelCol='Churn', featuresCol='features')

# Fit the model
svm_model = svm.fit(train_data)

# Make predictions
pred_data = svm_model.transform(test_data)

# Evaluate the accuracy
evaluator = MulticlassClassificationEvaluator(labelCol='Churn', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(pred_data)
print("SVM Model Accuracy:", accuracy)

# To see predictions
pred_data.select("features", "Churn", "prediction").show(5)

# Stop Spark session
spark.stop()


SVM Model Accuracy: 0.8636363636363636
+--------------------+-----+----------+
|            features|Churn|prediction|
+--------------------+-----+----------+
|[30.4925366965402...|    1|       1.0|
|[30.8162006488763...|    1|       1.0|
|[30.9716756438877...|    1|       0.0|
|[31.1695067987115...|    1|       1.0|
|[31.2834474760581...|    0|       0.0|
+--------------------+-----+----------+
only showing top 5 rows



# Decision Tree Classifier


In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when

# Create a Spark session
spark = SparkSession.builder.appName("Decision Tree Classifier").getOrCreate()

# Load the dataset
data = spark.read.csv(r"C:\Users\asus\Downloads\Ecommerce_Customers.csv", header=True, inferSchema=True)

# Create a binary label column ('Churn') based on 'Yearly Amount Spent'
data = data.withColumn("Churn", when(data["Yearly Amount Spent"] < 500, 1).otherwise(0))

# Assemble features
assembler = VectorAssembler(
    inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'],
    outputCol='features'
)

# Prepare the features and label (target column for classification)
final_data = assembler.transform(data).select('features', 'Churn')

# Split the data into training and test sets
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# Initialize Decision Tree Classifier model
dt = DecisionTreeClassifier(labelCol='Churn', featuresCol='features')

# Fit the model
dt_model = dt.fit(train_data)

# Make predictions
pred_data = dt_model.transform(test_data)

# Evaluate the accuracy
evaluator = MulticlassClassificationEvaluator(labelCol='Churn', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(pred_data)
print("Decision Tree Model Accuracy:", accuracy)

# To see predictions
pred_data.select("features", "Churn", "prediction").show(5)

# Stop Spark session
spark.stop()


Decision Tree Model Accuracy: 0.8482758620689655
+--------------------+-----+----------+
|            features|Churn|prediction|
+--------------------+-----+----------+
|[30.3931845423455...|    1|       1.0|
|[30.4925366965402...|    1|       1.0|
|[30.7377203726281...|    1|       1.0|
|[30.8364326747734...|    1|       1.0|
|[30.8794843441274...|    1|       0.0|
+--------------------+-----+----------+
only showing top 5 rows

