#  Big Data Machine Learning Classification with Spark

In [None]:
%pip install pyspark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create (or reuse) the Spark session for this churn-prediction run.
spark = SparkSession.builder \
    .appName("Churn Prediction") \
    .getOrCreate()

# Everything after session creation runs inside try/finally so the session
# is stopped even when a step fails (missing CSV, unknown column, fit error).
try:
    # Load the dataset: the first row supplies the column names and Spark
    # infers each column's type from the data.
    data = spark.read.csv("churn_data.csv", header=True, inferSchema=True)

    # Define features and target variable
    feature_columns = ['feature1', 'feature2', 'feature3']  # Add your features here
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

    # Pack the individual feature columns into the single vector column
    # ('features') that Spark ML estimators consume.
    data = assembler.transform(data)

    # Keep only what the model needs: the feature vector and the target.
    final_data = data.select('features', 'label')  # Your target variable is 'label'

    # 80/20 train/test split; the fixed seed makes the split repeatable
    # across runs on the same input.
    train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=1234)

    # Gradient-boosted trees classifier, limited to 10 boosting iterations.
    # NOTE(review): Spark's GBTClassifier handles binary labels only --
    # confirm that 'label' is encoded as 0/1.
    gbt = GBTClassifier(labelCol='label', featuresCol='features', maxIter=10)

    # Train the model with training data
    model = gbt.fit(train_data)

    # Score the held-out rows; transform() appends a 'prediction' column.
    predictions = model.transform(test_data)

    # Display the first 5 prediction results as a sanity check.
    predictions.select('features', 'label', 'prediction').show(5)

    # Evaluate model accuracy (fraction of test rows predicted correctly).
    evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
    accuracy = evaluator.evaluate(predictions)

    print(f"Model Accuracy: {accuracy}")
finally:
    # Stop the Spark session on success and on failure alike.
    spark.stop()