<a href="https://colab.research.google.com/github/laasyakommaraju/r-sutdio/blob/main/Bda_A_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## 1. Build a classification model with Spark on a dataset of your choice in Python for big data analysis.


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Start Spark session
spark = SparkSession.builder.appName("CustomerPurchasePrediction").getOrCreate()

# Everything after session creation runs inside try/finally so the session is
# always stopped, even when one of the steps below raises.
try:
    # Simulated customer data: (age, salary, time_spent_on_website, label)
    # label == 1 means the customer purchased, 0 means they did not.
    data = [
        (25, 45000, 15, 0),  # Customer did not purchase
        (30, 55000, 20, 0),
        (45, 80000, 40, 1),  # Customer purchased
        (35, 60000, 25, 0),
        (50, 95000, 35, 1),
        (60, 120000, 50, 1),
        (40, 70000, 30, 0),
        (28, 48000, 18, 0),
        (33, 54000, 28, 1),
        (38, 75000, 45, 1),
    ]

    # Define schema (column names only; Spark infers the types)
    columns = ["age", "salary", "time_spent_on_website", "label"]

    # Create DataFrame
    df = spark.createDataFrame(data, schema=columns)

    # Assemble the three numeric inputs into the single vector column that
    # Spark ML estimators consume.
    assembler = VectorAssembler(
        inputCols=["age", "salary", "time_spent_on_website"],
        outputCol="features",
    )
    df_prepared = assembler.transform(df).select("features", "label")

    # Split the dataset (70% train / 30% test, fixed seed for repeatability).
    # NOTE: with only 10 rows the test split can end up containing a single
    # class; AUC is not meaningful in that case, so read the score with care.
    train_data, test_data = df_prepared.randomSplit([0.7, 0.3], seed=42)

    # Decision Tree model
    dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
    model = dt.fit(train_data)

    # Make predictions
    predictions = model.transform(test_data)
    predictions.select("features", "label", "prediction").show()

    # Evaluate model: area under the ROC curve, computed from the
    # rawPrediction column added by model.transform(). The arguments below are
    # the evaluator's defaults, spelled out so the metric is unambiguous.
    evaluator = BinaryClassificationEvaluator(
        labelCol="label",
        rawPredictionCol="rawPrediction",
        metricName="areaUnderROC",
    )
    auc = evaluator.evaluate(predictions)
    print(f"Test AUC: {auc:.2f}")
finally:
    # Stop Spark session
    spark.stop()


+-------------------+-----+----------+
|           features|label|prediction|
+-------------------+-----+----------+
|[35.0,60000.0,25.0]|    0|       1.0|
|[28.0,48000.0,18.0]|    0|       0.0|
+-------------------+-----+----------+

Test AUC: 0.00


## 2. Build a clustering model with Spark on a dataset of your choice

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Start Spark session
spark = SparkSession.builder.appName("CustomerSegmentationClustering").getOrCreate()

# Everything after session creation runs inside try/finally so the session is
# always stopped, even when one of the steps below raises.
try:
    # New dataset: (annual_income, spending_score)
    data = [
        (15.0, 39.0),
        (16.0, 81.0),
        (17.0, 6.0),
        (18.0, 77.0),
        (19.0, 40.0),
        (20.0, 76.0),
        (21.0, 6.0),
        (22.0, 94.0),
        (23.0, 3.0),
        (24.0, 72.0),
        (25.0, 13.0),
        (26.0, 70.0),
        (27.0, 14.0),
        (28.0, 99.0),
        (29.0, 15.0),
    ]

    columns = ["annual_income", "spending_score"]
    df = spark.createDataFrame(data, schema=columns)

    # Feature assembler: both columns are used as clustering features, packed
    # into the single vector column that KMeans reads.
    assembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_features = assembler.transform(df).select("features")

    # KMeans clustering with 3 clusters; the fixed seed keeps the centroid
    # initialisation (and therefore the cluster ids) repeatable between runs.
    kmeans = KMeans(k=3, seed=1, featuresCol="features", predictionCol="prediction")
    model = kmeans.fit(df_features)
    predictions = model.transform(df_features)

    # Show clustering results (one row per input point with its cluster id)
    predictions.show(truncate=False)

    # Print cluster centers, in cluster-id order
    print("Cluster Centers:")
    for center in model.clusterCenters():
        print(center)
finally:
    # Stop Spark session
    spark.stop()


+-----------+----------+
|features   |prediction|
+-----------+----------+
|[15.0,39.0]|2         |
|[16.0,81.0]|0         |
|[17.0,6.0] |1         |
|[18.0,77.0]|0         |
|[19.0,40.0]|2         |
|[20.0,76.0]|0         |
|[21.0,6.0] |1         |
|[22.0,94.0]|0         |
|[23.0,3.0] |1         |
|[24.0,72.0]|0         |
|[25.0,13.0]|1         |
|[26.0,70.0]|0         |
|[27.0,14.0]|1         |
|[28.0,99.0]|0         |
|[29.0,15.0]|1         |
+-----------+----------+

Cluster Centers:
[22.         81.28571429]
[23.66666667  9.5       ]
[17.  39.5]


## 3. Build a recommendation engine with Spark on a dataset of your choice

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Start Spark session
spark = SparkSession.builder.appName("BookRecommendation").getOrCreate()

# Everything after session creation runs inside try/finally so the session is
# always stopped, even when one of the steps below raises.
try:
    # New dataset: user_id, book_id, rating
    data = [
        (1, 101, 5.0),
        (1, 102, 4.0),
        (1, 103, 2.5),
        (2, 101, 3.0),
        (2, 104, 5.0),
        (2, 105, 4.5),
        (3, 102, 3.5),
        (3, 103, 4.5),
        (3, 106, 5.0),
        (4, 104, 2.5),
        (4, 105, 3.5),
        (4, 106, 4.0),
        (5, 101, 4.5),
        (5, 103, 4.0),
        (5, 105, 2.0),
    ]

    columns = ["user_id", "book_id", "rating"]

    # Create DataFrame
    df = spark.createDataFrame(data, schema=columns)

    # Split data into training and test sets (80/20, fixed seed)
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

    # Build ALS (Alternating Least Squares) model
    als = ALS(
        userCol="user_id",
        itemCol="book_id",
        ratingCol="rating",
        coldStartStrategy="drop",  # Drop rows with NaN predictions
        nonnegative=True,  # Ensure that all factors are non-negative
        implicitPrefs=False,  # Using explicit ratings
        rank=10,  # Number of latent factors
        maxIter=10,  # Number of iterations
        regParam=0.1,  # Regularization parameter to prevent overfitting
    )

    # Train the ALS model
    model = als.fit(train_data)

    # Predict ratings on test data
    predictions = model.transform(test_data)
    predictions.show()

    # Evaluate the model using RMSE (Root Mean Squared Error)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    # coldStartStrategy="drop" removes test rows whose user or book was not
    # seen during training. On a dataset this small that can leave no rows at
    # all, and RMSE over zero rows is undefined, so report that explicitly
    # instead of evaluating an empty DataFrame.
    if predictions.count() == 0:
        rmse = float("nan")
        print("Test RMSE: n/a (no test rows left after cold-start drop)")
    else:
        rmse = evaluator.evaluate(predictions)
        print(f"Test RMSE: {rmse:.2f}")

    # Recommend top 3 books for each user
    user_recs = model.recommendForAllUsers(3)
    user_recs.show(truncate=False)
finally:
    # Stop Spark session
    spark.stop()


+-------+-------+------+----------+
|user_id|book_id|rating|prediction|
+-------+-------+------+----------+
|      3|    102|   3.5| 2.0663626|
|      5|    101|   4.5|  1.287073|
+-------+-------+------+----------+

Test RMSE: 2.49
+-------+------------------------------------------------------+
|user_id|recommendations                                       |
+-------+------------------------------------------------------+
|1      |[{101, 4.8597326}, {102, 4.0014143}, {106, 3.0055833}]|
|2      |[{104, 4.8236704}, {105, 4.45436}, {106, 4.2804103}]  |
|3      |[{106, 4.9407706}, {105, 4.1730957}, {104, 3.358101}] |
|4      |[{106, 3.9472227}, {105, 3.3039286}, {104, 2.6128924}]|
|5      |[{106, 2.1266475}, {105, 1.9698161}, {104, 1.841012}] |
+-------+------------------------------------------------------+

