<a href="https://colab.research.google.com/github/harshitha-produturi/Campuseats1/blob/main/BDA_assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## 1. Build a classification model with Spark, using a dataset of your choice, in Python for big data analysis.


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Start (or reuse) the Spark session for this cell.
spark = SparkSession.builder.appName("MedicalDiagnosisClassification").getOrCreate()

# try/finally guarantees the session is stopped even when a step below raises,
# so a failed run does not leave a live session behind for a later
# getOrCreate() call to pick up.
try:
    # Simulated patient data: (age, blood_pressure, cholesterol_level, label)
    # label is the binary target (1 / 0).
    data = [
        (45, 130, 220, 1),
        (50, 140, 230, 1),
        (30, 120, 180, 0),
        (60, 160, 250, 1),
        (35, 110, 190, 0),
        (40, 125, 200, 0),
        (55, 145, 240, 1),
        (38, 118, 195, 0),
        (48, 135, 210, 1),
        (33, 112, 185, 0),
    ]

    # Column names, in the same order as the tuple fields above.
    columns = ["age", "blood_pressure", "cholesterol_level", "label"]

    # Create DataFrame
    df = spark.createDataFrame(data, schema=columns)

    # Pack the three numeric inputs into the single vector column expected by
    # Spark ML estimators, and keep only what the model needs.
    assembler = VectorAssembler(
        inputCols=["age", "blood_pressure", "cholesterol_level"],
        outputCol="features",
    )
    df_prepared = assembler.transform(df).select("features", "label")

    # Split the dataset 70/30; the fixed seed keeps the split reproducible.
    # NOTE: with only 10 rows the test split is tiny, so the reported AUC is
    # illustrative rather than statistically meaningful.
    train_data, test_data = df_prepared.randomSplit([0.7, 0.3], seed=42)

    # Fit a decision tree on the training split.
    dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")
    model = dt.fit(train_data)

    # Score the held-out rows and show predicted vs. actual labels.
    predictions = model.transform(test_data)
    predictions.select("features", "label", "prediction").show()

    # Evaluate with area under the ROC curve. The column names and metric are
    # spelled out (they equal the evaluator's defaults) so the dependency on the
    # model's "rawPrediction" output column is visible.
    evaluator = BinaryClassificationEvaluator(
        rawPredictionCol="rawPrediction",
        labelCol="label",
        metricName="areaUnderROC",
    )
    auc = evaluator.evaluate(predictions)
    print(f"Test AUC: {auc:.2f}")
finally:
    # Stop Spark session
    spark.stop()


+------------------+-----+----------+
|          features|label|prediction|
+------------------+-----+----------+
|[45.0,130.0,220.0]|    1|       1.0|
|[33.0,112.0,185.0]|    0|       0.0|
+------------------+-----+----------+

Test AUC: 1.00


## 2. Build a clustering model with Spark, using a dataset of your choice

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Start (or reuse) the Spark session for this cell.
spark = SparkSession.builder.appName("TouristLocationClustering").getOrCreate()

# try/finally guarantees the session is stopped even when a step below raises,
# so a failed run does not leave a live session behind for a later
# getOrCreate() call to pick up.
try:
    # Tourist locations: (distance_to_city_center, popularity_score, number_of_reviews)
    data = [
        (2.5, 8.0, 15.0),
        (0.5, 9.5, 30.0),
        (10.0, 3.0, 5.0),
        (12.0, 2.5, 2.0),
        (1.0, 9.0, 25.0),
        (8.5, 4.0, 6.0),
        (0.8, 8.7, 28.0),
    ]

    # Column names, in the same order as the tuple fields above; every column
    # is used as a clustering feature.
    columns = ["distance_to_city_center", "popularity_score", "number_of_reviews"]
    df = spark.createDataFrame(data, schema=columns)

    # Pack all columns into the single vector column KMeans consumes.
    assembler = VectorAssembler(inputCols=columns, outputCol="features")
    df_features = assembler.transform(df).select("features")

    # KMeans with two clusters; the fixed seed keeps initialisation reproducible.
    # NOTE(review): the features are on different scales (number_of_reviews
    # spans 2-30, popularity_score 2.5-9.5), so the review count probably
    # dominates the distance computation. Consider scaling the features first
    # (e.g. pyspark.ml.feature.StandardScaler) if that is not intended.
    kmeans = KMeans(featuresCol="features", k=2, seed=1)
    model = kmeans.fit(df_features)
    predictions = model.transform(df_features)

    # Show each feature vector with its assigned cluster index.
    predictions.show(truncate=False)

    print("Cluster Centers:")
    for center in model.clusterCenters():
        print(center)
finally:
    # Stop Spark session
    spark.stop()


+--------------+----------+
|features      |prediction|
+--------------+----------+
|[2.5,8.0,15.0]|0         |
|[0.5,9.5,30.0]|1         |
|[10.0,3.0,5.0]|0         |
|[12.0,2.5,2.0]|0         |
|[1.0,9.0,25.0]|1         |
|[8.5,4.0,6.0] |0         |
|[0.8,8.7,28.0]|1         |
+--------------+----------+

Cluster Centers:
[8.25  4.375 7.   ]
[ 0.76666667  9.06666667 27.66666667]


## 3. Build a recommendation engine with Spark, using a dataset of your choice

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Start (or reuse) the Spark session for this cell.
spark = SparkSession.builder.appName("MusicRecommendation").getOrCreate()

# try/finally guarantees the session is stopped even when a step below raises,
# so a failed run does not leave a live session behind for a later
# getOrCreate() call to pick up.
try:
    # Explicit ratings: (user_id, song_id, rating)
    data = [
        (1, 301, 5.0),
        (1, 302, 3.5),
        (1, 303, 4.0),
        (2, 301, 2.0),
        (2, 304, 4.5),
        (2, 305, 3.0),
        (3, 302, 4.0),
        (3, 303, 2.5),
        (3, 306, 5.0),
        (4, 304, 4.0),
        (4, 305, 4.5),
        (4, 306, 5.0),
        (5, 301, 4.5),
        (5, 303, 4.0),
        (5, 305, 3.0),
    ]

    # Column names, in the same order as the tuple fields above.
    columns = ["user_id", "song_id", "rating"]

    # Create DataFrame
    df = spark.createDataFrame(data, schema=columns)

    # Split data 80/20; the fixed seed keeps the split reproducible.
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

    # Build ALS model for explicit feedback (implicitPrefs=False) with
    # non-negative factors. coldStartStrategy="drop" removes test rows whose
    # user or song was not seen during training instead of emitting NaN
    # predictions for them.
    als = ALS(
        userCol="user_id",
        itemCol="song_id",
        ratingCol="rating",
        coldStartStrategy="drop",
        nonnegative=True,
        implicitPrefs=False,
        rank=10,
        maxIter=10,
        regParam=0.1,
    )

    # Train the model
    model = als.fit(train_data)

    # Predict ratings for the held-out rows.
    predictions = model.transform(test_data)
    predictions.show()

    # Evaluate RMSE. With such a small test split the cold-start drop can
    # remove every row; there is then nothing to score, so report that instead
    # of asking the evaluator for a metric over an empty DataFrame.
    if predictions.count() == 0:
        print("Test RMSE: n/a (no test rows left after cold-start drop)")
    else:
        evaluator = RegressionEvaluator(
            metricName="rmse", labelCol="rating", predictionCol="prediction"
        )
        rmse = evaluator.evaluate(predictions)
        print(f"Test RMSE: {rmse:.2f}")

    # Recommend top 3 songs for each user.
    # NOTE(review): the list is not filtered against the user's existing
    # ratings, so it can contain songs the user has already rated.
    user_recs = model.recommendForAllUsers(3)
    user_recs.show(truncate=False)
finally:
    # Stop Spark session
    spark.stop()


+-------+-------+------+----------+
|user_id|song_id|rating|prediction|
+-------+-------+------+----------+
|      3|    302|   4.0| 1.1997145|
|      5|    301|   4.5| 1.2111512|
+-------+-------+------+----------+

Test RMSE: 3.05
+-------+------------------------------------------------------+
|user_id|recommendations                                       |
+-------+------------------------------------------------------+
|1      |[{301, 4.856713}, {302, 3.4991}, {304, 2.9036098}]    |
|2      |[{304, 4.2876225}, {306, 3.2775788}, {305, 3.0627122}]|
|3      |[{306, 4.945777}, {305, 4.221437}, {304, 3.9598732}]  |
|4      |[{306, 5.0002613}, {305, 4.276373}, {304, 4.032666}]  |
|5      |[{306, 3.4234986}, {305, 2.9560266}, {304, 2.9392178}]|
+-------+------------------------------------------------------+

