In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark=SparkSession.builder.config("spark-driver.host", "localhost").config("spark.driver.memory","4g") \
.config("spark.executor.memory","4g").appName("mr").getOrCreate()

In [5]:
spark

In [6]:
#import all required libraries

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline


In [7]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Connect to HDFS") \
    .config("spark.hadoop.fs.defaultFS", "hdfs://10.0.2.15:9000") \
    .getOrCreate()

# Test HDFS access by listing files
hdfs_path = "hdfs:10.0.2.15:9000/user/Group04/Books.csv"
df = spark.read.csv(hdfs_path, header=True, inferSchema=True)
df.show(5)

column = ['Id','User_id','review/score','Title']
df = df.select([col for col in column]).toDF(*column)
df = df.withColumn("review/score", df["review/score"].cast("float"))
df = df.na.drop("any")
df = df.dropDuplicates(subset=column)
df.show(5)


+----------+--------------+------------+--------------------+
|        Id|       User_id|review/score|               Title|
+----------+--------------+------------+--------------------+
|1592290574|A30FYZFN4GBJ67|         4.0|The ABAP Quick Re...|
|0802130240| AOF9G8IWSFIMS|         4.0|Manson in His Own...|
|1421810182| A5FR5OUCWS9VS|         2.0|             Herland|
|1421810182|A3UM8L4OJHXUAV|         3.0|             Herland|
|0072936533|A1AU5BSZ11ZV10|         4.0|The Gregg Referen...|
+----------+--------------+------------+--------------------+
only showing top 5 rows



In [8]:
Id_limit = 200
User_id_limit = 10

Id_counts = df.groupBy("Id").count().filter(f"count >= {Id_limit}")
User_id_counts = df.groupBy("User_id").count().filter(f"count >= {User_id_limit}")

In [9]:
filtered_df = df.join(Id_counts, on="Id", how="inner") \
                .join(User_id_counts, on="User_id", how="inner") \
                .drop("count")
string_indexer_id = StringIndexer(inputCol="Id", outputCol="IdIndex")
string_indexer_user = StringIndexer(inputCol="User_id", outputCol="UserIndex")


pipeline = Pipeline(stages=[string_indexer_id, string_indexer_user])
new_df = pipeline.fit(filtered_df).transform(filtered_df)

In [10]:
(train_data, test_data) = new_df.randomSplit([0.8, 0.2], seed=42)

In [11]:
#ALS Model

als = ALS(maxIter=5, regParam=0.01, userCol="UserIndex", itemCol="IdIndex", ratingCol="review/score",coldStartStrategy="drop")
model = als.fit(train_data)



In [12]:
#predictions

predictions = model.transform(test_data)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="review/score", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 2.8932583764149697


In [13]:
#give user recommendations
user_recommend = model.recommendForAllUsers(4)

book_id_to_title = dict(new_df.select("IdIndex", "Title").rdd.map(lambda r: (r.IdIndex, r.Title)).collect())
user_index_to_id = dict(new_df.select("UserIndex", "User_id").rdd.map(lambda r: (r.UserIndex, r.User_id)).collect())

#print the output
count = 0
for user_recommend in user_recommend.select("UserIndex", "recommendations").collect():
    if count == 3: break
    else:
        user_id_index =user_recommend.UserIndex
        User_id = user_index_to_id[user_id_index]
        recommended_items = user_recommend.recommendations

        print(f"User: {User_id}")
        print("Relevant Items:")
        for i, rec in enumerate(recommended_items):
            Id= rec.IdIndex
            Title = book_id_to_title[Id]
            print(f"{i+1}. {Title}")
        print("--------")
    count+=1

User: A14OJS0VWMOSWO
Relevant Items:
1. Stones from the River
2. The Inheritance of Loss: A Novel (Man Booker Prize)
3. How To Cook Everything: Simple Recipes for Great Food
4. Jane Eyre (New Windmill)
--------
User: A1T17LMQABMBN5
Relevant Items:
1. Stones from the River
2. The Inheritance of Loss: A Novel (Man Booker Prize)
3. The Duke and I (Bridgerton Series, Book 1)
4. I Feel Bad About My Neck: And Other Thoughts on Being a Woman
--------
User: A1EKTLUL24HDG8
Relevant Items:
1. Great Expectations
2. The Awakening
3. Narrative of the Life of Frederick Douglass
4. The Awakening
--------


In [14]:
# Original DataFrame from recommendForAllUsers
recommendations_df = model.recommendForAllUsers(4)

# Convert recommendations to a list of dictionaries for new DataFrame creation
recommendations_data = []

for row in recommendations_df.select("UserIndex", "recommendations").collect():
    user_id_index = row.UserIndex
    User_id = user_index_to_id[user_id_index]
    recommended_items = row.recommendations

    # Extract titles
    titles = [book_id_to_title[rec.IdIndex] for rec in recommended_items]
    recommendations_data.append({"user_id": User_id, "titles": titles})

# Define schema for the new DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("titles", ArrayType(StringType()), True)
])

# Create the new DataFrame
spark = SparkSession.builder.getOrCreate()
result_df = spark.createDataFrame(recommendations_data, schema)

# Show the resulting DataFrame
result_df.show(truncate=False)


+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id       |titles                                                                                                                                                                                                                                                                                                 |
+--------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|A14OJS0VWMOSWO|[Stones from the River, The Inheritance of Lo

In [15]:
# Convert the Spark DataFrame to a Pandas DataFrame
pandas_df = result_df.toPandas()

# Display the Pandas DataFrame
pandas_df.head()


Unnamed: 0,user_id,titles
0,A14OJS0VWMOSWO,"[Stones from the River, The Inheritance of Los..."
1,A1T17LMQABMBN5,"[Stones from the River, The Inheritance of Los..."
2,A1EKTLUL24HDG8,"[Great Expectations, The Awakening, Narrative ..."
3,A17FLA8HQOFVIG,[How To Cook Everything: Simple Recipes for Gr...
4,A22RY8N8CNDF3A,"[The Female Brain, I Feel Bad About My Neck: A..."


In [16]:
import pickle
# Save as .pkl file
with open("Collab_model.pkl", "wb") as f:
    pickle.dump(pandas_df, f)


In [17]:
print(type(result_df))
print(type(pandas_df))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [None]:
spark.stop()