**Install required libraries**

In [3]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357286 sha256=878465537eefc42f7688bc3a239711bcf2896d4bd63a339e4e422dc05a1f743f
  Stored in directory: /root/.cach

**Import Required Libraries**

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise import Reader
from google.colab import files
import zipfile
from sklearn.metrics import mean_squared_error, mean_absolute_error
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from sklearn.metrics.pairwise import cosine_similarity

**Upload and Extract Dataset**

In [40]:
uploaded = files.upload()
for file_name in uploaded.keys():
    with zipfile.ZipFile(file_name, 'r') as zip_ref:
        zip_ref.extractall("./movielens")


Saving ml-latest-small.zip to ml-latest-small (2).zip


**Load the Dataset**

In [41]:
ratings_file = './movielens/ml-latest-small/ratings.csv'
movies_file = './movielens/ml-latest-small/movies.csv'

ratings_df = pd.read_csv(ratings_file)
movies_df = pd.read_csv(movies_file)

print("Sample of Ratings Dataset:")
print(ratings_df.head())
print("Sample of Movies Dataset:")
print(movies_df.head())

Sample of Ratings Dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
Sample of Movies Dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


**Preprocessing of Data**

In [None]:
# Handling missing values
print(ratings_df.isnull().sum())
ratings_df.dropna(subset=['rating'], inplace=True)

print(movies_df.isnull().sum())
movies_df['genres'].fillna('Unknown', inplace=True)

# Handling duplicates
ratings_df.drop_duplicates(subset=['userId', 'movieId'], inplace=True)

# one-hot encoding categorical data
genres_encoded = movies_df['genres'].str.get_dummies('|')
movies_df = pd.concat([movies_df, genres_encoded], axis=1)

# Handling User or Movie ID inconsistencies
ratings_df['userId'] = ratings_df['userId'].astype(int)
ratings_df['movieId'] = ratings_df['movieId'].astype(int)

# Extracting more detailed time features
ratings_with_context['weekday'] = ratings_with_context['timestamp'].dt.weekday
ratings_with_context['month'] = ratings_with_context['timestamp'].dt.month

userId       0
movieId      0
rating       0
timestamp    5
dtype: int64
movieId    0
title      0
genres     0
dtype: int64


**Prepare the data for Surprise library**

In [61]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

**Train-Test Split for Surprise**

In [62]:
trainset, testset = surprise_train_test_split(data, test_size=0.2)

**Train the SVD Model**

In [63]:
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x78597c283af0>

**Evaluating the Model**

In [64]:
cross_validate(model, data, cv=5, verbose=True)

# Evaluate the RMSE and MAE values
predictions = model.test(testset)
rmse = np.sqrt(mean_squared_error([pred.r_ui for pred in predictions], [pred.est for pred in predictions]))
mae = mean_absolute_error([pred.r_ui for pred in predictions], [pred.est for pred in predictions])

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8733  0.8712  0.8662  0.8803  0.8765  0.8735  0.0048  
MAE (testset)     0.6727  0.6699  0.6655  0.6763  0.6735  0.6716  0.0037  
Fit time          1.08    1.76    1.04    1.04    1.12    1.21    0.28    
Test time         0.17    0.18    0.09    0.11    0.33    0.17    0.08    
RMSE: 0.6951679814170312
MAE: 0.5355930759156361


**Function to Recommend Movies for Existing Users**

In [65]:
# To generate predictions for a given user
def get_recommendations(user_id, n=10):
    rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    unrated_movies = movies_df[~movies_df['movieId'].isin(rated_movies)]

    predictions = []
    for movie_id in unrated_movies['movieId']:
        pred = model.predict(user_id, movie_id)
        predictions.append((movie_id, pred.est))

    predictions = sorted(predictions, key=lambda x: x[1], reverse=True)
    top_recommendations = predictions[:n]

    recommended_movies = []
    for movie_id, predicted_rating in top_recommendations:
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        movie_title = movie_info['title'].values[0]
        recommended_movies.append((movie_title, predicted_rating))

    return recommended_movies

**Function for Cold-Start Handling for New Users**

In [66]:
# Handle Cold-Start for New Users
def recommend_for_cold_start(user_id, preferred_genre=None, top_n=10):
    if user_id not in ratings_df['userId'].unique():
        print(f"New user detected: User {user_id}")
        if preferred_genre:
            # Recommend the movies based on genre
            genre_movies = movies_df[movies_df['genres'].str.contains(preferred_genre, na=False)]
            top_genre_movies = genre_movies.sample(n=min(top_n, len(genre_movies)))
            return top_genre_movies[['title', 'genres']]
        else:
            # or recommend popular movies
            popular_movies = ratings_df.groupby('movieId')['rating'].mean().sort_values(ascending=False).head(top_n)
            top_movies = popular_movies.index
            return movies_df[movies_df['movieId'].isin(top_movies)][['title', 'genres']]
    else:
        return get_recommendations(user_id, n=top_n)

**Contextual Recommendations**

In [67]:
# Add the genres to ratings data
ratings_with_context = pd.merge(ratings_df, movies_df, on="movieId")

# Extract the time of day from timestamp
ratings_with_context['timestamp'] = pd.to_datetime(ratings_with_context['timestamp'], unit='s')
ratings_with_context['hour'] = ratings_with_context['timestamp'].dt.hour

# Time of day categories
def get_time_of_day(hour):
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

ratings_with_context['time_of_day'] = ratings_with_context['hour'].apply(get_time_of_day)

# Function for contextual recommendations
def get_contextual_recommendations(user_id, preferred_genre=None, time_of_day=None, n=10):
    # Filter out the unrated movies for the user
    rated_movies = ratings_df[ratings_df['userId'] == user_id]['movieId'].unique()
    unrated_movies = movies_df[~movies_df['movieId'].isin(rated_movies)]

    # Apply the genre filter
    if preferred_genre:
        unrated_movies = unrated_movies[unrated_movies['genres'].str.contains(preferred_genre, na=False)]

    # Predict the ratings for unrated movies
    predictions = []
    for movie_id in unrated_movies['movieId']:
        pred = model.predict(user_id, movie_id)
        predictions.append((movie_id, pred.est))

    # Sort the predictions by predicted ratings
    predictions = sorted(predictions, key=lambda x: x[1], reverse=True)

    # Select the top N recommendations
    top_recommendations = predictions[:n]

    # Convert the movie IDs back to titles
    recommended_movies = []
    for movie_id, predicted_rating in top_recommendations:
        movie_info = movies_df[movies_df['movieId'] == movie_id]
        movie_title = movie_info['title'].values[0]
        recommended_movies.append((movie_title, predicted_rating))

    # Function to filter recommendations further by time of day if its provided
    if time_of_day:
        recommendations_with_context = [
            (title, rating) for title, rating in recommended_movies
            if time_of_day in ratings_with_context['time_of_day'].unique()
        ]
        return recommendations_with_context

    return recommended_movies

**Real-Time Data Handling : Updating the Model with New Ratings**

In [68]:
def update_model(new_ratings):
    global ratings_df
    ratings_df = pd.concat([ratings_df, new_ratings], ignore_index=True)

    new_data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)
    trainset = new_data.build_full_trainset()
    model.fit(trainset)
    return model

# Simulating real-time user interaction
new_user_ratings = pd.DataFrame({'userId': [1001], 'movieId': [50], 'rating': [4.0]})
model = update_model(new_user_ratings)

**Adding scalability with Apache Spark**

In [69]:
spark = SparkSession.builder.appName('RecommenderSystem').getOrCreate()

ratings_spark_df = spark.read.csv("movielens/ml-latest-small/ratings.csv", header=True, inferSchema=True)

als = ALS(maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop")
model1 = als.fit(ratings_spark_df)

predictions = model1.transform(ratings_spark_df)
predictions.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|   463|   1088|   3.5|1145460096| 2.9916005|
|   137|   1580|   3.5|1204859475|  3.382752|
|   580|   1580|   4.0|1167792444| 3.4570491|
|   580|   3175|   2.5|1167792674|  2.604307|
|   580|  44022|   3.5|1167792560| 3.7415185|
|   133|    471|   4.0| 843491793| 3.6825655|
|   322|   1580|   3.5|1217676294| 3.3639853|
|   362|   1591|   4.0|1530638157| 3.1828601|
|   362|   1645|   5.0|1530641485| 4.3069506|
|   593|   1580|   1.5|1181007882|  2.530834|
|   597|    471|   2.0| 941558175| 4.6340265|
|   597|   1580|   3.0| 941558308| 3.5674891|
|   597|   1959|   4.0| 941640006| 4.0461583|
|   597|   2366|   5.0| 941729029|  4.740429|
|   108|   1959|   5.0|1042840682|  4.958029|
|   155|   1580|   4.0| 965939614|  3.723838|
|   155|   3175|   4.0| 961861723| 3.6331384|
|    34|   1580|   2.5|1162048827| 4.0674024|
|    34|   3997|   2.0|1162050228|

**Recommendations Function example**

In [70]:
user_id = 1  # A known user ID
recommended_movies = get_recommendations(user_id, n=5)

print(f"Top 5 movie recommendations for User {user_id}:")
for movie, rating in recommended_movies:
    print(f"{movie} (Predicted Rating: {rating:.2f})")

Top 5 movie recommendations for User 1:
Twelve Monkeys (a.k.a. 12 Monkeys) (1995) (Predicted Rating: 5.00)
Shawshank Redemption, The (1994) (Predicted Rating: 5.00)
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) (Predicted Rating: 5.00)
Godfather, The (1972) (Predicted Rating: 5.00)
Casablanca (1942) (Predicted Rating: 5.00)


**Cold-Start Example**

In [71]:
new_user_id = 999 # A new user ID
preferred_genre = 'Comedy'
print("Cold-Start Recommendations:")
print(recommend_for_cold_start(new_user_id, preferred_genre))

Cold-Start Recommendations:
New user detected: User 999
                                title                           genres
6511  Valet, The (La doublure) (2006)                           Comedy
2688            Anchors Aweigh (1945)                   Comedy|Musical
8123               Croods, The (2013)       Adventure|Animation|Comedy
7477      Love and Other Drugs (2010)             Comedy|Drama|Romance
7633             Your Highness (2011)  Action|Adventure|Comedy|Fantasy
1960                  Election (1999)                           Comedy
3053         Born in East L.A. (1987)                           Comedy
6734                Religulous (2008)               Comedy|Documentary
7389         Ramona and Beezus (2010)                  Children|Comedy
6256     Stranger than Fiction (2006)     Comedy|Drama|Fantasy|Romance


**Contextual Recommendations Example**

In [72]:
# Contextual Recommendations for User 1
user_id = 1
preferred_genre = 'Comedy'
time_of_day = 'evening'
recommendations = get_contextual_recommendations(user_id, preferred_genre, time_of_day)

print("Contextual Recommendations:")
for movie, rating in recommendations:
    print(f"{movie} (Predicted Rating: {rating:.2f})")

Contextual Recommendations:
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964) (Predicted Rating: 5.00)
Superbad (2007) (Predicted Rating: 5.00)
Girl Who Leapt Through Time, The (Toki o kakeru shôjo) (2006) (Predicted Rating: 4.97)
Top Secret! (1984) (Predicted Rating: 4.97)
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001) (Predicted Rating: 4.97)
Life Is Beautiful (La Vita è bella) (1997) (Predicted Rating: 4.95)
Raising Arizona (1987) (Predicted Rating: 4.94)
Graduate, The (1967) (Predicted Rating: 4.94)
Lost in Translation (2003) (Predicted Rating: 4.93)
Roman Holiday (1953) (Predicted Rating: 4.92)
