In [1]:
pip install pandas numpy scikit-learn surprise transformers


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357284 sha256=5e21473c8233a728c7314e12d57e85cd70ca286a0d8ba64fd7b4f46c32bf2393
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from transformers import pipeline

In [3]:
# Load the MovieLens 20M dataset
movies = pd.read_csv('/content/movie.csv')
ratings = pd.read_csv('/content/rating.csv')
print("Movies:")
print(movies.head())
print("\nRatings:")
print(ratings.head())


Movies:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings:
   userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40


In [4]:
# Popularity Filtering
def popularity_recommendations(n=10):
    popular_movies = ratings.groupby('movieId').size().reset_index(name='count')
    popular_movies = popular_movies.merge(movies, on='movieId')
    popular_movies = popular_movies.sort_values('count', ascending=False).head(n)
    return popular_movies[['title', 'count']]

In [5]:
#Content Based Filtering
def content_based_recommendations(title, n=10):
    # Check if the title exists in the DataFrame
    if title not in movies['title'].values:
        print(f"Movie '{title}' not found in the dataset.")
        return pd.DataFrame()  # Return an empty DataFrame

    tfidf = TfidfVectorizer(stop_words='english')
    movies['genres'] = movies['genres'].str.replace('|', ' ')
    tfidf_matrix = tfidf.fit_transform(movies['genres'])

    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    idx = movies.index[movies['title'].str.lower() == title.lower()][0]  # Case insensitive search
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]

    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][['title', 'genres']]


In [6]:
# Collaborative Filtering using SVD
def collaborative_recommendations(user_id, n=10):
    reader = Reader(rating_scale=(0.5, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    trainset, testset = train_test_split(data, test_size=0.2)

    algo = SVD()
    algo.fit(trainset)

    user_movies = ratings[ratings['userId'] == user_id]['movieId']
    all_movies = ratings['movieId'].unique()
    movies_to_predict = [movie for movie in all_movies if movie not in user_movies.values]

    predictions = []
    for movie in movies_to_predict:
        predictions.append((movie, algo.predict(user_id, movie).est))

    predictions.sort(key=lambda x: x[1], reverse=True)
    top_predictions = predictions[:n]

    return movies[movies['movieId'].isin([x[0] for x in top_predictions])][['title']]

In [7]:
# LLMs for Recommendation
def llm_recommendations(prompt, n=10):
    nlp = pipeline('text-generation', model='gpt2')  # or any other LLM model you prefer
    response = nlp(prompt, max_length=50, num_return_sequences=1)
    return response[0]['generated_text']

In [8]:
print("Popularity Recommendations:")
print(popularity_recommendations())

Popularity Recommendations:
                                          title  count
293                         Pulp Fiction (1994)  67310
352                         Forrest Gump (1994)  66172
315            Shawshank Redemption, The (1994)  63366
587            Silence of the Lambs, The (1991)  63299
476                        Jurassic Park (1993)  59715
257   Star Wars: Episode IV - A New Hope (1977)  54502
108                           Braveheart (1995)  53769
583           Terminator 2: Judgment Day (1991)  52244
2486                         Matrix, The (1999)  51334
523                     Schindler's List (1993)  50054


In [10]:

print("\nContent-Based Recommendations for 'Toy Story (1995)':")
print(content_based_recommendations('Toy Story (1995)'))


Content-Based Recommendations for 'Toy Story (1995)':
                                                   title  \
2209                                         Antz (1998)   
3027                                  Toy Story 2 (1999)   
3663      Adventures of Rocky and Bullwinkle, The (2000)   
3922                    Emperor's New Groove, The (2000)   
4790                               Monsters, Inc. (2001)   
10114  DuckTales: The Movie - Treasure of the Lost La...   
10987                                   Wild, The (2006)   
11871                             Shrek the Third (2007)   
13337                     Tale of Despereaux, The (2008)   
18274  Asterix and the Vikings (Astérix et les Viking...   

                                            genres  
2209   Adventure Animation Children Comedy Fantasy  
3027   Adventure Animation Children Comedy Fantasy  
3663   Adventure Animation Children Comedy Fantasy  
3922   Adventure Animation Children Comedy Fantasy  
4790   Adventure An

In [11]:
print("\nCollaborative Recommendations for User 1:")
print(collaborative_recommendations(1))


Collaborative Recommendations for User 1:
                                                   title
10286                                    Serenity (2005)
13684                                   Star Trek (2009)
13748                       Our Folks (Sami swoi) (1967)
13996  Heimat - A Chronicle of Germany (Heimat - Eine...
15347                                    Betrayal (1983)
17018                                 Connections (1978)
20540                               Frozen Planet (2011)
22919                                Interstellar (2014)
24693                          The Imitation Game (2014)
24801          Doctor Who: The Time of the Doctor (2013)


In [12]:
print("\nLLM Recommendations for prompt 'Recommend a movie based on a crime theme':")
print(llm_recommendations("Recommend a movie based on a crime theme"))


LLM Recommendations for prompt 'Recommend a movie based on a crime theme':


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Recommend a movie based on a crime theme instead. A movie about the death penalty, a crime theme. No reason to change the title to something else; this won't offend anyone, most of the book will suffice.


But this is the


In [8]:
print(movies['title'].unique()[:10])  # Print first 10 titles for verification


['Toy Story (1995)' 'Jumanji (1995)' 'Grumpier Old Men (1995)'
 'Waiting to Exhale (1995)' 'Father of the Bride Part II (1995)'
 'Heat (1995)' 'Sabrina (1995)' 'Tom and Huck (1995)'
 'Sudden Death (1995)' 'GoldenEye (1995)']


In [9]:
user_id = 1  # Example user ID
true_positive_movies = ['Toy Story (1995)', 'Jumanji (1995)', 'Heat (1995)']  # Movies this user liked


In [10]:
# Generate Recommendations
pop_recs = popularity_recommendations()
content_recs = content_based_recommendations('Toy Story (1995)')
collab_recs = collaborative_recommendations(user_id)


In [11]:
# Flatten the recommendations to a list of titles for evaluation
pop_recommendations = pop_recs['title'].tolist()
content_recommendations = content_recs['title'].tolist()
collab_recommendations = collab_recs['title'].tolist()

In [12]:
# Combine all recommendations for evaluation
all_recommendations = {
    "Popularity": pop_recommendations,
    "Content-Based": content_recommendations,
    "Collaborative": collab_recommendations
}

In [13]:
def evaluate_recommendations(true_movies, recommendations):
    # Create a binary array for true positives
    y_true = [1 if movie in true_movies else 0 for movie in recommendations]
    precision = precision_score([1] * len(true_movies) + [0] * (len(recommendations) - len(true_movies)), y_true, zero_division=0)
    recall = recall_score([1] * len(true_movies) + [0] * (len(recommendations) - len(true_movies)), y_true, zero_division=0)
    return precision, recall

In [15]:
from sklearn.metrics import precision_score, recall_score

In [16]:
# Print Evaluation Results
for method, recs in all_recommendations.items():
    precision, recall = evaluate_recommendations(true_positive_movies, recs)
    print(f"{method} Recommendations:")
    print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")
    print(f"Recommended Movies: {recs}\n")

Popularity Recommendations:
Precision: 0.00, Recall: 0.00
Recommended Movies: ['Pulp Fiction (1994)', 'Forrest Gump (1994)', 'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)', 'Jurassic Park (1993)', 'Star Wars: Episode IV - A New Hope (1977)', 'Braveheart (1995)', 'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)', "Schindler's List (1993)"]

Content-Based Recommendations:
Precision: 0.00, Recall: 0.00
Recommended Movies: ['Antz (1998)', 'Toy Story 2 (1999)', 'Adventures of Rocky and Bullwinkle, The (2000)', "Emperor's New Groove, The (2000)", 'Monsters, Inc. (2001)', 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)', 'Wild, The (2006)', 'Shrek the Third (2007)', 'Tale of Despereaux, The (2008)', 'Asterix and the Vikings (Astérix et les Vikings) (2006)']

Collaborative Recommendations:
Precision: 0.00, Recall: 0.00
Recommended Movies: ['Children of Heaven, The (Bacheha-Ye Aseman) (1997)', 'Pride and Prejudice (1995)', 'Best of Youth, The (La meglio 

In [17]:
print("True Positive Movies:")
print(true_positive_movies)

print("\nRecommended Movies:")
print(pop_recommendations)
print(content_recommendations)
print(collab_recommendations)


True Positive Movies:
['Toy Story (1995)', 'Jumanji (1995)', 'Heat (1995)']

Recommended Movies:
['Pulp Fiction (1994)', 'Forrest Gump (1994)', 'Shawshank Redemption, The (1994)', 'Silence of the Lambs, The (1991)', 'Jurassic Park (1993)', 'Star Wars: Episode IV - A New Hope (1977)', 'Braveheart (1995)', 'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)', "Schindler's List (1993)"]
['Antz (1998)', 'Toy Story 2 (1999)', 'Adventures of Rocky and Bullwinkle, The (2000)', "Emperor's New Groove, The (2000)", 'Monsters, Inc. (2001)', 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)', 'Wild, The (2006)', 'Shrek the Third (2007)', 'Tale of Despereaux, The (2008)', 'Asterix and the Vikings (Astérix et les Vikings) (2006)']
['Children of Heaven, The (Bacheha-Ye Aseman) (1997)', 'Pride and Prejudice (1995)', 'Best of Youth, The (La meglio gioventù) (2003)', 'Bittersweet Life, A (Dalkomhan insaeng) (2005)', 'It Might Get Loud (2008)', 'Avengers, The (2012)', 'North & South (2004)', 