In [1]:
import pandas as pd
import numpy as np

### LLM Prompting for Recommendation

Data Link: https://grouplens.org/datasets/movielens/100k/

In [2]:
DATASET_LINK='http://files.grouplens.org/datasets/movielens/ml-100k.zip'

In [3]:
!wget -nc http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -n ml-100k.zip

--2024-06-16 15:48:23--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2024-06-16 15:48:24 (16.9 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip


In [4]:
overall_stats = pd.read_csv('ml-100k/u.info', header=None)
print("Details of users, items and ratings involved in the loaded movielens dataset: ",list(overall_stats[0]))

Details of users, items and ratings involved in the loaded movielens dataset:  ['943 users', '1682 items', '100000 ratings']


In [33]:
## same item id is same as movie id, item id column is renamed as movie id
column_names1 = ['user_id','movie_id','rating','timestamp']
dataset = pd.read_csv('ml-100k/u.data', sep='\t',header=None,names=column_names1)
dataset.head() 

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [34]:
len(dataset), max(dataset['movie_id']),min(dataset['movie_id'])

(100000, 1682, 1)

In [37]:
d = 'movie_id | movie_title | release_date | video_release_date | IMDb_URL | unknown | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'
column_names2 = d.split(' | ')
column_names2

['movie_id',
 'movie_title',
 'release_date',
 'video_release_date',
 'IMDb_URL',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [38]:
items_dataset = pd.read_csv('ml-100k/u.item', sep='|',header=None,names=column_names2,encoding='latin-1')
items_dataset

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
movie_dataset = items_dataset[['movie_id','movie_title']]
movie_dataset.head()

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [40]:
len(items_dataset.groupby(by=column_names2[1:])),len(items_dataset)

(1664, 1682)

In [41]:
merged_dataset = pd.merge(dataset, movie_dataset, how='inner', on='movie_id')
merged_dataset.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [43]:
merged_dataset[(merged_dataset['movie_title'] == 'Chasing Amy (1997)') & (merged_dataset['user_id'] == 894)]

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
62716,894,246,4,882404137,Chasing Amy (1997)
90596,894,268,3,879896041,Chasing Amy (1997)


In [44]:
num_users = len(merged_dataset['user_id'].value_counts())
num_items = len(merged_dataset['movie_title'].value_counts())
print('Unique number of users in the dataset: {}'.format(num_users))
print('Unique number of movies in the dataset: {}'.format(num_items))

Unique number of users in the dataset: 943
Unique number of movies in the dataset: 1664


In [89]:
from pprint import pprint

def user_movie_list(user_id):
    user_data = merged_dataset[merged_dataset['user_id'] == user_id][['movie_title', 'rating']]
    sorted_user_data = user_data.sort_values(by='rating', ascending=False)
    print("Movies seen by the User ranked by rating:")
    pprint(sorted_user_data)
    print("")

In [90]:
user_movie_list(user_id=28)

Movies seen by the User ranked by rating:
                                      movie_title  rating
19984                              Contact (1997)       5
44809                               Psycho (1960)       5
18512      American Werewolf in London, An (1981)       5
31039                         Pulp Fiction (1994)       5
10032             Star Trek: First Contact (1996)       5
...                                           ...     ...
30169        Star Trek: The Motion Picture (1979)       2
61533  Star Trek III: The Search for Spock (1984)       2
4983                Amityville Horror, The (1979)       2
42280                       Murder at 1600 (1997)       2
16234      Star Trek V: The Final Frontier (1989)       1

[79 rows x 2 columns]



In [49]:
from sentence_transformers import SentenceTransformer

In [50]:
model = SentenceTransformer('all-MiniLM-L6-v2')



In [51]:
df = merged_dataset.copy()

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      100000 non-null  int64 
 1   movie_id     100000 non-null  int64 
 2   rating       100000 non-null  int64 
 3   timestamp    100000 non-null  int64 
 4   movie_title  100000 non-null  object
dtypes: int64(4), object(1)
memory usage: 3.8+ MB


In [55]:
df['combined'] = (
    'user id: ' + df['user_id'].astype(str) + ', ' + 
    'movie id: ' + df['movie_id'].astype(str) + ', ' + 
    'rating: ' + df['rating'].astype(str) + ', ' + 
    'timestamp: ' + df['timestamp'].astype(str) + ', ' + 
    'movie title: ' + df['movie_title']
)

In [56]:
from llama_index.core import Document

documents = [
    Document(
        text=row['combined']
    )
    for _, row in df.iterrows()
]

### Weaviate Client

In [57]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

In [59]:
import weaviate
import os

client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"), # Replace with your Weaviate Cloud URL
    auth_client_secret=weaviate.auth.AuthApiKey(os.getenv("WEAVIATE_API_KEY")),  # Replace w/ your Weaviate instance API key
    additional_headers={"X-Cohere-Api-Key": os.environ['COHERE_API_KEY'],}
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [60]:
client.is_ready()

True

In [64]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

In [65]:
# construct vector store
vector_store = WeaviateVectorStore(
    weaviate_client=client, 
    index_name="LlamaIndex"
)

In [66]:
# Set up the storage for the embeddings
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [67]:
if client.schema.exists("LlamaIndex"):
    client.schema.delete_class("LlamaIndex")
    
# index = VectorStoreIndex(
#     documents,
#     storage_context = storage_context,
# )

In [68]:
from llama_index.core.memory import ChatMemoryBuffer

memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

In [69]:
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. Use the following context including user id, movie id, movie titles, timestamp, and rating to answer any questions"
        "It's ok if you don't know the answer."
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [76]:
response = chat_engine.stream_chat("Could you recommend three movies for user_id 28? ")
for token in response.response_gen:
    print(token, end="")

Based on user 28's previous ratings for "Star Wars (1977)" and "Unforgettable (1996)", I would recommend the following three movies:

1. The Matrix (1999) - A sci-fi action film that I think you would enjoy based on your rating for "Star Wars".
2. The Sixth Sense (1999) - A psychological thriller that I think you would appreciate.
3. The Shawshank Redemption (1994) - A classic drama that I believe you would like based on your rating for "Unforgettable".

I hope you find these recommendations interesting! Let me know if you need more suggestions.

### Sequential recommendations

In [100]:
def get_context_prompt(list_of_movies):
   # assemble user prompt
  prompt = None
  if len(list_of_movies) > 0:
    movies = ', '.join(list_of_movies)
    prompt =  f"A user watch the following movies: {movies}. What the next 3 movie would he be likely to watch next?"
    prompt += " Express your response with a key of 'next_movies' and a value representing your array of recommended movies."

  return prompt

In [101]:
# get prompt with movie titles
user_prompt = get_user_prompt(["Margin Call", "The Big Short", "Moneyball", "The Martian",])
print(user_prompt)

A user watch the following items: Margin Call, The Big Short, Moneyball, The Martian. What the next 3 movie would he/she be likely to watch next? Express your response with a key of 'next_movies' and a value representing your array of recommended movies.


In [105]:
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. Use the following context including user id, movie id, movie titles, timestamp, and rating to answer any questions"
        "It's ok if you don't know the answer."
        "Here are the relevant movies for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [107]:
response = chat_engine.stream_chat(user_prompt)
for token in response.response_gen:
    print(token, end="")

{
    "next_movies": ["The Social Network", "Spotlight", "The Founder"]
}

### Rating Predictions

In [115]:
def get_context_prompt(list_of_movies):
   # assemble user prompt
  prompt = None
  if len(list_of_movies) > 0:
    movies = ', '.join(list_of_movies)
    prompt =  f"A user watch the following movies: {movies}."
    prompt += "Predict the user's rating on 'The Matrix'. Output the rating score only. Do not include other text."

  return prompt

In [119]:
# get prompt with movie titles and ratings
user_prompt = get_context_prompt(["Margin Call: 4", "The Big Short: 4", "Moneyball: 4.5", "The Martian: 4",])
print(user_prompt)

A user watch the following movies: Margin Call: 4, The Big Short: 4, Moneyball: 4.5, The Martian: 4.Predict the user's rating on 'The Matrix'. Output the rating score only. Do not include other text.


In [120]:
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. Use the following context including user id, movie id, movie titles, timestamp, and rating to answer any questions"
        "It's ok if you don't know the answer."
        "Rating (ranging from 1 to 5, with 5 being the highest) on a movie, based on that user's previous ratings for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [121]:
response = chat_engine.stream_chat(user_prompt)
for token in response.response_gen:
    print(token, end="")

4.5