In [1]:
import pandas as pd
import numpy as np

### LLM Prompting for Recommendation

Data Link: https://grouplens.org/datasets/movielens/100k/

In [2]:
# URL of the MovieLens 100K zip archive.
DATASET_LINK='http://files.grouplens.org/datasets/movielens/ml-100k.zip'

In [3]:
!wget -nc http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -n ml-100k.zip

--2024-06-16 15:48:23--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2024-06-16 15:48:24 (16.9 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]

Archive:  ml-100k.zip


In [2]:
# u.info has no header row; column 0 holds one short text summary per line.
overall_stats = pd.read_csv('ml-100k/u.info', header=None)
print(
    "Details of users, items and ratings involved in the loaded movielens dataset: ",
    overall_stats[0].tolist(),
)

Details of users, items and ratings involved in the loaded movielens dataset:  ['943 users', '1682 items', '100000 ratings']


In [3]:
# Ratings file: tab-separated, no header row. The item id is the movie id, so
# that column is named movie_id here.
column_names1 = ['user_id', 'movie_id', 'rating', 'timestamp']
dataset = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=column_names1,
)
dataset.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Number of rating rows, then the largest and smallest movie_id that occur in them.
len(dataset), max(dataset['movie_id']),min(dataset['movie_id'])

(100000, 1682, 1)

In [5]:
# Column names for u.item: five descriptive fields followed by the genre flag columns.
column_names2 = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
# Keep the pipe-delimited form available under its original name.
d = ' | '.join(column_names2)
column_names2

['movie_id',
 'movie_title',
 'release_date',
 'video_release_date',
 'IMDb_URL',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [6]:
# Movie metadata: pipe-separated, no header row, decoded as latin-1.
items_dataset = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    header=None,
    names=column_names2,
    encoding='latin-1',
)
items_dataset

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Narrow the movie metadata to the two columns needed when titles are merged onto the ratings.
movie_dataset = items_dataset[['movie_id','movie_title']]
movie_dataset.head()

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [8]:
# Compare the number of groups formed by every column except movie_id with the
# total row count. A smaller first number means some rows share all of their
# non-id metadata, i.e. the same movie is listed under more than one movie_id.
# NOTE(review): how NaN-valued keys are counted by groupby may depend on the pandas version — confirm.
len(items_dataset.groupby(by=column_names2[1:])),len(items_dataset)

(1664, 1682)

In [9]:
# Attach the movie title to every rating row through the shared movie_id key.
merged_dataset = dataset.merge(movie_dataset, how='inner', on='movie_id')
merged_dataset.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [10]:
# Example of a duplicated title: one user can hold two ratings for the same
# title when that title is listed under two different movie_id values.
merged_dataset.loc[
    (merged_dataset['movie_title'] == 'Chasing Amy (1997)')
    & (merged_dataset['user_id'] == 894)
]

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
62716,894,246,4,882404137,Chasing Amy (1997)
90596,894,268,3,879896041,Chasing Amy (1997)


In [11]:
# Distinct users, and distinct movie titles (titles are counted, not movie_id values).
num_users = merged_dataset['user_id'].nunique()
num_items = merged_dataset['movie_title'].nunique()
print(f'Unique number of users in the dataset: {num_users}')
print(f'Unique number of movies in the dataset: {num_items}')

Unique number of users in the dataset: 943
Unique number of movies in the dataset: 1664


In [12]:
from pprint import pprint

def user_movie_list(user_id, ratings=None):
    """Print one user's rated movies, highest rating first.

    Args:
        user_id: Value matched against the ``user_id`` column.
        ratings: Optional DataFrame with ``user_id``, ``movie_title`` and
            ``rating`` columns. When omitted, the notebook-level
            ``merged_dataset`` is used, so existing calls behave as before.

    Returns:
        None. The table is written to stdout.
    """
    if ratings is None:
        # Looked up at call time, so the function may be defined before the
        # merge cell has run.
        ratings = merged_dataset
    user_data = ratings.loc[ratings['user_id'] == user_id, ['movie_title', 'rating']]
    sorted_user_data = user_data.sort_values(by='rating', ascending=False)
    print("Movies seen by the User ranked by rating:")
    pprint(sorted_user_data)
    print("")

In [57]:
# Rating history for user 82, the user the chat engine is asked about later in the notebook.
user_movie_list(user_id=82)

Movies seen by the User ranked by rating:
                                             movie_title  rating
20443                        Beauty and the Beast (1991)       5
83603                                  Casablanca (1942)       5
18645                              Apartment, The (1960)       5
19755                     Raiders of the Lost Ark (1981)       5
23907             Monty Python and the Holy Grail (1974)       5
...                                                  ...     ...
55841                               Mars Attacks! (1996)       1
55027              Heidi Fleiss: Hollywood Madam (1995)        1
19427                                  Phat Beach (1996)       1
8275                                 Stupids, The (1996)       1
4065   Tales from the Crypt Presents: Bordello of Blo...       1

[168 rows x 2 columns]



In [15]:
# Work on a copy so the 'combined' column added below does not modify merged_dataset.
df = merged_dataset.copy()

In [16]:
# Column dtypes and non-null counts of the working copy.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      100000 non-null  int64 
 1   movie_id     100000 non-null  int64 
 2   rating       100000 non-null  int64 
 3   timestamp    100000 non-null  int64 
 4   movie_title  100000 non-null  object
dtypes: int64(4), object(1)
memory usage: 3.8+ MB


In [17]:
# One text line per rating row, labelling each field; used as the document text below.
df['combined'] = (
    'user id: ' + df['user_id'].astype(str)
    + ', movie id: ' + df['movie_id'].astype(str)
    + ', rating: ' + df['rating'].astype(str)
    + ', timestamp: ' + df['timestamp'].astype(str)
    + ', movie title: ' + df['movie_title']
)

In [18]:
from llama_index.core import Document

# One Document per rating row. Iterating the 'combined' column directly avoids
# DataFrame.iterrows(), which builds a full Series for every one of the
# 100,000 rows only to read a single field from it.
documents = [Document(text=text) for text in df['combined']]

### Weaviate Client

In [2]:
from dotenv import load_dotenv, find_dotenv

# Load environment variables (the Weaviate / Cohere settings read below) from
# the .env file located by find_dotenv().
_ = load_dotenv(find_dotenv())

In [3]:
import weaviate
import os

# Connect to Weaviate. All three settings are read with os.environ[...] so a
# missing variable fails immediately with a KeyError that names it, instead of
# os.getenv() silently handing None to the client.
client = weaviate.Client(
    url=os.environ["WEAVIATE_URL"],  # Weaviate Cloud URL
    auth_client_secret=weaviate.auth.AuthApiKey(os.environ["WEAVIATE_API_KEY"]),  # Weaviate instance API key
    additional_headers={"X-Cohere-Api-Key": os.environ["COHERE_API_KEY"]},
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [4]:
# Readiness check on the Weaviate client before anything is built on top of it.
client.is_ready()

True

In [5]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Weaviate-backed vector store that uses the index name "LlamaIndex".
vector_store = WeaviateVectorStore(weaviate_client=client, index_name="LlamaIndex")

In [45]:
# Set up the storage for the embeddings
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The index is normally re-opened from the existing Weaviate class (see the
# from_vector_store cell). Set REBUILD_INDEX to True to drop that class and
# embed `documents` again from scratch. This deletes the stored vectors, so it
# is off by default.
REBUILD_INDEX = False

if REBUILD_INDEX:
    if client.schema.exists("LlamaIndex"):
        client.schema.delete_class("LlamaIndex")

    index = VectorStoreIndex(
        documents,
        storage_context=storage_context,
    )

In [7]:
import json

# Build the index object on top of the existing vector store; `documents` is not passed here.
index = VectorStoreIndex.from_vector_store(vector_store)

In [10]:
def get_context_prompt(list_of_movies):
    """Build the next-movie recommendation prompt for a watch history.

    Args:
        list_of_movies: Sequence of movie title strings.

    Returns:
        The prompt string, or None when ``list_of_movies`` is empty.
    """
    if len(list_of_movies) == 0:
        return None
    watched = ', '.join(list_of_movies)
    return f"A user watch the following movies: {watched}. What the next 3 movie would he be likely to watch next?"

In [11]:
# Build the recommendation prompt from a sample watch history and show the resulting text.
user_prompt = get_context_prompt(["Margin Call", "The Big Short", "Moneyball", "The Martian",])
print(user_prompt)

A user watch the following movies: Margin Call, The Big Short, Moneyball, The Martian. What the next 3 movie would he be likely to watch next?


In [13]:
# Zero-shot: the prompt is sent without any worked examples; the query engine
# is configured with similarity_top_k=2.
query_engine = index.as_query_engine(similarity_top_k=2)
response = query_engine.query(user_prompt)
print(response)

The next 3 movies the user would likely watch next are The Wolf of Wall Street, The Social Network, and The Pursuit of Happyness.


In [None]:
# few-shot prompting
# The worked examples are kept in a string: as bare text this cell is not valid
# Python and would stop a "Run All" with a SyntaxError. The final, open-ended
# example follows the same one-field-per-line layout as the two completed ones.
few_shot_prompt = """\
User’s Watched Movie: “The Shawshank Redemption”
User’s Rating: 5 (out of 5)
Recommended Movie: “The Green Mile”
Explanation: Both movies are critically acclaimed drama films with themes of hope and resilience in difficult circumstances. Since the user highly rated “The Shawshank Redemption”, they might also enjoy “The Green Mile”.

User’s Watched Movie: “Inception”
User’s Rating: 4 (out of 5)
Recommended Movie: “Interstellar”
Explanation: Both movies are science-fiction films directed by Christopher Nolan, known for their mind-bending plots and impressive visuals. A user who enjoyed “Inception” is likely to appreciate “Interstellar” as well.

User’s Watched Movie: “The Dark Knight”
User’s Rating: 5 (out of 5)
Recommended Movie:"""

In [17]:
from IPython.display import Markdown, display
def display_prompt_dict(prompts_dict):
    """Show every prompt in ``prompts_dict`` under a bold Markdown heading.

    For each key, a Markdown heading containing the key is rendered, the text
    returned by the value's ``get_template()`` is printed as plain text, and a
    Markdown spacer follows.

    Args:
        prompts_dict: Mapping of prompt key to an object exposing ``get_template()``.
    """
    for prompt_key, prompt in prompts_dict.items():
        heading = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(heading))
        print(prompt.get_template())
        display(Markdown("<br><br>"))

In [18]:
# Prompt templates exposed by the query engine, keyed by prompt name.
prompts_dict = query_engine.get_prompts()

In [19]:
# Render each prompt key together with its template text.
display_prompt_dict(prompts_dict)

**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


<br><br>

In [8]:
from llama_index.core.memory import ChatMemoryBuffer

# Chat memory buffer (token_limit=3900) that is passed to the chat engines created below.
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

In [None]:
# Chat engine over the index. Adjacent string literals are joined with nothing
# in between, so every sentence of the context prompt ends with an explicit
# ". " to stop the sentences from running together in the text sent to the model.
# NOTE(review): `memory` is the same buffer object handed to the other chat
# engines in this notebook — confirm that sharing chat history across them is intended.
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. "
        "Use the following context including user id, movie id, movie titles, timestamp, and rating to answer any questions. "
        "It's ok if you don't know the answer. "
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [9]:
# Chat engine over the index. Adjacent string literals are joined with nothing
# in between, so every sentence of the context prompt ends with an explicit
# ". " to stop the sentences from running together in the text sent to the model.
# NOTE(review): `memory` is the same buffer object handed to the other chat
# engines in this notebook — confirm that sharing chat history across them is intended.
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. "
        "Use the following context including user id, movie id, movie titles, timestamp, and rating to answer any questions. "
        "It's ok if you don't know the answer. "
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [None]:
# Ask for recommendations for user 82 and print the streamed reply piece by
# piece; end="" keeps the pieces contiguous instead of one per line.
response = chat_engine.stream_chat("Could you recommend three movies for user_id 82? ")
for token in response.response_gen:
    print(token, end="")

In [56]:
# Ask for recommendations for user 82 and print the streamed reply piece by
# piece; end="" keeps the pieces contiguous instead of one per line.
response = chat_engine.stream_chat("Could you recommend three movies for user_id 82? ")
for token in response.response_gen:
    print(token, end="")

For user_id 82, based on their previous rating of 4 for "My Favorite Year (1982)", I can recommend the following three movies that they might enjoy:

1. Singin' in the Rain (1952) - A classic musical comedy about the transition from silent films to "talkies" in Hollywood.
2. The Artist (2011) - A silent black-and-white film that pays homage to the silent era of Hollywood.
3. Tootsie (1982) - A comedy film about an actor who disguises himself as a woman to land a role on a soap opera.

I hope these recommendations align with their taste! Let me know if you'd like more suggestions.

### Sequential recommendations

In [58]:
def get_context_prompt(list_of_movies):
    """Build the next-movie prompt and ask for a 'next_movies' keyed answer.

    Args:
        list_of_movies: Sequence of movie title strings.

    Returns:
        The prompt string (question followed by the output-format request),
        or None when ``list_of_movies`` is empty.
    """
    if len(list_of_movies) == 0:
        return None
    watched = ', '.join(list_of_movies)
    question = f"A user watch the following movies: {watched}. What the next 3 movie would he be likely to watch next?"
    output_format = " Express your response with a key of 'next_movies' and a value representing your array of recommended movies."
    return question + output_format

In [59]:
# Build the sequential-recommendation prompt from a sample watch history and show the resulting text.
user_prompt = get_context_prompt(["Margin Call", "The Big Short", "Moneyball", "The Martian",])
print(user_prompt)

A user watch the following movies: Margin Call, The Big Short, Moneyball, The Martian. What the next 3 movie would he be likely to watch next? Express your response with a key of 'next_movies' and a value representing your array of recommended movies.


In [60]:
# Chat engine for the sequential-recommendation prompt. Adjacent string
# literals are joined with nothing in between, so every sentence of the context
# prompt ends with an explicit ". " to stop the sentences from running together.
# NOTE(review): `memory` is the same buffer object handed to the other chat
# engines in this notebook — confirm that sharing chat history across them is intended.
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. "
        "Use the following context including user id, movie id, movie titles, timestamp, and rating to answer any questions. "
        "It's ok if you don't know the answer. "
        "Here are the relevant movies for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [61]:
# Send the prompt and print the streamed reply piece by piece; end="" keeps
# the pieces contiguous instead of one per line.
response = chat_engine.stream_chat(user_prompt)
for token in response.response_gen:
    print(token, end="")

{
    "next_movies": ["The Wolf of Wall Street", "Inside Job", "Wall Street"]
}

### Rating Predictions

In [62]:
def get_context_prompt(list_of_movies, target_movie='The Matrix'):
    """Build a rating-prediction prompt from a rated watch history.

    Args:
        list_of_movies: Sequence of strings describing watched movies (the
            notebook passes "Title: rating" entries).
        target_movie: Title whose rating the model is asked to predict.
            Defaults to 'The Matrix', the title that was previously hard-coded,
            so existing single-argument calls keep asking about the same movie.

    Returns:
        The prompt string, or None when ``list_of_movies`` is empty.
    """
    if len(list_of_movies) == 0:
        return None
    movies = ', '.join(list_of_movies)
    # Each sentence ends with a space; the history sentence used to run
    # straight into "Predict ..." without one.
    return (
        f"A user watch the following movies: {movies}. "
        f"Predict the user's rating on '{target_movie}'. "
        "Output the rating score only. Do not include other text."
    )

In [68]:
# Build the rating-prediction prompt from "Title: rating" entries and show the resulting text.
# NOTE(review): the saved output of this cell shows the ratings as 4.0 while this
# call passes 4, so the output appears to come from an earlier version of the call.
user_prompt = get_context_prompt(["Margin Call: 4", "The Big Short: 4", "Moneyball: 4.5", "The Martian: 4",])
print(user_prompt)

A user watch the following movies: Margin Call: 4.0, The Big Short: 4.0, Moneyball: 4.5, The Martian: 4.0.Predict the user's rating on 'The Matrix'. Output the rating score only. Do not include other text.


In [69]:
# Chat engine for the rating-prediction prompt. Adjacent string literals are
# joined with nothing in between, so every sentence of the context prompt ends
# with an explicit ". " to stop the sentences from running together.
# NOTE(review): `memory` is the same buffer object handed to the other chat
# engines in this notebook — confirm that sharing chat history across them is intended.
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. "
        "Use the following context including user id, movie id, movie titles, timestamp, and rating to answer any questions. "
        "It's ok if you don't know the answer. "
        "Rating (ranging from 1 to 5, with 5 being the highest) on a movie, based on that user's previous ratings for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help the user."
    ),
    verbose=False,
)

In [70]:
# Send the prompt and print the streamed reply piece by piece; end="" keeps
# the pieces contiguous instead of one per line.
response = chat_engine.stream_chat(user_prompt)
for token in response.response_gen:
    print(token, end="")

4.