In [1]:
import pandas as pd
import numpy as np

### LLM Prompting for Recommendation

Data Link: https://grouplens.org/datasets/movielens/100k/

In [2]:
# Download URL of the MovieLens 100K zip archive.
DATASET_LINK = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"

In [3]:
!wget -nc http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -n ml-100k.zip

File ‘ml-100k.zip’ already there; not retrieving.

Archive:  ml-100k.zip


In [3]:
# u.info is read without a header row; its first column holds the summary lines
# that are printed below.
overall_stats = pd.read_csv('ml-100k/u.info', header=None)
print(
    "Details of users, items and ratings involved in the loaded movielens dataset: ",
    overall_stats[0].tolist(),
)

Details of users, items and ratings involved in the loaded movielens dataset:  ['943 users', '1682 items', '100000 ratings']


In [4]:
# u.data is tab-separated with no header row. The item id is the same identifier
# as the movie id, so that column is named movie_id here.
column_names1 = ['user_id', 'movie_id', 'rating', 'timestamp']
rating_dataset = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    header=None,
    names=column_names1,
)
rating_dataset.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Number of ratings, then the largest and smallest movie id that appears in them.
(
    len(rating_dataset),
    rating_dataset['movie_id'].max(),
    rating_dataset['movie_id'].min(),
)

(100000, 1682, 1)

In [6]:
# Column layout used for u.item, written as one ' | '-separated string and then
# split into the list of column names.
d = (
    'movie_id | movie_title | release_date | video_release_date | IMDb_URL | '
    'unknown | Action | Adventure | Animation | Children | Comedy | Crime | '
    'Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | '
    'Romance | Sci-Fi | Thriller | War | Western'
)
column_names2 = d.split(' | ')
column_names2

['movie_id',
 'movie_title',
 'release_date',
 'video_release_date',
 'IMDb_URL',
 'unknown',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [7]:
# u.item is pipe-delimited with no header row and is decoded as latin-1.
items_dataset = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    header=None,
    names=column_names2,
    encoding='latin-1',
)
items_dataset

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Keep only the movie id and its title from the item table.
movie_dataset = items_dataset.loc[:, ['movie_id', 'movie_title']]
movie_dataset.head()

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [9]:
# Group on every column except movie_id and compare the number of groups with the
# number of rows: fewer groups than rows means some rows are identical apart from
# their movie_id.
len(items_dataset.groupby(by=column_names2[1:])),len(items_dataset)

(1664, 1682)

In [10]:
# Attach the movie title to every rating with an inner join on movie_id.
df = rating_dataset.merge(movie_dataset, how='inner', on='movie_id')
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [11]:
# Rows where user 894 rated the title 'Chasing Amy (1997)'.
df[df['movie_title'].eq('Chasing Amy (1997)') & df['user_id'].eq(894)]

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title
62716,894,246,4,882404137,Chasing Amy (1997)
90596,894,268,3,879896041,Chasing Amy (1997)


In [12]:
# Count distinct users and distinct movie titles in the merged frame.
# Note that num_items counts distinct titles, not distinct movie_id values.
num_users = df['user_id'].nunique()
num_items = df['movie_title'].nunique()
print(f'Unique number of users in the dataset: {num_users}')
print(f'Unique number of movies in the dataset: {num_items}')

Unique number of users in the dataset: 943
Unique number of movies in the dataset: 1664


In [13]:
from pprint import pprint

def user_movie_list(user_id, ratings=None):
    """Print the movies rated by one user, highest rating first.

    Args:
        user_id: Value matched against the ``user_id`` column.
        ratings: DataFrame with ``user_id``, ``movie_title`` and ``rating``
            columns. When omitted, the notebook-level ``df`` is used, which
            keeps existing ``user_movie_list(user_id)`` calls working.

    Returns:
        None. The sorted titles and ratings are printed.
    """
    if ratings is None:
        # Fall back to the merged ratings/titles frame defined at notebook level.
        ratings = df
    user_data = ratings.loc[ratings['user_id'] == user_id, ['movie_title', 'rating']]
    sorted_user_data = user_data.sort_values(by='rating', ascending=False)
    print("Movies seen by the User ranked by rating:")
    pprint(sorted_user_data)
    print("")

In [14]:
# Show the rating history of user 85.
user_movie_list(user_id=85)

Movies seen by the User ranked by rating:
                                movie_title  rating
7644   Bridge on the River Kwai, The (1957)       5
42282                   Mary Poppins (1964)       5
3160                       Manhattan (1979)       5
55283                     Casablanca (1942)       5
23014            Singin' in the Rain (1952)       5
...                                     ...     ...
24276       Star Trek: First Contact (1996)       2
24266              Full Metal Jacket (1987)       2
2225                     Restoration (1995)       2
96801                      Game, The (1997)       1
55049                   White Squall (1996)       1

[288 rows x 2 columns]



### Merge rating and movie

In [15]:
# Summarise the merged frame: column dtypes, non-null counts and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      100000 non-null  int64 
 1   movie_id     100000 non-null  int64 
 2   rating       100000 non-null  int64 
 3   timestamp    100000 non-null  int64 
 4   movie_title  100000 non-null  object
dtypes: int64(4), object(1)
memory usage: 3.8+ MB


In [16]:
# Flatten each rating row into one "label: value" sentence; this text is what
# gets wrapped into a Document later on.
df['combined'] = (
    'user id: ' + df['user_id'].astype(str)
    + ', movie id: ' + df['movie_id'].astype(str)
    + ', rating: ' + df['rating'].astype(str)
    + ', timestamp: ' + df['timestamp'].astype(str)
    + ', movie title: ' + df['movie_title']
)

In [17]:
from llama_index.core import Document

# One Document per rating row. Only the 'combined' text is needed, so iterate
# that column directly instead of df.iterrows(), which builds a Series for each
# of the rows just to read a single field.
documents = [Document(text=text) for text in df['combined']]

  from .autonotebook import tqdm as notebook_tqdm


### Weaviate Client

In [2]:
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load its entries as environment variables; the result is
# bound to `_` so the cell displays nothing.
_ = load_dotenv(find_dotenv())

In [3]:
import weaviate
import os

# All connection settings are read from environment variables (see the dotenv
# cell above). Indexing os.environ makes a missing variable fail immediately with
# a KeyError that names it, instead of handing None to the Weaviate client.
client = weaviate.Client(
    url=os.environ["WEAVIATE_URL"],  # Weaviate Cloud URL
    auth_client_secret=weaviate.auth.AuthApiKey(os.environ["WEAVIATE_API_KEY"]),  # Weaviate instance API key
    additional_headers={"X-Cohere-Api-Key": os.environ["COHERE_API_KEY"]},
)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [4]:
# Check that the Weaviate instance responds as ready before building on top of it.
client.is_ready()

True

In [5]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Vector store wrapper around the connected Weaviate client, using the index
# named "LlamaIndex".
vector_store = WeaviateVectorStore(weaviate_client=client, index_name="LlamaIndex")

In [7]:
# Ingestion step, kept for reference but disabled. If uncommented it would delete
# any existing "LlamaIndex" class in Weaviate and build a new index from
# `documents` through the vector store.
# NOTE(review): presumably this was run once to populate the store that the next
# cell attaches to; confirm before running the notebook against an empty instance.
#
# Set up the storage for the embeddings
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

# if client.schema.exists("LlamaIndex"):
#     client.schema.delete_class("LlamaIndex")
    
# index = VectorStoreIndex(
#     documents,
#     storage_context = storage_context,
# )

In [8]:
import json

# Build the index on top of the existing vector store (no documents are passed here).
index = VectorStoreIndex.from_vector_store(vector_store)

### Conversational recommendation

In [10]:
from llama_index.core.memory import ChatMemoryBuffer

# Conversation history buffer handed to the chat engines below.
# NOTE(review): 3900 is presumably sized to the LLM's context window; confirm.
memory = ChatMemoryBuffer.from_defaults(token_limit=3900)

In [11]:
# Chat engine over the index in "condense_plus_context" mode, with the shared
# `memory` buffer and a custom context prompt.
# Python joins the adjacent string literals below with no separator, so each
# sentence carries its own trailing space; without it the prompt reads
# "...new movies.It's ok if you don't know the answer.Here are...".
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. Your job is to recommend new movies. "
        "It's ok if you don't know the answer. "
        "Here are the relevant data for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help answer the question."
    ),
    verbose=False,
)

In [12]:
# Ask for recommendations and echo the streamed reply piece by piece as it arrives.
response = chat_engine.stream_chat(
    "Could you recommend three movies for user_id 42? "
)
for chunk in response.response_gen:
    print(chunk, end="")

Of course! Based on the previous ratings of user_id 42, I would recommend the following three movies:

1. The Shawshank Redemption (1994) - A highly acclaimed drama that many viewers have enjoyed.
2. Forrest Gump (1994) - A heartwarming and iconic film that is beloved by many.
3. The Dark Knight (2008) - Since user_id 42 enjoyed "Batman (1989)", they might also like this modern take on the Batman story.

I hope these recommendations are helpful! Let me know if you'd like more suggestions.

### Sequential recommendations

In [13]:
def get_movie_list(list_of_movies, num_recommendations=3):
    """Build a next-movie recommendation prompt from a watch history.

    Args:
        list_of_movies: Sequence of movie title strings the user has watched.
        num_recommendations: Number of follow-up movies to ask for. The default
            of 3 reproduces the original fixed wording.

    Returns:
        The prompt string, or None when ``list_of_movies`` is empty.
    """
    # Explicit guard: an empty history yields no prompt at all.
    if len(list_of_movies) == 0:
        return None
    movies = ', '.join(list_of_movies)
    return (
        f"A user watched the following movies: {movies}. "
        f"What are the next {num_recommendations} movies the user would be likely to watch next?"
    )

# Turn an example watch history into the prompt text and show it.
user_query = get_movie_list(
    [
        "Margin Call",
        "The Big Short",
        "Moneyball",
        "The Martian",
    ]
)
print(user_query)

A user watched the following movies: Margin Call, The Big Short, Moneyball, The Martian. What are the next 3 movies the user would be likely to watch next?


In [15]:
# Chat engine for the sequential-recommendation prompt, again in
# "condense_plus_context" mode.
# NOTE(review): `memory` is the same buffer object passed to the earlier chat
# engine, so earlier turns stay in its history; confirm that carry-over is
# intended for this separate task.
# Python joins the adjacent string literals below with no separator, so each
# sentence carries its own trailing space; without it the prompt reads
# "...new movies.It's ok if you don't know the answer.Here are...".
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. Your job is to recommend new movies. "
        "It's ok if you don't know the answer. "
        "Here are the relevant data for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help answer the question."
    ),
    verbose=False,
)

In [16]:
# Send the watch-history prompt and echo the streamed reply piece by piece.
response = chat_engine.stream_chat(user_query)
for chunk in response.response_gen:
    print(chunk, end="")

Based on the movies the user has watched so far (Margin Call, The Big Short, Moneyball, The Martian), it seems like they enjoy movies with a focus on finance, business, and/or real-life events. 

With that in mind, here are three movie recommendations for the user:

1. The Wolf of Wall Street (2013) - A high-energy film based on the true story of Jordan Belfort, a stockbroker who engaged in corrupt practices on Wall Street.
2. Inside Job (2010) - A documentary that provides an in-depth analysis of the 2008 financial crisis, which could be of interest to someone who enjoyed movies like "Margin Call" and "The Big Short".
3. The Social Network (2010) - A biographical drama about the founding of Facebook and the legal battles that ensued, which could appeal to someone interested in real-life events and business-related stories.

I hope these recommendations align with the user's movie preferences! Let me know if you need more suggestions.

### Rating Predictions

In [17]:
def get_movie_prompt(list_of_movies, target_movie='The Matrix'):
    """Build a rating-prediction prompt from a list of watched movies.

    Args:
        list_of_movies: Sequence of strings describing watched movies, for
            example "Moneyball: 4.5".
        target_movie: Title whose rating the model is asked to predict. The
            default keeps the original hard-coded target.

    Returns:
        The prompt string, or None when ``list_of_movies`` is empty.
    """
    if len(list_of_movies) == 0:
        return None
    movies = ', '.join(list_of_movies)
    # Each sentence ends with a space so they do not run together
    # (previously the prompt read "...Margin Call: 4.Predict the user's...").
    return (
        f"A user watched the following movies: {movies}. "
        f"Predict the user's rating on '{target_movie}'. "
        "Output the rating score only. Do not include other text."
    )

In [18]:
# Build the rating-prediction prompt from "<title>: <rating>" entries. Every
# entry uses the same "title: rating" form, including Pitch Black, which was
# missing its colon.
user_prompt = get_movie_prompt([
    "Moneyball: 4.5",
    "The Martian: 4",
    "Pitch Black: 3.5",
    "Margin Call: 4",
])
print(user_prompt)

A user watched the following movies: Moneyball: 4.5, The Martian: 4, Pitch Black 3.5, Margin Call: 4.Predict the user's rating on 'The Matrix'. Output the rating score only. Do not include other text.


In [20]:
# Chat engine for the rating-prediction prompt, again in "condense_plus_context"
# mode.
# NOTE(review): `memory` is the same buffer object passed to the earlier chat
# engines, so earlier turns stay in its history; confirm that carry-over is
# intended when predicting a rating for an unrelated prompt.
# Python joins the adjacent string literals below with no separator, so each
# sentence carries its own trailing space; without it the prompt reads
# "...new movies.It's ok if you don't know the answer.Here are...".
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    memory=memory,
    context_prompt=(
        "You are a friendly, conversational movie recommender assistant. Your job is to recommend new movies. "
        "It's ok if you don't know the answer. "
        "Here are the relevant data for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous movie rating history, or the context above, to interact and help answer the question."
    ),
    verbose=False,
)

In [21]:
# Send the rating-prediction prompt and echo the streamed reply piece by piece.
response = chat_engine.stream_chat(user_prompt)
for chunk in response.response_gen:
    print(chunk, end="")

4.