In [1]:
import faiss
import yaml, os
import numpy as np
import pandas as pd
from llama_index import Document
from llama_index.llms import AzureOpenAI
from llama_index.llm_predictor import LLMPredictor
from llama_index import set_global_service_context
from llama_index.vector_stores import FaissVectorStore
from llama_index.embeddings import HuggingFaceEmbedding 
from llama_index import VectorStoreIndex, ServiceContext, StorageContext

pd.set_option('display.max_columns', 100)

In [2]:
with open('/Users/1zuu/Desktop/LLM RESEARCH/LLMPro/cadentials.yaml') as f:
    credentials = yaml.load(f, Loader=yaml.FullLoader)

os.environ['AD_OPENAI_API_KEY'] = credentials['AD_OPENAI_API_KEY']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = credentials['HUGGINGFACEHUB_API_TOKEN']

data_path = 'movies_metadata.csv'

In [3]:
def preprocess_dataset(row):
    belongs_to_collection = row['belongs_to_collection']
    belongs_to_collection = 'NULL' if pd.isnull(belongs_to_collection) else belongs_to_collection
    belongs_to_collection = eval(belongs_to_collection)['name'] if belongs_to_collection != 'NULL' else 'NULL'

    genres = row['genres']
    genres = 'NULL' if pd.isnull(genres) else genres
    if genres != 'NULL':
        genres = eval(genres)
        genres = [genre['name'] for genre in genres]
        
    row['belongs_to_collection'] = belongs_to_collection
    row['genres'] = genres

    return row

In [4]:
def load_corpus(data_path, verbose=False):
    df = pd.read_csv(data_path)
    df = df.apply(preprocess_dataset, axis=1)
    df = df[df['budget'] > 1000000]
    df = df[[
            'adult',
            'belongs_to_collection',
            'budget',
            'genres',
            'original_language',
            'original_title',
            'overview',
            'popularity',
            'revenue',
            'runtime',
            'vote_average',
            'vote_count'
            ]]

    if verbose:
        print(f"Loading {len(df)} movies from {data_path}")

    documents = []
    for i, row in df.iterrows():
        doc = Document(
                    id=i,
                    text=row['overview'],
                    metadata={
                            'title': row['original_title'],
                            'genres': ', '.join(row['genres']),
                            'belongs_to_collection': row['belongs_to_collection'],
                            'budget': row['budget'],
                            'popularity': row['popularity'],
                            'revenue': row['revenue'],
                            'runtime': row['runtime'],
                            'vote_average': row['vote_average'],
                            'vote_count': row['vote_count']
                            })
        documents.append(doc)

    return documents, df

In [5]:
embedding_llm = HuggingFaceEmbedding(
                                    model_name="BAAI/bge-small-en-v1.5",
                                    device="mps"
                                    )
llm=AzureOpenAI(
                deployment_name=credentials['AD_DEPLOYMENT_ID'],
                model=credentials['AD_ENGINE'],
                api_key=credentials['AD_OPENAI_API_KEY'],
                api_version=credentials['AD_OPENAI_API_VERSION'],
                azure_endpoint=credentials['AD_OPENAI_API_BASE']
                )
chat_llm = LLMPredictor(llm)

service_context = ServiceContext.from_defaults(
                                                embed_model=embedding_llm,
                                                llm_predictor=chat_llm
                                                )
set_global_service_context(service_context)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
documents, df = load_corpus(data_path, verbose=True)
if not os.path.exists('generated/movie_index'):
    faiss_index = faiss.IndexFlatIP(384)
    movie_index = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=movie_index)
    movie_index = VectorStoreIndex.from_documents(
                                                documents=documents,
                                                storage_context=storage_context,
                                                batch_size=100,
                                                show_progress_bar=True,
                                                )
    movie_index.storage_context.persist('./generated/movie_index')
else:
    movie_index = FaissVectorStore.from_persist_dir('./generated/movie_index')

Loading 7208 movies from movies_metadata.csv


In [7]:
query_engine = movie_index.as_query_engine(similarity_top_k=3)

In [18]:
data_response = query_engine.query('What is the best movie related to Romance genre which has best ratings but budget is less than 16000000 and runtime less than 100? return the name, rating, budget, runtime for the movie.')
data_response.response

'The movie that fits the criteria is "Flashdance" with a rating of 6.1, a budget of 4000000 and a runtime of 95.0.'

In [19]:
df_check = df[np.logical_and(df['vote_average'] >= 6, df['budget'] < 16000000)][df['runtime'] < 100]
df_check = df_check[df_check['genres'].apply(lambda x: 'Romance' in x)]
df_check = df_check.sort_values(by='vote_average', ascending=False)
df_check = df_check.sort_values(by='budget', ascending=True)
df_check.reset_index(inplace=True)
df_check

  df_check = df[np.logical_and(df['vote_average'] >= 6, df['budget'] < 16000000)][df['runtime'] < 100]


Unnamed: 0,index,adult,belongs_to_collection,budget,genres,original_language,original_title,overview,popularity,revenue,runtime,vote_average,vote_count
0,22348,False,,1085400,"[Drama, Romance]",fi,Kerron sinulle kaiken,At first glance Maarit looks like any other wo...,1.130550,0.0,95.0,6.4,7.0
1,7678,False,,1114000,"[Drama, History, Romance]",en,Queen Christina,Queen Christina of Sweden is a popular monarch...,2.886254,2610000.0,99.0,6.8,24.0
2,6940,False,,1152000,"[Drama, Romance]",en,Anna Karenina,In 19th century Russia a woman in a respectabl...,1.810574,1439000.0,95.0,6.6,25.0
3,9439,False,,1200000,"[Comedy, Drama, Romance]",en,Goldfish Memory,A small group of friends experience relationsh...,1.175971,0.0,85.0,6.2,6.0
4,1191,False,,1200000,"[Comedy, Drama, Romance]",en,Harold and Maude,The young Harold lives in his own world of sui...,10.878112,0.0,91.0,7.7,266.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,5530,False,,15000000,"[Drama, Family, Fantasy, Romance, Science Fict...",en,Tuck Everlasting,Natalie Babbitt's award winning book for child...,5.951492,19158074.0,90.0,6.4,133.0
118,11481,False,,15000000,"[Drama, Romance]",en,Notes on a Scandal,A veteran high school teacher befriends a youn...,8.593087,49469904.0,92.0,6.9,239.0
119,2680,False,National Lampoon's Vacation Collection,15000000,"[Comedy, Adventure, Romance]",en,National Lampoon's Vacation,Clark Griswold is on a quest to take his famil...,7.747432,61399552.0,98.0,7.1,412.0
120,11220,False,,15000000,"[Romance, Comedy, Drama, Fantasy]",fr,Angel-A,A beautiful and mysterious woman helps an inep...,5.512501,0.0,91.0,6.8,172.0
