In [2]:
from langchain.memory import ConversationBufferMemory
from langchain.agents import ConversationalChatAgent, AgentExecutor
import os
from langchain.chat_models import ChatOpenAI
from langchain_community.llms.sagemaker_endpoint import LLMContentHandler
from dotenv import load_dotenv
from typing import Dict

import boto3

from langchain_community.llms import SagemakerEndpoint
import json
#  Retrieve the OpenAI API key and temperature from environment variables
from dotenv import load_dotenv
load_dotenv("env.txt")
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_TEMPERATURE = float(os.getenv('OPENAI_TEMPERATURE'))
HUGGING_FACE_TOKEN = os.getenv('HUGGING_FACE_TOKEN')

In [None]:
# import json
# import sagemaker
# import boto3
# from sagemaker.huggingface import HuggingFaceModel, get_huggingface_llm_image_uri

# try:
# 	role = sagemaker.get_execution_role()
# except ValueError:
# 	iam = boto3.client('iam')
# 	role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# # Hub Model configuration. https://huggingface.co/models
# hub = {
# 	'HF_MODEL_ID':'meta-llama/Meta-Llama-3-8B-Instruct',
# 	'SM_NUM_GPUS': json.dumps(1),
# 	'HUGGING_FACE_HUB_TOKEN': HUGGING_FACE_TOKEN
# }

# #assert hub['HUGGING_FACE_HUB_TOKEN'] != '<REPLACE WITH YOUR TOKEN>', "You have to provide a token."

# # create Hugging Face Model Class
# huggingface_model = HuggingFaceModel(
# 	image_uri=get_huggingface_llm_image_uri("huggingface",version="2.0.2"),
# 	env=hub,
# 	role=role, 
# )

# # deploy model to SageMaker Inference
# predictor = huggingface_model.deploy(
# 	initial_instance_count=1,
# 	instance_type="ml.g5.2xlarge",
# 	container_startup_health_check_timeout=300,
#   )
  
# # send request
# predictor.predict({
# 	"inputs": "My name is Clara and I am",
# })

In [3]:
import json
from pathlib import Path
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm
from langchain_community.llms import SagemakerEndpoint

class EmbeddingManager:
    def __init__(self, model_name):
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
    
    def get_embeddings(self):
        return self.embeddings

class VectorSpaceManager:
    def __init__(self, embedding_manager):
        self.embedding_manager = embedding_manager
        self.embeddings = self.embedding_manager.get_embeddings()

    def create_vector_space(self, documents):
        vector_store = FAISS.from_documents(self._split_docs(documents[:2]), self.embeddings)

        with tqdm(total=len(documents), desc="Creating vector space") as pbar:
            batch_size = 100
            for i in range(2, len(documents), batch_size):
                batch_documents = documents[i:i+batch_size]
                tempt_vector_store = FAISS.from_documents(self._split_docs(batch_documents), self.embeddings)
                vector_store.merge_from(tempt_vector_store)
                pbar.update(len(batch_documents))
        #vector_store = FAISS.from_documents(self._split_docs(documents), self.embeddings)

        return vector_store

    def save_vector_space(self, vector_store, save_path):
        print(f"Saving vector space to {save_path}...")
        vector_store.save_local(save_path)
        print(f"Finished!")

    def load_vector_space(self, save_path):
        print(f"Lodaing vector space from {save_path}")
        return FAISS.load_local(save_path, self.embeddings, allow_dangerous_deserialization=True)
    def _split_docs(self, docs: list):
        # split documents into chunks
        text_splitter = RecursiveCharacterTextSplitter(
        separators=[" ", "\n", ","],
        chunk_size=1000,
        chunk_overlap=0,
        )
        split_docs = text_splitter.split_documents(docs)
        return split_docs


class DataLoader:
    def __init__(self, json_file_path):
        self.json_file_path = json_file_path

    def load_data(self):
        data = json.loads(Path(self.json_file_path).read_text())
        return data

    def create_documents(self, length=None):
        data = self.load_data()
        if length is None:
            length = len(data)
        
        documents = [
            Document(
                page_content=self.get_page_content(item),
                metadata=item
            )
            for item in data[:length]
        ]
        return documents

    def get_page_content(self, item):
        raise NotImplementedError("Subclasses must implement get_page_content method")
    

class BookDataLoader(DataLoader):
    def get_page_content(self, item):
        return f"{item['title']} {item['author']} {item['publication_date']} {item['description']} {' '.join(item['genres'])}"
    

class MovieDataLoader(DataLoader):
    def get_page_content(self, item):
        # {movie_id, title, release_date, supported_languages, movie_countries, movie_genres_list, movie_actor_list, summary}
        # Use all the fields to create the page content
        return f"{item['title']} {item['release_date']} {item['summary']} {' '.join(item['movie_genres_list'])} {' '.join(item['movie_actor_list'])}"

def process_data(json_file_path, model_name, save_path, data_loader_class, length=None):
    # Initialize the embedding manager with the chosen model
    embedding_manager = EmbeddingManager(model_name)

    # Initialize the vector space manager with the embedding manager
    vector_space_manager = VectorSpaceManager(embedding_manager)

    # Load data and create documents
    data_loader = data_loader_class(json_file_path)
    documents = data_loader.create_documents(length=length)

    # Create and save the vector space
    vector_store = vector_space_manager.create_vector_space(documents)
    vector_space_manager.save_vector_space(vector_store, save_path)

    # Load the vector space and perform a search
    vector_store = vector_space_manager.load_vector_space(save_path)
    query = "The Hobbit"
    search_results = vector_store.search(query, k=2, search_type="similarity")
    print(search_results)

# Example usage
if __name__ == "__main__":
    # Book
    json_file_path = 'data/BookSummaries/book.json'  # Replace with the actual JSON file path
    #model_name = "sentence-transformers/all-MiniLM-L6-v2"
    #model_name = "mixedbread-ai/mxbai-embed-large-v1"
    #model_name = "intfloat/e5-base-v2"
    model_name = "BAAI/bge-small-en-v1.5"
    save_path = 'data/book_vector_store'  # Replace with the actual save path
    process_data(json_file_path, model_name, save_path, BookDataLoader, 100)

    # Movie
    json_file_path = 'data/MovieSummaries/movie.json'  # Replace with the actual JSON file path
    #model_name = "intfloat/e5-base-v2"
    model_name = "BAAI/bge-small-en-v1.5"
    #model_name = "sentence-transformers/all-MiniLM-L6-v2"
    #model_name = "mixedbread-ai/mxbai-embed-large-v1"
    save_path = 'data/movie_vector_store'  # Replace with the actual save path
    process_data(json_file_path, model_name, save_path, MovieDataLoader, 100)

  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Creating vector space:  98%|█████████▊| 98/100 [01:18<00:01,  1.25it/s]


Saving vector space to data/book_vector_store...
Finished!
Lodaing vector space from data/book_vector_store
[Document(page_content='The Hobbit J. R. R. Tolkien 1937  Gandalf tricks Bilbo into hosting a party for Thorin and his band of dwarves, who sing of reclaiming the Lonely Mountain and its vast treasure from the dragon Smaug. When the music ends, Gandalf unveils a map showing a secret door into the Mountain and proposes that the dumbfounded Bilbo serve as the expedition\'s "burglar". The dwarves ridicule the idea, but Bilbo, indignant, joins despite himself. The group travel into the wild, where Gandalf saves the company from trolls and leads them to Rivendell, where Elrond reveals more secrets from the map. Passing over the Misty Mountains, they are caught by goblins and driven deep underground. Although Gandalf rescues them, Bilbo gets separated from the others as they flee the goblins. Lost in the goblin tunnels, he stumbles across a mysterious ring and then encounters Gollum, w

Creating vector space:  98%|█████████▊| 98/100 [00:12<00:00,  8.10it/s]

Saving vector space to data/movie_vector_store...
Finished!
Lodaing vector space from data/movie_vector_store
[Document(page_content="This convinces Xi that he has reached the edge of the world, and he throws the bottle off the cliff (this scene was filmed at God's Window in Eastern Transvaal, South Africa . Xi then returns to his band and a warm welcome from his family. Adventure Action/Adventure Indie World cinema Cult Adventure Comedy Comedy Slapstick Nic De Jager Michael Thys Fanyana H. Sidumo Brian O'Shaughnessy Vera Blacker Joe Seakatsie Ken Gampu Jamie Uys Sandra Prinsloo Paddy O'Byrne Louw Verwey Marius Weyers N!xau", metadata={'movie_id': '261237', 'title': 'The Gods Must Be Crazy', 'release_date': '1980', 'supported_languages': ['Afrikaans Language', 'English Language'], 'movie_countries': ['South Africa'], 'movie_genres_list': ['Adventure', 'Action/Adventure', 'Indie', 'World cinema', 'Cult', 'Adventure Comedy', 'Comedy', 'Slapstick'], 'movie_actor_list': ['Nic De Jager', 'M




In [None]:
class TopicClassifier:
    def __init__(self, llm):
        """
        Initializes a TopicClassifier object.

        Parameters:
        llm (LanguageModel): The language model used for classification.

        Returns:
        None
        """
        self.llm = llm
        self.topics = ["movies", "books", "others"]


    def classify(self, query):
        """
        Classifies a given query into one of the predefined topics.

        Parameters:
        query (str): The query to be classified.

        Returns:
        str: The classified topic.
        """
        prompt = f"Classify the following question into one of these topics: '{','.join(self.topics)}': '{query}'"
        response = self.llm.predict(text=prompt, max_tokens=10)
        topic = response.strip().lower()
        return topic

In [None]:
import torch
from transformers import AutoModelForCausalLM,
AutoTokenizer
from llama_index.llms.huggingface import HuggingFaceLLM

In [None]:
from llama_index.core import PromptTemplate
system_prompt = "You are a Q&A assistant. Your goal is to answer questions as accurately as possible based on the instructions and context provided."
# This will wrap the default prompts that are internal to llama-index
query_wrapper_prompt = PromptTemplate("<|USER|>{query_str}<|ASSISTANT|>")