<a href="https://colab.research.google.com/github/istiaquehussain/ML-Staging/blob/main/Updated_RAG_Framework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U -q google.generativeai
!pip install pinecone-client
!pip install panda
!pip install langchain
!pip install langchain-community
!pip install langchain_text_splitters
!pip install pypdf


In [None]:
import pandas as pd
from google.colab import userdata
from pinecone import Pinecone, ServerlessSpec
import google.generativeai as genai
import json

from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document


In [None]:
class Extractor:
    """Build the ``id`` / ``vector_data`` / ``meta`` text columns for a table of records."""

    def __init__(self):
        pass

    def generate_id_vector_meta(self, data_frame: pd.DataFrame, file_name: str, id_colum_name: str, vector_template: str, colums_to_vector: list, meta_template: str, colums_to_meta: list, meta_info: dict, shouldFomatVector: bool = True, shouldFomatMeta: bool = True) -> pd.DataFrame:
        """Return a frame with one ``id``, ``vector_data`` and ``meta`` string per input row.

        Args:
            data_frame: Source rows. It is deep-copied, never modified.
            file_name: Prefix of every id; ids are ``f"{file_name}_{row[id_colum_name]}"``.
            id_colum_name: Column whose value completes the id.
            vector_template: Accepted for interface compatibility; not used.
            colums_to_vector: Columns rendered into ``vector_data``.
            meta_template: Accepted for interface compatibility; not used.
            colums_to_meta: Columns rendered into ``meta``.
            meta_info: Optional mapping of constant key/value pairs appended to
                every ``meta`` string; ``None`` or an empty mapping adds nothing.
            shouldFomatVector: When true, ``vector_data`` is one ``'col'= value, ``
                entry per column joined by newlines; otherwise the bare values,
                each followed by a space.
            shouldFomatMeta: Same switch for ``meta``.

        Returns:
            A frame with exactly the columns ``id``, ``vector_data`` and ``meta``.
        """

        def render(row, columns, labelled):
            # Single renderer for both outputs, so the metadata text is always
            # built from the column list it was given.
            if labelled:
                return '\n'.join(f"'{col}'= {row[col]}, " for col in columns)
            return ''.join(f"{row[col]} " for col in columns)

        data = data_frame.copy(deep=True)
        data['id'] = data[id_colum_name].apply(lambda x: f"{file_name}_{x}")
        data['vector_data'] = data.apply(
            lambda row: render(row, colums_to_vector, shouldFomatVector), axis=1
        )
        # Previously the unformatted branch iterated colums_to_vector here.
        data['meta'] = data.apply(
            lambda row: render(row, colums_to_meta, shouldFomatMeta), axis=1
        )

        if meta_info:
            extra_meta = "\n".join(f"'{key}'= {value}, " for key, value in meta_info.items())
            data['meta'] = data['meta'] + "\n" + extra_meta
        return data[['id', 'vector_data', 'meta']]


In [None]:
class Loader:
    """Embeds prepared records and writes them to a Pinecone index."""

    def __init__(self):
        pass

    def embade(self,genai,model,data)->pd:
        """Return ``id``, ``vector`` and ``meta`` for a copy of ``data``.

        ``vector`` holds the ``"embedding"`` entry of ``genai.embed_content``
        called once per row on that row's ``vector_data`` with task type
        ``retrieval_document``.
        """
        frame = data.copy(deep=True)

        def embed_row(row):
            response = genai.embed_content(
                model=model,
                content=row['vector_data'],
                task_type="retrieval_document",
            )
            return response["embedding"]

        frame['vector'] = frame.apply(embed_row, axis=1)
        return frame[['id', 'vector', 'meta']]

    def create_vector(self,data):
        """Convert each row into an ``(id as str, vector, {"text": meta})`` tuple."""
        records = data.copy(deep=True).to_dict(orient='records')
        vectors = []
        for record in records:
            vectors.append((str(record["id"]), record['vector'], {"text": record['meta']}))
        return vectors

    def create_db_index(self,pc,index_name,dimention,metrix):
        """Create a serverless index; cloud and region are fixed to aws / us-east-1."""
        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        pc.create_index(
            name=index_name,
            dimension=dimention,  # must match the embedding model's output size
            metric=metrix,
            spec=serverless,
        )

    def drop_db_index(self,pc,index_name):
        """Delete the index called ``index_name``."""
        pc.delete_index(index_name)

    def upsert_data(self,pc,index_name,data):
        """Look the index up by name on ``pc`` and upsert ``data`` into it."""
        pc.Index(index_name).upsert(data)

    def upsert_data_with_index(self,db_index,data):
        """Upsert ``data`` into an index handle the caller already holds."""
        db_index.upsert(data)

In [None]:
from typing_extensions import Self
class VectorQuery:
    """Embeds a query string and runs a similarity search against a vector index."""

    def __init__(self):
        pass

    def create_embaded_query(self, genai, model, query_string) -> list:
        """Return the ``"embedding"`` entry of ``genai.embed_content`` for the query.

        The call uses task type ``retrieval_query``.
        """
        response = genai.embed_content(
            model=model,
            content=query_string,
            task_type="retrieval_query",
        )
        return response["embedding"]

    def query_vector_db_with_meta(self, db_index, embaded_query_string, result_count):
        """Query ``db_index`` for the ``result_count`` nearest records, metadata included."""
        return db_index.query(
            vector=embaded_query_string,
            top_k=result_count,
            include_metadata=True,
        )

    def query_vector_db(self, genai, embaing_model, vector_db, db_index_name, query_string, result_count):
        """Embed ``query_string`` and search the index named ``db_index_name``.

        Args:
            genai: Object exposing ``embed_content``.
            embaing_model: Embedding model name passed to ``embed_content``.
            vector_db: Client exposing ``Index(name)``.
            db_index_name: Name of the index to search.
            query_string: Text to embed and search for.
            result_count: Number of matches requested (``top_k``).

        Returns:
            Whatever the index's ``query`` call returns.
        """
        # These must be calls on the instance: the typing construct ``Self`` has
        # no such attributes, so going through it raised before any query ran.
        embedded_query = self.create_embaded_query(genai, embaing_model, query_string)
        db_index = vector_db.Index(db_index_name)
        return self.query_vector_db_with_meta(db_index, embedded_query, result_count)



In [None]:
class Utils:
    """Data-preparation helpers for the ticket CSV, the PDF folder and the LLM prompt."""

    def __init__(self):
        pass

    def sanitise_csv_data(self, data_frame: pd.DataFrame, colums_to_return: list) -> pd.DataFrame:
        """Return a cleaned copy of the requested ticket columns.

        Cleaning steps:
          * missing 'Resolution' becomes 'No resolution comments';
          * missing 'Customer Satisfaction Rating' becomes 'No ratings';
          * every other missing value becomes an empty string;
          * in 'Ticket Description' the literal ``{product_purchased}`` is
            replaced by the row's 'Product Purchased' and newlines by spaces.

        Each step is skipped when its column was not requested. The cleaned
        'Ticket Description' is placed as the last column. ``data_frame`` itself
        is not modified.

        Args:
            data_frame: Raw ticket data.
            colums_to_return: Columns to keep; all must exist in ``data_frame``.

        Returns:
            A new frame holding the cleaned columns.
        """
        # Explicit copy of the selection, so the assignments below write to an
        # independent frame rather than to a slice of another one.
        frame = data_frame[list(colums_to_return)].copy(deep=True)

        column_defaults = {
            'Resolution': 'No resolution comments',
            'Customer Satisfaction Rating': 'No ratings',
        }
        for column, default in column_defaults.items():
            if column in frame.columns:
                frame[column] = frame[column].fillna(default)
        frame = frame.fillna('')

        if 'Ticket Description' in frame.columns:
            # pop + re-insert keeps the corrected description as the last column.
            description = frame.pop('Ticket Description')
            if 'Product Purchased' in frame.columns:
                description = pd.Series(
                    [
                        text.replace('{product_purchased}', product)
                        for text, product in zip(description, frame['Product Purchased'])
                    ],
                    index=description.index,
                    dtype=object,
                )
            frame['Ticket Description'] = description.str.replace('\n', ' ', regex=False)
        return frame

    def sanitise_pdf_data(self, folder_path) -> pd.DataFrame:
        """Load the PDFs under ``folder_path`` and return one row per text chunk.

        Documents are split with chunk_size=800 and chunk_overlap=80. The
        returned columns are ``id``, ``page_content``, ``reference`` (the
        chunk's ``source`` metadata) and ``type`` (always ``"doc"``).

        Ids have the form ``<source>_<page>_<n>``, where ``n`` counts the chunks
        of that page starting at 0. NOTE: ids written by earlier runs had no
        ``_<n>`` suffix, so records already in an index keep their old ids.
        """
        document_loader = PyPDFDirectoryLoader(folder_path)
        documents = document_loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=800,
            chunk_overlap=80,
            length_function=len,
            is_separator_regex=False,
        )
        split_documents = text_splitter.split_documents(documents)
        data = {
            "id": self._chunk_ids(split_documents),
            "page_content": [doc.page_content for doc in split_documents],
            "reference": [doc.metadata.get("source", "") for doc in split_documents],
            "type": ["doc" for doc in split_documents],
        }
        return pd.DataFrame(data)

    def _chunk_ids(self, split_documents) -> list:
        """Return one distinct id per chunk: source, page and per-page ordinal."""
        # Chunks cut from the same page report the same source/page metadata;
        # without the ordinal they would share an id and replace one another
        # when upserted.
        chunks_seen = {}
        ids = []
        for doc in split_documents:
            page_key = doc.metadata.get("source", "") + "_" + str(doc.metadata.get("page", ""))
            ordinal = chunks_seen.get(page_key, 0)
            chunks_seen[page_key] = ordinal + 1
            ids.append(f"{page_key}_{ordinal}")
        return ids

    def transform_csv_vector(self, data: pd.DataFrame) -> pd.DataFrame:
        """Parse the JSON text in ``vector`` and return ``id``, ``vector``, ``meta``.

        ``data`` is not modified.
        """
        frame = data.copy(deep=True)
        frame['vector'] = frame['vector'].apply(json.loads)
        return frame[['id', 'vector', 'meta']]

    def get_genai_template(self) -> str:
        """Return the prompt template sent to the chat model.

        The template has a single ``{context}`` placeholder; it contains no
        ``{query}`` placeholder, so a ``query`` value passed to ``str.format``
        does not appear in the prompt.
        """
        return """You are a data analyst. Given the following data of customer support tickets, provide a detailed summary of the information.Make sure to highlight key points and provide an overall summary of the data.

Data:
{context}

Summary:"""






In [None]:
class LLMQuery:
    """Builds the prompt from retrieved matches and sends it to the chat model."""

    def __init__(self):
        pass

    def format_genai_query(self,query_template,context_data,query_string):
        """Fill ``query_template`` with the retrieved texts and the user query.

        The ``metadata.text`` of every entry in ``context_data['matches']`` is
        joined with a ``---`` separator line and passed as ``context``;
        ``query_string`` is passed as ``query``.
        """
        snippets = []
        for match in context_data['matches']:
            snippets.append(match['metadata']['text'])
        separator = "\n\n---\n\n"
        return query_template.format(context=separator.join(snippets), query=query_string)

    def query_genai(self,genai,chat_model,query):
        """Send ``query`` to the model named ``chat_model`` and return the response text."""
        return genai.GenerativeModel(chat_model).generate_content(query).text


In [None]:
class Env:
    """Central place for model names, index settings and API-client setup."""

    def __init__(self):
        pass

    def get_embading_model(self) -> str:
        """Return the embedding model name."""
        return 'models/embedding-001'

    def get_genai_model_name(self) -> str:
        """Return the chat model name."""
        return 'gemini-pro'

    def get_db_index_name(self) -> str:
        """Return the name of the vector index."""
        return "vector-global"

    def get_db_index_dimention(self) -> int:
        """Return the vector dimension used when creating the index."""
        return 768

    def get_vector_db_metrix(self) -> str:
        """Return the similarity metric used when creating the index."""
        return "cosine"

    def _read_secret(self, name: str) -> str:
        """Return the secret ``name`` from the environment, else from Colab secrets."""
        import os

        value = os.environ.get(name)
        if value:
            return value
        return userdata.get(name)

    def init_genai(self):
        """Configure the Gemini SDK with ``GOOGLE_API_KEY`` and return the module."""
        # SECURITY: keys must never be written into the notebook. The keys that
        # were previously hard-coded here are exposed and should be revoked.
        genai.configure(api_key=self._read_secret('GOOGLE_API_KEY'))
        return genai

    def init_vectordb(self):
        """Create and return a Pinecone client authenticated with ``PINE_CONE_API_KEY``."""
        return Pinecone(api_key=self._read_secret('PINE_CONE_API_KEY'))

In [None]:
class RAG:
    """Facade over the extract, load, retrieve, augment and generate steps.

    The instance only stores the clients and settings it is given; each method
    creates the helper object (Utils, Extractor, Loader, VectorQuery, LLMQuery)
    it needs.
    """

    def __init__(self, getai, vector_db, embading_model, vector_db_index_name, chat_model, genai_query_template, vector_db_dimension, vector_db_metrix, db_index):
        """Store the clients and settings.

        Args:
            getai: Generative-AI client used for embeddings and chat. The
                parameter name is kept as-is for existing callers.
            vector_db: Vector database client.
            embading_model: Embedding model name.
            vector_db_index_name: Name of the vector index.
            chat_model: Chat model name.
            genai_query_template: Prompt template with ``{context}`` and
                optionally ``{query}``.
            vector_db_dimension: Dimension used when creating the index.
            vector_db_metrix: Metric used when creating the index.
            db_index: Index handle used by ``upsert_data``.
        """
        # Use the injected client; the argument used to be ignored in favour of
        # the module-level ``genai`` name.
        self.genai = getai
        self.vector_db = vector_db
        self.embading_model = embading_model
        self.vector_db_index_name = vector_db_index_name
        self.chat_model = chat_model
        self.genai_query_template = genai_query_template
        self.vector_db_dimension = vector_db_dimension
        self.vector_db_metrix = vector_db_metrix
        self.db_index = db_index

    # All extractor related functions

    def sanitise_csv_data(self, data: pd.DataFrame, colums_to_return: list) -> pd.DataFrame:
        """Delegate to ``Utils.sanitise_csv_data``."""
        return Utils().sanitise_csv_data(data, colums_to_return)

    def generate_csv_id_vector_meta(self, data: pd.DataFrame, file_name: str, id_colum_name: str, vector_template: str, colums_to_vector: list, meta_template: str, colums_to_meta: list, meta_info: dict = None, shouldFomatVector: bool = True, shouldFomatMeta: bool = True) -> pd.DataFrame:
        """Delegate to ``Extractor.generate_id_vector_meta`` with the same arguments."""
        return Extractor().generate_id_vector_meta(
            data,
            file_name,
            id_colum_name,
            vector_template,
            colums_to_vector,
            meta_template,
            colums_to_meta,
            meta_info,
            shouldFomatVector,
            shouldFomatMeta,
        )

    def extract_csv_data(self, file_name: str, data: pd.DataFrame, id_colum_name: str, colums_to_return: list, meta_info: dict = None, shouldFomatVector: bool = True, shouldFomatMeta: bool = True) -> pd.DataFrame:
        """Sanitise ``data`` and build its ``id`` / ``vector_data`` / ``meta`` frame.

        Both the vector text and the metadata text are built from
        ``colums_to_return``.
        """
        sanitized_data = self.sanitise_csv_data(data, colums_to_return)
        # Only the returned columns exist after sanitising, so format those;
        # using the raw frame's columns fails whenever a column was left out.
        columns = list(colums_to_return)
        return self.generate_csv_id_vector_meta(
            sanitized_data,
            file_name,
            id_colum_name,
            "vector_template",
            columns,
            "meta_template",
            columns,
            meta_info,
            shouldFomatVector,
            shouldFomatMeta,
        )

    def extract_data(self, file_name: str, data: pd.DataFrame, id_colum_name: str, colums_to_return: list) -> pd.DataFrame:
        """Same as ``extract_csv_data`` with no extra metadata and both formats on."""
        # Delegating supplies the ``meta_info`` argument the extractor requires;
        # the direct call here omitted it.
        return self.extract_csv_data(file_name, data, id_colum_name, colums_to_return)

    # All db related functions

    def create_vector_db_index(self):
        """Create the configured index with the configured dimension and metric."""
        Loader().create_db_index(
            self.vector_db,
            self.vector_db_index_name,
            self.vector_db_dimension,
            self.vector_db_metrix,
        )

    def drop_vector_db_index(self):
        """Delete the configured index."""
        Loader().drop_db_index(self.vector_db, self.vector_db_index_name)

    def upsert_data(self, data):
        """Upsert ``data`` through the stored index handle."""
        Loader().upsert_data_with_index(self.db_index, data)

    # All loader related functions

    def get_embaded_csv_data(self, data_fame) -> pd.DataFrame:
        """Embed ``data_fame`` with the configured client and embedding model."""
        return Loader().embade(self.genai, self.embading_model, data_fame)

    def get_csv_vector_data(self, data_fame):
        """Convert an embedded frame into upsert tuples."""
        return Loader().create_vector(data_fame)

    def load_csv_data(self, data: pd.DataFrame):
        """Embed ``data``, convert it to upsert tuples and upsert via the stored handle."""
        # get_embaded_csv_data takes only the frame; client and model come from self.
        embaded_data = self.get_embaded_csv_data(data)
        vactor = self.get_csv_vector_data(embaded_data)
        self.upsert_data(vactor)

    def load_data(self, data: pd.DataFrame):
        """Embed ``data`` and upsert it into the index looked up by name."""
        loader = Loader()
        embaded_data = loader.embade(self.genai, self.embading_model, data)
        vactor = loader.create_vector(embaded_data)
        loader.upsert_data(self.vector_db, self.vector_db_index_name, vactor)

    # All RAG related functions

    def retrive(self, query: str, result_count: int):
        """Embed ``query`` and return the ``result_count`` nearest records with metadata."""
        vector_factory = VectorQuery()
        embaded_query = vector_factory.create_embaded_query(self.genai, self.embading_model, query)
        db_index = self.vector_db.Index(self.vector_db_index_name)
        return vector_factory.query_vector_db_with_meta(db_index, embaded_query, result_count)

    def augment(self, vactor_db_result, query):
        """Build the LLM prompt from the retrieval result and the query."""
        return LLMQuery().format_genai_query(self.genai_query_template, vactor_db_result, query)

    def generate(self, llm_prompt):
        """Send ``llm_prompt`` to the chat model and return its text answer."""
        return LLMQuery().query_genai(self.genai, self.chat_model, llm_prompt)

    def query(self, query, result_count):
        """Run retrieve, augment and generate; prints the intermediate values."""
        db_result = self.retrive(query, result_count)
        print("db_result-> "+str(db_result))
        prompt = self.augment(db_result, query)
        print("prompt-> "+str(prompt))
        return self.generate(prompt)





In [None]:
# Configure the generative-AI SDK and create the vector-database client.
ai = Env().init_genai()
db = Env().init_vectordb()
# Names of the embedding model, the vector index and the chat model.
embading_model = Env().get_embading_model()
db_index_name = Env().get_db_index_name()
chat_model = Env().get_genai_model_name()

# Index dimension/metric (read by RAG only when it creates the index) and the prompt template.
db_dimension = Env().get_db_index_dimention()
db_metrix = Env().get_vector_db_metrix()
ai_query_template = Utils().get_genai_template()
# Handle to the index, looked up by name (this does not create it).
db_index=db.Index(db_index_name)

In [None]:
# Facade used by the cells below, plus a Utils instance for the sanitising helpers.
rag = RAG(ai,db,embading_model,db_index_name,chat_model,ai_query_template,db_dimension,db_metrix,db_index)
utils = Utils()

In [None]:
#rag.create_vector_db_index()

In [None]:
# End-to-end query: retrieve the 3 nearest records, build the prompt, ask the chat model.
queryString="List top issue related to xbox"
#queryString="what is scala"
result = rag.query(queryString,3)
print(result)

db_result-> {'matches': [{'id': 'cstickets_23',
              'metadata': {'text': "'Ticket ID'= 23, \n"
                                   "'Customer Name'= Stephanie Nelson DVM, \n"
                                   "'Customer Email'= ljohnson@example.org, \n"
                                   "'Customer Age'= 54, \n"
                                   "'Customer Gender'= Female, \n"
                                   "'Product Purchased'= Xbox, \n"
                                   "'Date of Purchase'= 2020-02-11, \n"
                                   "'Ticket Type'= Cancellation request, \n"
                                   "'Ticket Subject'= Delivery problem, \n"
                                   "'Ticket Description'= I'm having an issue "
                                   'with the Xbox. Please assist. (And if need '
                                   'be this time, that could help.)  1.3.2.1 '
                                   'Update my version to 3.0 or more. The '
 

In [None]:
# Read the raw support-ticket CSV (path under /content/drive — presumably a mounted Google Drive).
csv_data_raw=pd.read_csv('/content/drive/MyDrive/datasets/customer_support_tickets.csv')

In [None]:
# Keep only the first 50 rows for this run.
csv_data_raw = csv_data_raw.head(50)

In [None]:
# Use every column of the ticket CSV except the two timing columns.
columns_to_remove = ['First Response Time','Time to Resolution']
original_columns = csv_data_raw.columns.tolist()
columns_to_use = [column for column in original_columns if column not in columns_to_remove]

In [None]:
# Fill missing values and resolve the {product_purchased} placeholder in the selected columns.
csv_data_sanitised=utils.sanitise_csv_data(csv_data_raw,columns_to_use)

In [None]:
# Constant key/value pairs appended to every ticket's metadata text.
meta_info={
    'reference':'Customer Support Tickets',
    'type':'csv',
}
# Ids become "cstickets_<Ticket ID>"; vector text and metadata text both use columns_to_use.
csv_id_vector_meta=rag.generate_csv_id_vector_meta(csv_data_sanitised,"cstickets","Ticket ID","vector_template",columns_to_use,"meta_template",columns_to_use,meta_info)



In [None]:
# Display the id / vector_data / meta frame for inspection.
csv_id_vector_meta

In [None]:
# Embed each ticket's vector_data (one embed_content call per row).
csv_id_embade_meta=rag.get_embaded_csv_data(csv_id_vector_meta)

In [None]:
# Convert the embedded rows into (id, vector, {"text": meta}) tuples for upsert.
csv_vector=rag.get_csv_vector_data(csv_id_embade_meta)

In [None]:
# Write the ticket vectors to the index through the stored index handle.
rag.upsert_data(csv_vector)

In [None]:
# Load the PDFs in this folder and split them into text chunks, one row per chunk.
pdf_data_sanitised=utils.sanitise_pdf_data('/content/drive/MyDrive/datasets/pdfs')



In [None]:
# Column names of the PDF chunk frame.
pdf_data_cols=pdf_data_sanitised.columns.tolist()

In [None]:
# Display the column names.
pdf_data_cols

['id', 'page_content', 'reference', 'type']

In [None]:
# PDF chunks: ids get the "pdf_" prefix; vector text and metadata text are both the bare
# page_content (both formatting flags are False and no extra meta_info is appended).
pdf_id_vector_meta=rag.generate_csv_id_vector_meta(pdf_data_sanitised,"pdf","id","vector_template",['page_content'],"meta_template",['page_content'],None,False,False)


In [None]:
# Display the id / vector_data / meta frame for the PDF chunks.
pdf_id_vector_meta

In [None]:
# Embed each PDF chunk's vector_data (one embed_content call per row).
pdf_id_embade_meta=rag.get_embaded_csv_data(pdf_id_vector_meta)

In [None]:
# Convert the embedded chunks into (id, vector, {"text": meta}) tuples for upsert.
pdf_vector=rag.get_csv_vector_data(pdf_id_embade_meta)

In [None]:
# Write the PDF chunk vectors to the index through the stored index handle.
rag.upsert_data(pdf_vector)