In [102]:
import tiktoken
from langchain_google_community import BigQueryLoader
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
#from langchain.llms import VertexAI as langchain_vertexai
from langchain_google_vertexai import VertexAI as langchain_vertexai
from langchain import PromptTemplate
from pathlib import Path as p
import pandas as pd
 


vertex_llm_text = langchain_vertexai(model_name="gemini-1.5-pro-002")


def estimate_token_length(text, model="gpt2"):
    """Estimates the token length of a given text using a specified model.

      Args:
        text: The input text.
        model: The model to use for tokenization (default: "gpt2").

      Returns:
        The estimated number of tokens.
      """

  
    enc = tiktoken.get_encoding(model)  

    # Tokenize the text and count tokens
    tokens = enc.encode(text)
    token_count = len(tokens)
    return token_count

def get_data(source_query_str: str=None,metadata_columns: str=None,page_content_columns: str=None, project_id: str=None, return_only_text:bool=False):
    
    loader = BigQueryLoader(
            query=source_query_str, project=project_id, metadata_columns=metadata_columns, page_content_columns=page_content_columns
        )
    documents = []
    all_texts=[]
    documents.extend(loader.load())
    if return_only_text:  
        all_texts=[doc.page_content.replace('description:',"",1) for doc in documents]
        
    return documents, '\n'.join(all_texts)

 
def summarize_docs(documents: list[object],question_prompt_template: str="", refine_prompt_template: str="" ,is_token_limit_exceeded: bool=False ):
    
    print(question_prompt_template)
    print(refine_prompt_template)
    question_prompt = PromptTemplate(template=question_prompt_template, input_variables=["text"])    
    if not is_token_limit_exceeded:        
        #if the token limit is in the context window range, use a stuffing method for summary
        chain = load_summarize_chain(vertex_llm_text, chain_type="stuff", 
                                     prompt=question_prompt)

    else:
        print('you are here')   
        refine_prompt = PromptTemplate(input_variables=["existing_answer", "text"], template=refine_prompt_template)
        print(refine_prompt_template)
        print('*****')  
        print(refine_prompt)
        
        # chain = load_summarize_chain(
        #     vertex_llm_text,
        #     chain_type="refine",
        #     question_prompt=question_prompt,
        #     refine_prompt=refine_prompt,
        #     return_intermediate_steps=False,
        #   )
        
    return chain.invoke(documents)


In [96]:
# Set the Gemini 1.5 Pro context window limit
context_window_limit = 200#2000000
PROJECT_ID = "nine-quality-test"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}
source_query_str="select distinct combined_id,unique_id,content, chunk, trim(concat(ifnull(headline,''), CHR(10),  description)) as description from `nine-quality-test.vlt_media_embeddings_integration.vlt_all_media_content_text_embeddings` order by unique_id, chunk asc "

#this is the main prompt for summary
question_prompt_template = """
    You will be given different parts of texts. Provide a summary of the following text. Your result must be detailed and at least 2 paragraphs. 
    When summarizing, directly dive into the narrative or descriptions from the text without using introductory phrases like 'In this passage'. 
    Directly address the main events, characters, and themes, encapsulating the essence and significant details from the text in a flowing narrative. 
    The goal is to present a unified view of the content, continuing the story seamlessly as if the passage naturally progresses into the summary.

    TEXT: {text}
    SUMMARY:
"""

refine_prompt_template = (
    "Your job is to produce a final summary. Your task is to combine and refine these summaries into a final, comprehensive summary that covers all key events, characters, themes, and details.\n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary"
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
    "If the context isn't useful, return the original summary."
)
metadata_columns=["combined_id"]
page_content_columns=["description"]
documents, all_texts=get_data(source_query_str=source_query_str,metadata_columns=metadata_columns,page_content_columns=page_content_columns, project_id=PROJECT_ID, return_only_text=True)
    
 


In [104]:
#documents,all_texts=get_data(source_query_str,metadata_columns,page_content_columns,PROJECT_ID,return_only_text=True)
# Estimate the token length
estimated_token_length = estimate_token_length(all_texts,'cl100k_base') #cl100k_base
print(estimated_token_length)
message=""
is_token_limit_exceeded=False
if estimated_token_length > context_window_limit:
  message="Your text is too long for the Gemini 1.5 Pro context window. We are trying to chunk and return the result."
  is_token_limit_exceeded=True
  summary=summarize_docs(documents=documents,question_prompt_template=question_prompt_template,is_token_limit_exceeded=is_token_limit_exceeded )
else:
  message="Your text fits within the Gemini 1.5 Pro context window."
  summary=summarize_docs(documents=documents,question_prompt_template=question_prompt_template,refine_prompt_template=refine_prompt_template,is_token_limit_exceeded=is_token_limit_exceeded )
 
 

18027

    You will be given different parts of texts. Provide a summary of the following text. Your result must be detailed and at least 2 paragraphs. 
    When summarizing, directly dive into the narrative or descriptions from the text without using introductory phrases like 'In this passage'. 
    Directly address the main events, characters, and themes, encapsulating the essence and significant details from the text in a flowing narrative. 
    The goal is to present a unified view of the content, continuing the story seamlessly as if the passage naturally progresses into the summary.

    TEXT: {text}
    SUMMARY:


you are here

*****
input_variables=[] input_types={} partial_variables={} template=''


UnboundLocalError: local variable 'chain' referenced before assignment

In [99]:
refine_prompt_template

"Your job is to produce a final summary. Your task is to combine and refine these summaries into a final, comprehensive summary that covers all key events, characters, themes, and details.\nWe have provided an existing summary up to a certain point: {existing_answer}\nWe have the opportunity to refine the existing summary(only if needed) with some more context below.\n------------\n{text}\n------------\nGiven the new context, refine the original summaryIf the context isn't useful, return the original summary."

In [87]:
question_prompt = PromptTemplate(template=question_prompt_template, input_variables=["text"])    
refine_prompt = PromptTemplate(input_variables=["existing_answer", "text"], template=refine_prompt_template)

chain = load_summarize_chain(
    vertex_llm_text,
    chain_type="refine",
    question_prompt=question_prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=False,
)

In [None]:
summary['output_text']

In [None]:
 sql = f"""  
         WITH search_results AS
         (
              SELECT
              search_results.base.content as content,  
              search_results.base.combined_id as combined_id,
              search_results.base.unique_id,
              distance,  -- The computed distance (similarity score) between the embeddings
              search_results.base.asset_id,
              search_results.base.headline,
              ifnull(search_results.base.html_safe_text,search_results.base.description) as description,
              search_results.base.startOffset_seconds,
              search_results.base.endOffset_seconds,
              search_results.base.fileUri,
              search_results.base.asset_type,
              ROW_NUMBER() OVER (PARTITION BY  search_results.base.asset_id ORDER BY distance ASC) AS rank_within_document  -- Rank by distance within each document
              
            FROM
              VECTOR_SEARCH(     
                TABLE `{dataset}.{table}`, --source embedding table
                '{source_embedding_column}',  -- Column with the embedding vectors in the base table

                -- Use the query embedding computed in the previous step
                 (SELECT {json.dumps(query_embedding)} query_embedding),  -- The query embedding from the CTE (query_embedding)

                -- Return top-k closest matches (adjust k as necessary)
                top_k =>{ top_k  }, -- Top k most similar matches based on distance
                distance_type => 'COSINE',
                options => {options}                   
              ) search_results              
          ),          

             -- Step 2: Aggregate relevance per document (original_document_id)
            ranked_documents AS (
                SELECT
                    asset_id,        
                    MIN(distance) AS min_distance  -- Alternatively, you can use the average distance
                FROM search_results
                GROUP BY asset_id
            )

            -- Step 4: Retrieve the top-k ranked documents based on relevance
            SELECT * FROM (
              SELECT  
                sr.asset_id,  
                sr.headline,
                sr.description,
                sr.combined_id,
                sr.unique_id,
                sr.fileUri,
                sr.asset_type,
                sr.min_distance,
                ROW_NUMBER() OVER (PARTITION BY SR.asset_id ORDER BY min_distance ASC) AS IDX,
                STRING_AGG(CONCAT("""+"'{startOffset_seconds:', sr.startOffset_seconds, ',endOffset_seconds:', sr.endOffset_seconds, '}')"""+f""", ", " ) 
                OVER (PARTITION BY sr.asset_id ORDER BY sr.startOffset_seconds) AS time_lines
                --sr.distance,
                --final_rank--,
               -- rank_within_document
            FROM search_results sr
            JOIN ranked_documents rd ON sr.asset_id = rd.asset_id
            WHERE rd.final_rank <= {top_k} -- Return the top-k documents based on chunk relevance      
            --and sr.asset_id like '%00261507986b0faf31c775597d2d24beb4381e43%'
            ORDER BY rd.final_rank, sr.rank_within_document  -- Order by document relevance and chunk rank
            )
            WHERE IDX=1
    """       
    print(sql)
    bq_client = bigquery.Client(project_id)
  
    # Run the query
    query_job = bq_client.query(sql)

    # Fetch results
    results = query_job.result()
    
    output=[]
    for row in results:
        output.append({'asset_id':row['asset_id'], 'headline':row['headline'],'description':row['description'],'fileUri':row['fileUri'], "time_lines":row['time_lines'], "asset_type":row["asset_type"]})

    
    end_time = time.time()

    # Calculate the elapsed time
    elapsed_time = end_time - start_time
    print(elapsed_time)
    return output