### reference
https://partner.cloudskillsboost.google/course_templates/948/video/485086

In [None]:
!pip3 install --upgrade --user google-cloud-aiplatform umap-learn tqdm pypdf

In [73]:
from vertexai.preview.generative_models import GenerativeModel
from vertexai.language_models import TextEmbeddingModel
from vertexai.preview.generative_models import (
    Content,
    GenerationConfig,
    GenerationResponse,
    GenerativeModel,
    Image,
    Part as GenerativeModelPart,
    HarmBlockThreshold,
    HarmCategory,
)

text_embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-004")
model = GenerativeModel('gemini-pro')

In [44]:
import pandas as pd

data=pd.read_csv('testdata.csv',encoding='latin1')
data

Unnamed: 0,title,content
0,intelia ethos,\t\n#delightthecustomer\n\nOur customers' succ...
1,Migrating Data from Legacy Enterprise Data War...,Migrating data from a legacy on-prem data ware...
2,Data is at our core. It's what we're passionat...,"intelia was founded in 2018, with a desire to ..."
3,intelia has a unique technology partner ecosys...,In an increasingly fast-paced and competitive ...
4,? Celebrating Excellence at intelia! ?,"ast night, we came together as hashtag#onetrib..."
5,Direnc Uysal's linked-in post,Amidst the excitement surrounding Generative A...
6,intelia's post on 22rd december,"As 2024 comes to a close, the intelia hashtag#..."


In [45]:
from google.cloud import storage
import tempfile, shutil
import time
from datetime import datetime
import json
from google.cloud import bigquery 
import os 
import gcsfs
import pandas as pd

def upload_file(request_file : tempfile,dest_bucket_name:str =None,request_file_folder: str =None,request_file_prefix: str =None, version: int=0, request_file_post_fix : str=""):

    """upload file into gcs
   
        Args:
            tempfile request_file: request file
            str dest_bucket_name:  name of destination bucket
            str request_file_folder: name of the destination folder name to write files to
            list request_file_prefix: prefix of request file name
          
    """

    temp=request_file
    client = storage.Client()
    # Extract name to the temp file
    temp_file = "".join([str(temp.name)])
    # Uploading the temp image file to the bucket
    dest_filename = f"{request_file_folder}/"+request_file_prefix+'_'+request_file_post_fix+'_'+str(version)+".json" 
    dest_bucket = client.get_bucket(dest_bucket_name)
    dest_blob = dest_bucket.blob(dest_filename)
    dest_blob.upload_from_filename(temp_file)      

In [49]:
data

Unnamed: 0,title,content
0,intelia ethos,\t\n#delightthecustomer\n\nOur customers' succ...
1,Migrating Data from Legacy Enterprise Data War...,Migrating data from a legacy on-prem data ware...
2,Data is at our core. It's what we're passionat...,"intelia was founded in 2018, with a desire to ..."
3,intelia has a unique technology partner ecosys...,In an increasingly fast-paced and competitive ...
4,? Celebrating Excellence at intelia! ?,"ast night, we came together as hashtag#onetrib..."
5,Direnc Uysal's linked-in post,Amidst the excitement surrounding Generative A...
6,intelia's post on 22rd december,"As 2024 comes to a close, the intelia hashtag#..."


In [52]:
request_file = tempfile.NamedTemporaryFile(suffix=".json", delete=True) 
rf= open(request_file.name, "a") 

for idx,row in data.iterrows():
    data_items=[ json.dumps( {  "_id": idx,
                                "title":row['title'],
                                 "text":row['content']                                          
                               }
                               )  +"\n"
                ]

    rf.writelines(data_items)
    rf.flush()
upload_file(rf,dest_bucket_name='vlt_search_and_contet_output',request_file_folder='finetunning',request_file_prefix='fine_tuning_intelia_sample', version=0, request_file_post_fix ='0')


In [81]:
def generate_queries(model, title, text):

    prompt = (f'You are a human being who is tasked with extracting some questions from a given text.\n'
    f'Think throughly when reading this text including both title and content. Think of questions may people ask to know more about a data consultancy called Intelia.\n'
    f'Suggest 1 to 5 short questions and 2-3 labels or short expressions that are important to ask when people want to explore who Intelia is and what it does. These expressions or questions should be optimized for fine tuning a text embedding model.\n'    
    f'Output each response on a separate line divided by a newline.\n'
    f'Here is the text:\n'
    f'"title": {title}\n'
    f'"content":{text}')
    
    responses = model.generate_content(prompt, stream=False)
    return responses.text.split('\n')


def get_autorater_response(llm_model: str="gemini-1.5-pro", title="", text="", query="") -> dict:
        
        """Extract evaluation metric on a AI-generated content using a AI-as-judge approach
        
        Args:
        list metric_prompt: the input metric prompt parameters
        str llm_model: evaluation model

        Returns:
        dict response_json: the evaluated metric in json format
        """
            
        # set evaluation metric schema
        metric_response_schema = {
            "type": "OBJECT",
            "properties": {
                "score": {"type": "NUMBER"},
                "explanation": {"type": "STRING"},
            },
            "required": ["score", "explanation"],
        }     
        #define a generative model as an autorator
        autorater = GenerativeModel(
            llm_model,
            generation_config=GenerationConfig(
                response_mime_type="application/json",
                response_schema=metric_response_schema,
            ),
            safety_settings={
                HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            },
        )

        metric_prompt=  (f'You are a human being who is tasked with finding the answer of a given question within an aricle.\n'
                        f'Read the given question and analyse the given artcile to see if you can find the answer to the question within this aricle.\n'
                        f'Use this rating rubric to give a relevancy score for your analysis:\n'
                        f'"Rating Rubric:\n'
                        f'0: No relevancy. The document has no connection to the question. It lacks relevant keywords, topics, or context. \n'
                        f'1: Low Relevance. The document contains some keywords but does not provide meaningful information related to the question. The context is weak or unrelated. \n'
                        f'2: Moderate Relevance. The document partially addresses the question but lacks completeness, clarity, or depth. It may require inference to extract an answer.\n'
                        f'3	High Relevance. The document directly and clearly answers the question with specific, well-aligned information. Strong contextual alignment.\n'

                        f'Question:\n'
                        f'{query}\n'                      
                        f'Article:\n'                    
                        f'"title": {title}\n'
                        f'"content":{text}')
    
        #generate the rating metrics as per requested measures and metric in the prompt
        response = autorater.generate_content([metric_prompt])

        response_json = {}

        if response.candidates and len(response.candidates) > 0:
            candidate = response.candidates[0]
            if (
                candidate.content
                and candidate.content.parts
                and len(candidate.content.parts) > 0
            ):
                part = candidate.content.parts[0]
                if part.text:
                    response_json = json.loads(part.text)

        return response_json

In [63]:
generated_queries=[]
request_file = tempfile.NamedTemporaryFile(suffix=".json", delete=True) 
rf= open(request_file.name, "a") 

for idx,row in data.iterrows():
    queries=generate_queries(model, row['title'], row['content']) 
    print(queries)
    

["What is Intelia's core philosophy and how does it impact their approach to customer success?", '', 'How does Intelia balance its focus on customer delight with the well-being of its employees?', '', 'What specific actions does Intelia take to foster a passionate and enjoyable work environment?', '', 'How does Intelia leverage diversity and collaboration to achieve its goals?', '', 'What sets Intelia apart from other data consultancies in the market?']

['## Questions about Intelia:', '', '1. **What specific types of data migrations does Intelia specialize in?**', '', '2. **Does Intelia offer end-to-end data migration services, including planning, execution, and post-migration support?**', '', '3. **What cloud platforms is Intelia most experienced with and certified in?**', '', '4. **Can Intelia provide case studies or examples of successful data migration projects they have completed?**', '', "5. **What is Intelia's pricing model for data migration services?**"]

['## Intelia: Key Qu

In [71]:
q1=["What is Intelia's core philosophy and how does it impact their approach to customer success?",  'How does Intelia balance its focus on customer delight with the well-being of its employees?',  'What specific actions does Intelia take to foster a passionate and enjoyable work environment?',  'How does Intelia leverage diversity and collaboration to achieve its goals?',  'What sets Intelia apart from other data consultancies in the market?']

q2=['## Questions about Intelia:',  '1. **What specific types of data migrations does Intelia specialize in?**',  '2. **Does Intelia offer end-to-end data migration services, including planning, execution, and post-migration support?**',  '3. **What cloud platforms is Intelia most experienced with and certified in?**',  '4. **Can Intelia provide case studies or examples of successful data migration projects they have completed?**',  "5. **What is Intelia's pricing model for data migration services?**"]

q3=['## Intelia: Key Questions',  '1. **What specific data-driven opportunities does Intelia help its clients unlock?** ', '2. **Beyond data analysis, what other services does Intelia provide to its clients?**', "3. **Can you elaborate on Intelia's unique approach to data consultancy?**", '4. **How does Intelia tailor its services to address the specific challenges of each client?**', '5. **What are some examples of successful projects Intelia has completed for its clients?** ']

q4=['What types of businesses does Intelia typically work with?', 'What makes Intelia different from other cloud data platform providers?', "How does Snowflake's cloud-first architecture benefit businesses?", 'How can Intelia assist with planning a cloud or multi-cloud strategy? ', 'How does Intelia help clients focus on business value rather than data warehousing complexity']

q5=['## Questions about Intelia:',  '1. What specific data consultancy services does Intelia offer?', '2. What kind of impact does Intelia make for its customers?', '3. How does Intelia foster a culture of innovation and excellence within its team?', "4. What are Intelia's plans for the future, especially in the context of the evolving AI and data landscape?", "5. What are some examples of projects or achievements that demonstrate Intelia's expertise and success? "]

q6=['1. What specific data governance services does Intelia offer to help businesses succeed with their machine learning solutions? ', '2. How does Intelia ensure compliance and facilitate collaboration when implementing data governance strategies?', '3. Can Intelia provide examples of how they have helped businesses improve their data management and avoid the need for significant changes down the line?', '4. What are the key benefits of partnering with Intelia for data governance, particularly in the context of Generative AI adoption?', '5. Does Intelia offer tailored solutions for different industries or specific data-driven challenges? ', '']

q7=['- What services does Intelia provide?', "- What are some of Intelia's major accomplishments in 2024?", "- What are some of Intelia's core values?", "- What are Intelia's goals for 2025?", '- What is the meaning of the hashtag "#onetribe"?']

queries=q1+q2+q3+q4+q5+q6+q7

qid=0
for query in queries:
        if query!="":
            generated_queries=[ json.dumps( {  "_id": qid,  
                                               "text":query                                      
                                         }
                                       )  +"\n"
                        ]
        
            rf.writelines(generated_queries)
            rf.flush()
            qid +=1

upload_file(rf,dest_bucket_name='vlt_search_and_contet_output',request_file_folder='finetunning',request_file_prefix='fine_tuning_query_sample', version=0, request_file_post_fix ='0')

    

In [79]:
qid=0
relevancy=[]

import time

try:
    for query in queries:
        for idx,row in data.iterrows():
            r=get_autorater_response(title=row['title'], text=row['content'], query=query) 
            relevancy.append([qid,idx,r.get("score", "")])
            time.sleep(2)
        qid=qid+1
except:
    print(qid)


# Create DataFrame with column names
df = pd.DataFrame(data, columns=["ID", "Name", "Age"])

# Display the DataFrame
print(df)

    query-id  corpus-id   score
        

gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5-pro
gemini-1.5

ServiceUnavailable: 503 502:Bad Gateway