In [1]:
import json
import pandas as pd
import os
import google.generativeai as genai
import types
from dotenv import load_dotenv
from tqdm import tqdm
import os
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import Dataset
import random
from collections import defaultdict

In [14]:
# Step 1: Load environment variables and configure the Gemini client.
load_dotenv()
# NOTE(review): os.getenv returns None when API_KEY is unset; confirm how
# genai.configure treats a missing key in the installed SDK version.
api_key = os.getenv('API_KEY')
genai.configure(api_key=api_key)

# First listed model that supports tuning and has "flash" in its name.
# next() stops at the first match and, with a default, lets us raise a
# descriptive error instead of a bare "list index out of range".
base_model = next(
    (
        m for m in genai.list_models()
        if "createTunedModel" in m.supported_generation_methods and "flash" in m.name
    ),
    None,
)
if base_model is None:
    raise LookupError(
        'No model returned by genai.list_models() supports "createTunedModel" '
        'and has "flash" in its name.'
    )

# Model used by the judging prompts in the cells below. Only the last
# expression of a cell is displayed, so `model` is the one shown here.
model = genai.GenerativeModel('gemini-1.5-flash')
model


genai.GenerativeModel(
    model_name='models/gemini-1.5-flash',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)

In [None]:
# Parse 2024_dataset/train_downloaded.json and show the length of the
# top-level JSON value as the cell output.
json_file = os.path.join("2024_dataset", "train_downloaded.json")
with open(json_file, encoding='utf-8') as f:
    data = json.loads(f.read())
len(data)

In [13]:
# Build one {'text_input', 'output'} record per entry in `data`.
dataset = []
for values in data:
    # Use the query content when it is non-empty and not the '[deleted]'
    # placeholder; otherwise fall back to the query title.
    question = values['query_content_en']
    if not question or question == '[deleted]':
        question = values['query_title_en']

    # Only the first response of each entry is used as the target output.
    temp = defaultdict(
        str,
        text_input=question,
        output=values['responses'][0]['content_en'],
    )
    dataset.append(temp)


In [19]:
# Experiment: ask the model to score a generated response against a baseline
# that states the opposite ("sounds like" vs. "does not sound like").
# NOTE(review): the prompt text (including the "eachother" typo) is kept
# verbatim so it matches the recorded output.
prompt = """
Answer the question based on the context

Context: I will give you two sentences. The first sentence will be a generated response. The second sentence is the baseline response. 
I want you to judge how close these sentences are to eachother. Return a continuous value between [0,1] inclusive where 0 is a fully incorrect response and 1 is a fully correct response. Also provide chain of thought.

Generated Response: This sounds like eczema to me.
Baseline Response: This does not sound like eczema to me.
"""

# temperature 0 is passed on every judging call in this notebook.
response = model.generate_content(
    prompt,
    generation_config=dict(temperature=0),
)
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Score: 0.0\n\nChain of thought: The generated response and the baseline response are diametrically opposed.  They express completely contradictory opinions.  There is no overlap or similarity between \"This sounds like eczema\" and \"This does not sound like eczema.\"  Therefore, the similarity score must be 0.\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.04349698081161037
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 109,
        "candidates_token_count": 66,
        "total_token_count": 175
      }
    }),
)

In [20]:
# Experiment: same instructions as the previous prompt, but the baseline is
# the generated sentence with the word "sounds" removed.
# NOTE(review): prompt text (including the "eachother" typo) is kept verbatim
# so it matches the recorded output.
prompt = """
Answer the question based on the context

Context: I will give you two sentences. The first sentence will be a generated response. The second sentence is the baseline response. 
I want you to judge how close these sentences are to eachother. Return a continuous value between [0,1] inclusive where 0 is a fully incorrect response and 1 is a fully correct response. Also provide chain of thought.

Generated Response: This sounds like eczema to me.
Baseline Response: This like eczema to me.
"""

response = model.generate_content(
    prompt,
    generation_config=dict(temperature=0),
)
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Similarity score: 0.8\n\nChain of thought:\n\nBoth sentences express the same opinion (\"This sounds/looks like eczema to me\").  The generated response is grammatically correct, while the baseline response omits the word \"sounds,\" making it slightly less formal but still conveying the same meaning. The difference is minor, primarily in grammatical correctness and word choice (\"sounds\" vs. \"looks\").  The core meaning remains identical.  Therefore, a score of 0.8 reflects the high degree of semantic similarity despite the minor grammatical difference.\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.13209404816498627
        }
      ],
      "usage_metadata": {
   

In [24]:
# Experiment: same sentence pair as the missing-word case, but the
# instructions now ask for *contextual* similarity and tell the model to
# "use only context" when judging.
# NOTE(review): prompt text (including the "eachother" typo) is kept verbatim
# so it matches the recorded output.
prompt = """
Answer the question based on the context

Context: I will give you two sentences. The first sentence will be a generated response. The second sentence is the baseline response. 
I want you to judge how contextually similar these sentences are to eachother. Return a continuous value between [0,1] inclusive where 0 is a fully incorrect response and 1 is a fully correct response. Also provide chain of thought. Use only context to judge how similar these two sentences are.

Generated Response: This sounds like eczema to me.
Baseline Response: This like eczema to me.
"""

response = model.generate_content(
    prompt,
    generation_config=dict(temperature=0),
)
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Similarity score: 0.8\n\nChain of thought:\n\nBoth sentences express the same opinion: that the described symptoms sound like eczema.  The generated response is grammatically correct (\"This sounds like eczema to me\"), while the baseline response omits the \"s\" in \"sounds,\" making it slightly less grammatically correct (\"This like eczema to me\"). However, the core meaning and intent remain identical. The grammatical error in the baseline response is minor and doesn't significantly alter the meaning.  Therefore, a high similarity score is appropriate.\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.1002615668556907
        }
      ],
      "usage_metadata": {
    

In [25]:
# Experiment: contextual-similarity instructions again, this time with a
# baseline that hedges and points to dry skin rather than naming eczema.
# NOTE(review): prompt text (including the "eachother" typo) is kept verbatim
# so it matches the recorded output.
prompt = """
Answer the question based on the context

Context: I will give you two sentences. The first sentence will be a generated response. The second sentence is the baseline response. 
I want you to judge how contextually similar these sentences are to eachother. Return a continuous value between [0,1] inclusive where 0 is a fully incorrect response and 1 is a fully correct response. Also provide chain of thought. Use only context to judge how similar these two sentences are.

Generated Response: This sounds like eczema to me.
Baseline Response: Not 100% certain on the prognosis, but this seems like it would be related to dry skin.
"""

response = model.generate_content(
    prompt,
    generation_config=dict(temperature=0),
)
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "Contextual Similarity Score: 0.6\n\nChain of thought:\n\nBoth sentences offer a potential diagnosis based on limited information (presumably about a skin condition).  The generated response is more specific (\"eczema\"), while the baseline response is more cautious and general (\"dry skin\").  However, both point towards a skin-related issue.  Eczema is a condition often associated with dry skin, so there's a degree of overlap.  The difference in certainty (definitive vs. tentative) reduces the similarity score from a perfect 1.  A score of 0.6 reflects the partial overlap in meaning while acknowledging the difference in specificity and confidence.\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
 

In [26]:
# Experiment: same eczema / dry-skin pair, but the instructions ask directly
# whether the diagnoses match and request only a similarity score in [0,1]
# (no chain of thought is requested in this variant).
prompt = """
Answer the question based on the context

Context: I will give you two sentences. The first sentence will be a generated response. The second sentence is the baseline response. 
Does the diagnosis in the generated response match the baseline response? Please provide a similarity score between [0,1] inclusive.

Generated Response: This sounds like eczema to me.
Baseline Response: Not 100% certain on the prognosis, but this seems like it would be related to dry skin.
"""

response = model.generate_content(
    prompt,
    generation_config=dict(temperature=0),
)
response


response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "The generated response and baseline response don't directly match.  While eczema is a condition related to dry skin, they aren't synonymous.  The generated response offers a specific diagnosis, while the baseline response suggests a possible underlying cause.  Therefore, the similarity score is low.\n\nSimilarity score: **0.3**\n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "avg_logprobs": -0.09079547426593837
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 101,
        "candidates_token_count": 67,
        "total_token_count": 168
      }
    }),
)