<a href="https://colab.research.google.com/github/jordan-dsouza/Fynd_Assignment/blob/main/.../Task1/Fynd_AI_Intern_Task1_JordanDSouza.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing dependencies:

In [None]:
!pip install transformers torch tqdm --quiet

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from sklearn.metrics import accuracy_score

# Downloading dataset via Kaggle API:

In [None]:
# Upload the Kaggle API token (kaggle.json) through Colab's file picker.
from google.colab import files
files.upload()  # Choose the kaggle.json file; it is saved in the working directory

# Create the Kaggle config directory and copy the token into it.
# Note: cp (not mv) leaves the uploaded kaggle.json in the working directory too.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset archive (yelp-reviews-dataset.zip) into the working directory.
!kaggle datasets download -d "omkarsabnis/yelp-reviews-dataset"

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/omkarsabnis/yelp-reviews-dataset
License(s): other
Downloading yelp-reviews-dataset.zip to /content
  0% 0.00/3.49M [00:00<?, ?B/s]
100% 3.49M/3.49M [00:00<00:00, 455MB/s]


In [None]:
# Unzip the dataset
!unzip yelp-reviews-dataset.zip

Archive:  yelp-reviews-dataset.zip
  inflating: yelp.csv                


In [None]:
# Read the full CSV, keep a reproducible 200-row sample with a fresh 0..199
# index, and expose the review body under the column name "review_text".
df = (
    pd.read_csv("/content/yelp.csv")
    .sample(200, random_state=42)
    .reset_index(drop=True)
    .rename(columns={"text": "review_text", "stars": "stars"})
)
df.head()

Unnamed: 0,business_id,date,review_id,stars,review_text,type,user_id,cool,useful,funny
0,QVR7dsvBeg8xFt9B-vd1BA,2010-07-22,hwYVJs8Ko4PMjI19QcR57g,4,We got here around midnight last Friday... the...,review,90a6z--_CUrl84aCzZyPsg,5,5,2
1,24qSrF_XOrvaHDBy-gLIQg,2012-01-22,0mvthYPKb2ZmKhCADiKSmQ,5,Brought a friend from Louisiana here. She say...,review,9lJAj_2zCvP2jcEiRjF9oA,0,0,0
2,j0Uc-GuOe-x9_N_IK1KPpA,2009-05-09,XJHknNIecha6h0wkBSZB4w,3,"Every friday, my dad and I eat here. We order ...",review,0VfJi9Au0rVFVnPKcJpt3Q,0,0,0
3,RBiiGw8c7j-0a8nk35JO3w,2010-12-22,z6y3GRpYDqTznVe-0dn--Q,1,"My husband and I were really, really disappoin...",review,lwppVF0Yqkuwt-xaEuugqw,2,2,2
4,U8VA-RW6LYOhxR-Ygi6eDw,2011-01-17,vhWHdemMvsqVNv5zi2OMiA,5,Love this place! Was in phoenix 3 weeks for w...,review,Y2R_tlSk4lTHiLXTDsn1rg,0,1,0


# HuggingFace Model:

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint id shared by the tokenizer and the seq2seq model.
model_name = "google/flan-t5-small"

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

def hf_call(prompt):
    """Generate a completion for ``prompt`` with the module-level model.

    The prompt is tokenized (inputs longer than 512 tokens are truncated),
    moved to ``device``, and passed to ``model.generate`` with at most 60 new
    tokens.

    Args:
        prompt: Prompt text to send to the model.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(device)
    generated = model.generate(**encoded, max_new_tokens=60)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Prompts:

## Zero-Shot Prompt:
 * A zero-shot prompt is a direct instruction or question given to a large language model (LLM) without providing any prior examples of the task.
 * Purpose: Baseline performance
 * WHY:
 >* Tests raw LLM capability
 >* Often gives less consistent ratings and more JSON formatting errors

In [None]:
def prompt_v1(review):
    """Build the zero-shot prompt for one review.

    Args:
        review: Review text; it is inserted between double quotes.

    Returns:
        The prompt string asking for a 1-5 star rating returned as JSON.
    """
    prompt = f"""
You are an assistant that classifies Yelp reviews.

Given the review below, predict a star rating from 1 to 5.

Return only JSON in this exact format:

{{"predicted_stars": <1-5>, "explanation": "<short reasoning>"}}
Review:
"{review}"

Output format:
{{
  "predicted_stars": 4,
  "explanation": "Brief reasoning."
}}
"""
    return prompt


## Rubric-Guided Prompt:
A rubric-guided prompt is a prompt engineering technique in which explicit evaluation criteria (a "rubric") are included within the instructions given to a large language model (LLM).
* Improvements: Adds explicit criteria.
* WHY:
>* Improves rating alignment
>* Reduces ambiguity

In [None]:
def prompt_v2(review):
    """Build the rubric-guided prompt for one review.

    The prompt lists a one-line description for each star level before the
    review text.

    Args:
        review: Review text; it is inserted between double quotes.

    Returns:
        The prompt string asking for a 1-5 star rating returned as JSON.
    """
    prompt = f"""
You are a sentiment analysis expert.

Classify the review using this rubric:
1 star: Very negative, strong dissatisfaction
2 stars: Mostly negative
3 stars: Mixed or neutral
4 stars: Mostly positive, minor complaints
5 stars: Very positive, strong praise

Return only JSON in this exact format:

{{"predicted_stars": <1-5>, "explanation": "<short reasoning>"}}
Review:
"{review}"

Return ONLY valid JSON:
{{
  "predicted_stars": 1-5,
  "explanation": "Short reasoning"
}}
"""
    return prompt

## Reasoned Prompt:
A reasoned prompt is a specialized instruction for AI that encourages step-by-step logical thinking (reasoning) rather than just pattern matching.
* Improvement: Better reasoning consistency
* WHY:
>* Encourages internal reasoning
>* Often improves accuracy without verbose output

In [None]:
def prompt_v3(review):
    """Build the reasoned prompt for one review.

    The prompt asks the model to analyze sentiment step by step without
    showing the steps, then to answer in JSON.

    Args:
        review: Review text; it is inserted between double quotes.

    Returns:
        The prompt string asking for a 1-5 star rating returned as JSON.
    """
    prompt = f"""
Analyze the sentiment step by step, then assign a rating.
Do not show your steps.


Return only JSON in this exact format:

{{"predicted_stars": <1-5>, "explanation": "<short reasoning>"}}

Review:
"{review}"

Return JSON only:
{{
  "predicted_stars": 1-5,
  "explanation": "Concise explanation"
}}
"""
    return prompt


In [None]:
# Prompt builders in evaluation order: zero-shot, rubric-guided, reasoned.
prompts = [
    prompt_v1,
    prompt_v2,
    prompt_v3,
]

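# JSON Parsing:
Parse the raw model output into a dictionary containing `predicted_stars`. Strict JSON is tried first; if the output is not valid JSON, a rating digit from 1 to 5 is pulled out of the raw text instead.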

In [None]:
def parse_json(text):
    """Extract a 1-5 star prediction from raw model output.

    Strict JSON is tried first. A JSON object is judged on its own
    "predicted_stars" field. Any other output (plain text, or JSON that is not
    an object) falls back to the first standalone digit 1-5 in the text.

    Args:
        text: Raw string produced by the model.

    Returns:
        A ``(data, valid)`` tuple. ``valid`` is True only when ``data`` is a
        dict whose "predicted_stars" is an int in 1..5. When the output is a
        JSON object without a usable rating, that object is returned with
        ``valid=False``. When nothing can be extracted, ``({}, False)``.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    if not isinstance(text, str):
        return {}, False

    try:
        data = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        data = None

    if isinstance(data, dict):
        raw = data.get("predicted_stars")
        try:
            # bool is an int subclass; a JSON true/false is not a rating.
            stars = None if isinstance(raw, bool) else int(raw)
        except (TypeError, ValueError, OverflowError):
            stars = None
        if stars is not None and 1 <= stars <= 5:
            # Normalize the rating to int so both code paths return the same type.
            return {**data, "predicted_stars": stars}, True
        return data, False

    # Text such as "3 stars" or a bare "4": take a digit 1-5 that is not part
    # of a longer number (so "10/10" is not read as a 1-star rating).
    match = re.search(r'(?<!\d)([1-5])(?!\d)', text)
    if match:
        return {"predicted_stars": int(match.group(1)), "explanation": text}, True
    return {}, False


# Evaluation:

In [None]:
def evaluate_prompt(prompt_fn, df):
    """Run one prompt template over every review and score the predictions.

    Args:
        prompt_fn: Callable mapping a review string to a prompt string.
        df: DataFrame with a "review_text" column (model input) and a "stars"
            column (ground-truth rating).

    Returns:
        ``(accuracy, json_valid_rate, consistency_rate)``:

        * accuracy -- exact-match accuracy over only the rows that produced a
          usable 1-5 prediction. Rows without one are excluded rather than
          counted as wrong. 0.0 when no row qualifies.
        * json_valid_rate -- fraction of rows with a usable prediction.
        * consistency_rate -- placeholder: every row is recorded as
          consistent, so this is 1.0 for any non-empty ``df``.
    """
    preds, valids, consistency = [], [], []

    # Only the review text is needed, so iterate that column directly.
    for review in tqdm(df["review_text"], total=len(df)):
        output = hf_call(prompt_fn(review))
        data, valid = parse_json(output)

        stars = 0  # 0 marks "no usable prediction"
        if valid:
            try:
                stars = int(data.get("predicted_stars", 0))
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Flagged valid but the rating is not numeric (or data is not
                # a dict): treat the row as unparseable instead of crashing.
                stars = 0
            if not 1 <= stars <= 5:
                stars = 0

        valids.append(stars != 0)
        preds.append(stars)
        consistency.append(True)

    df_eval = df.copy()
    df_eval["predicted"] = preds

    # Score only the rows that produced a prediction.
    df_eval = df_eval[df_eval["predicted"] != 0]

    accuracy = accuracy_score(df_eval["stars"], df_eval["predicted"]) if not df_eval.empty else 0.0
    json_valid_rate = np.mean(valids) if valids else 0.0
    consistency_rate = np.mean(consistency) if consistency else 0.0

    return accuracy, json_valid_rate, consistency_rate


# Run Prompts:
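* Metrics: Accuracy (computed only over reviews where a rating could be parsed), Validity (share of outputs that yielded a rating), and Consistency (currently a placeholder that is always 1.0)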

In [None]:
# One row per prompt: [position in `prompts`, function name, accuracy, validity, consistency].
results = [
    [version, fn.__name__, *evaluate_prompt(fn, df)]
    for version, fn in enumerate(prompts)
]

100%|██████████| 200/200 [01:06<00:00,  3.02it/s]
100%|██████████| 200/200 [01:13<00:00,  2.73it/s]
100%|██████████| 200/200 [01:04<00:00,  3.09it/s]


# Result Table:

In [None]:
# Column labels follow the order of the values stored in each `results` row.
result_columns = ["Prompt Version", "Name", "Accuracy", "JSON Validity Rate", "Consistency"]
results_df = pd.DataFrame(results, columns=result_columns)
results_df

Unnamed: 0,Prompt Version,Name,Accuracy,JSON Validity Rate,Consistency
0,0,prompt_v1,0.603175,0.315,1.0
1,1,prompt_v2,0.427136,0.995,1.0
2,2,prompt_v3,0.0,0.0,1.0


# Quick Tests:

In [None]:
# Smoke-test prompt_v1 end to end on the first sampled review.
test_prompt = prompt_v1(df["review_text"].iloc[0])
raw_output = hf_call(test_prompt)
data, valid = parse_json(raw_output)
print(data, valid)


{'predicted_stars': 4, 'explanation': '4'} True


In [None]:
# Smoke-test prompt_v2 end to end on the first sampled review.
test_prompt = prompt_v2(df["review_text"].iloc[0])
raw_output = hf_call(test_prompt)
data, valid = parse_json(raw_output)
print(data, valid)


{'predicted_stars': 4, 'explanation': '4'} True


In [None]:
# Smoke-test prompt_v3 end to end on the first sampled review.
test_prompt = prompt_v3(df["review_text"].iloc[0])
raw_output = hf_call(test_prompt)
data, valid = parse_json(raw_output)
print(data, valid)


{} False
