# Week 8 — Multimodal LLM with Metrics

**Tracks Completed:** A (Speech) and B (Visualization)


### Setup

In [None]:

!pip install -q SpeechRecognition gTTS transformers torch pandas matplotlib sentence-transformers jiwer

import os, time, json, shutil
import pandas as pd
import numpy as np
import speech_recognition as sr

from gtts import gTTS
from IPython.display import Audio, display
import torch, matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer, util
from jiwer import wer



### Model Loading & Helper Functions

In [None]:

# Use the GPU when torch can see one; otherwise run on the CPU.
has_cuda = torch.cuda.is_available()
device = "cuda" if has_cuda else "cpu"
print("Running on:", device)

# Load TinyLlama: tokenizer first, then the causal-LM weights.
# Half precision is requested only on GPU; the CPU path keeps float32.
checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    torch_dtype=torch.float16 if has_cuda else torch.float32,
    low_cpu_mem_usage=True,
)
model = model.to(device)


def run_llm(prompt, max_new_tokens=100, temperature=0.2):
    """
    Generate an answer with TinyLlama from a single-turn chat prompt.

    Args:
        prompt: Text placed in the ``<|user|>`` turn of the template.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        The model's reply as a stripped string, without the prompt text.

    Note:
        Sampling is enabled (``do_sample=True``), so repeated calls with the
        same prompt may return different replies.
    """
    template = f"<|user|>\n{prompt}\n<|assistant|>\n"
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which generate() needs to be unambiguous when pad == eos.
    encoded = tokenizer(template, return_tensors="pt").to(device)
    prompt_len = encoded["input_ids"].shape[-1]

    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens; drop them by
    # position rather than by searching the decoded text for a marker, so a
    # prompt that itself contains "<|assistant|>" cannot confuse the split.
    new_tokens = outputs[0][prompt_len:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # If the model emitted another assistant marker, keep only the text after
    # the last one (same rule the marker-based split applied before).
    if "<|assistant|>" in response:
        response = response.split("<|assistant|>")[-1].strip()
    return response



# --- Speech helper functions ---
def speech_to_text(audio_path):
    """Transcribe the audio file at `audio_path` and return the recognized text.

    The file is opened as an `sr.AudioFile`, read with `Recognizer.record`,
    and the recording is passed to `Recognizer.recognize_google`. Any
    exception raised by the recognizer propagates to the caller.
    """
    recognizer = sr.Recognizer()
    source = sr.AudioFile(audio_path)
    with source as opened:
        recording = recognizer.record(opened)
    transcript = recognizer.recognize_google(recording)
    return transcript


def text_to_speech(text, out_path="audio_outputs/reply.mp3"):
    """Synthesize `text` with gTTS and save it to `out_path`.

    Args:
        text: Text to speak.
        out_path: Destination file for the generated audio.

    Returns:
        `out_path`, unchanged, so callers can chain it into playback.
    """
    # Create the directory that will actually receive the file, not a
    # hard-coded one, so custom `out_path` values outside audio_outputs/ work.
    out_dir = os.path.dirname(out_path)
    if out_dir:  # a bare filename has no directory component to create
        os.makedirs(out_dir, exist_ok=True)
    gTTS(text).save(out_path)
    return out_path




### Baseline Evaluation Cell

In [None]:


import os, time, pandas as pd, numpy as np
from sentence_transformers import SentenceTransformer, util

# --- Load data ---
df = pd.read_csv("text_inputs.csv")  # must have 'question' and 'gold_answers' columns
semantic_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

results = []

# --- Loop through each question ---
for i, row in df.iterrows():
    q = row["question"]
    gold = row["gold_answers"]

    print(f"\nQ{i+1}: {q}")
    # perf_counter is a monotonic clock, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    reply = run_llm(q)
    latency = round(time.perf_counter() - start, 2)

    # Semantic accuracy: cosine similarity between the gold answer and the
    # model reply in sentence-embedding space.
    sim = util.cos_sim(
        semantic_model.encode(gold, convert_to_tensor=True),
        semantic_model.encode(reply, convert_to_tensor=True)
    ).item()
    semantic_accuracy = round(sim, 3)

    # Same column set as the speech/visualization variants so the tables can
    # be concatenated later; metrics that do not apply here are "N/A".
    record = {
        "Variant": "Baseline_TextOnly",
        "Question": q,
        "Gold Answer": gold,
        "Reply": reply,
        "Latency (s)": latency,
        "Accuracy": semantic_accuracy,
        "Visualization Quality": "N/A",
        "Speech Accuracy": "N/A",
        "Notes": "Text-only TinyLlama reasoning"
    }

    results.append(record)
    print(f"A: {reply}")

# --- Save final output ---
# Build the frame once and reuse it for both the CSV and the summary.
df_text = pd.DataFrame(results)
os.makedirs('text_outputs', exist_ok=True)
output_path = "text_outputs/baseline_full_metrics.csv"
df_text.to_csv(output_path, index=False)


# --- Summary view ---
avg_latency = round(df_text["Latency (s)"].mean(), 2)
avg_sem_acc = round(df_text["Accuracy"].mean(), 2)
print(f"\nAverage Latency: {avg_latency}s")
print(f"Average Accuracy: {avg_sem_acc}")




### Speech Evaluation Cell (Track A)

In [None]:


# --- Load reference text and answers (baseline CSV) ---
ref_df = pd.read_csv("text_inputs.csv")  # must contain 'question' and 'gold_answers'
questions = ref_df["question"].tolist()
gold_answers = ref_df["gold_answers"].tolist()

# --- Load semantic model (named to avoid clashing with the LLM `model`) ---
semantic_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

speech_results = []

# The metrics CSV is written here even when no reply audio gets synthesized,
# so make sure the folder exists up front.
os.makedirs("audio_outputs", exist_ok=True)

# --- Iterate through your audio folder ---
# Filter to .wav files BEFORE enumerating: idx must count audio files only,
# otherwise any other entry in the folder shifts every later audio file onto
# the wrong question / gold answer.
wav_files = [name for name in sorted(os.listdir("audio_inputs")) if name.endswith(".wav")]

for idx, audio_file in enumerate(wav_files):
    # Audio is paired with CSV rows by position; an extra file has no
    # reference to score against, so skip it instead of raising IndexError.
    if idx >= len(questions):
        print(f"\nSkipping {audio_file}: no matching row in text_inputs.csv")
        continue

    path = f"audio_inputs/{audio_file}"
    print(f"\nProcessing: {audio_file}")

    # Speech → Text (transcription)
    # Latency covers transcription + LLM reasoning; perf_counter is monotonic.
    start = time.perf_counter()
    try:
        transcription = speech_to_text(path)
    except Exception as e:
        print("Speech recognition failed:", e)
        continue

    # LLM reasoning
    reply = run_llm(transcription)
    latency = round(time.perf_counter() - start, 2)

    # Text → Speech (save spoken reply)
    output_file = f"audio_outputs/reply_{os.path.splitext(audio_file)[0]}.mp3"
    text_to_speech(reply, out_path=output_file)

    print("Reply:", reply)
    display(Audio(output_file, autoplay=False))

    # Speech accuracy: 1 − WER between the expected question and the transcript.
    # Assumes the sorted .wav order matches the row order of text_inputs.csv.
    ref_text = questions[idx]
    speech_accuracy = round(1 - wer(ref_text.lower(), transcription.lower()), 3)

    # Semantic accuracy: cosine similarity of LLM output vs gold answer.
    gold = gold_answers[idx]
    sim = util.cos_sim(
        semantic_model.encode(gold, convert_to_tensor=True),
        semantic_model.encode(reply, convert_to_tensor=True)
    ).item()
    semantic_acc = round(sim, 3)

    # Store metrics
    record = {
        "Variant": "With_Speech",
        "Question": ref_text,
        "Gold Answer": gold,
        "Reply": reply,
        "Latency (s)": latency,
        "Accuracy": semantic_acc,
        "Visualization Quality": "N/A",
        "Speech Accuracy": speech_accuracy,
        "Notes": "Speech→STT→TinyLlama→TTS pipeline"
    }
    speech_results.append(record)

# --- Save full metrics ---
df_speech = pd.DataFrame(speech_results)
speech_path = "audio_outputs/audio_full_metrics.csv"
df_speech.to_csv(speech_path, index=False)


# --- Summary view ---
if df_speech.empty:
    # No file was transcribed successfully: the frame has no columns, so the
    # column lookups below would raise KeyError.
    print("\nNo audio files were processed; skipping speech summary.")
else:
    avg_latency = round(df_speech["Latency (s)"].mean(), 2)
    avg_sem_acc = round(df_speech["Accuracy"].mean(), 2)
    avg_speech_acc = round(df_speech["Speech Accuracy"].mean(), 2)
    print(f"\nAverage Speech Latency: {avg_latency}s")
    print(f"Average Semantic Accuracy: {avg_sem_acc}")
    print(f"Average Speech Accuracy (STT): {avg_speech_acc}")



### Visualization Evaluation Cell (Track B)

In [None]:


# Folder that receives the rendered charts.
os.makedirs("visual_outputs", exist_ok=True)

# --- Step 1: Load dataset (must have game_number, accuracy, loss, moves_analyzed)
df = pd.read_csv("visual_inputs.csv")

# Show the first three rows, framed by blank lines in the cell output.
print('\nView Dataset')
preview = df.head(3)
display(preview)
print('\n\n')


# --- Step 2: NL → Plot Specification ---
def nl_to_plot_spec(query):
    """Translate a natural-language chart request into a plot specification.

    Matching is by case-insensitive keyword search, checked in this order:
    "accuracy" and "loss" together, "accuracy", "loss", "moves". A query
    matching none of them falls back to the accuracy chart.

    Args:
        query: Free-text request, e.g. "Plot accuracy by game number".

    Returns:
        A dict with key "x" (column name) and key "y" (a column name, or a
        list of column names when several series are requested).
    """
    q = query.lower()
    wants_accuracy = "accuracy" in q
    wants_loss = "loss" in q

    if wants_accuracy and wants_loss:
        return {"x": "game_number", "y": ["accuracy", "loss"]}
    if wants_accuracy:
        return {"x": "game_number", "y": "accuracy"}
    if wants_loss:
        return {"x": "game_number", "y": "loss"}
    # A second, identical "moves" test used to follow this one; it could
    # never be reached, so it is dropped without changing any result.
    if "moves" in q:
        return {"x": "game_number", "y": "moves_analyzed"}
    return {"x": "game_number", "y": "accuracy"}


# --- Step 3: Plotting function ---
def plot_from_spec(spec, title_suffix=""):
    """Render a line chart for `spec` from the global `df` and save it as PNG.

    Args:
        spec: Dict with "x" (column name) and "y" (a column name, or a list
            of column names to draw as separate labelled lines).
        title_suffix: Optional text appended to the chart title (and thereby
            to the file name).

    Returns:
        Path of the saved image under ``visual_outputs/``. The file name is
        the lower-cased title with spaces replaced by underscores.
    """
    multi_series = isinstance(spec["y"], list)
    y_keys = spec["y"] if multi_series else [spec["y"]]

    def pretty(column):
        # "game_number" -> "Game Number"; one rule for axes, legend and title.
        return column.replace("_", " ").title()

    x_label = pretty(spec["x"])
    y_label = ", ".join(pretty(key) for key in y_keys)

    # Explicit figure/axes objects instead of the implicit pyplot state, so
    # this call cannot draw onto (or close) a figure it does not own.
    fig, ax = plt.subplots(figsize=(6, 4))
    for key in y_keys:
        if multi_series:
            ax.plot(df[spec["x"]], df[key], label=pretty(key))
        else:
            ax.plot(df[spec["x"]], df[key])
    if multi_series:
        ax.legend()

    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

    # strip() removes the trailing space left by an empty suffix, which would
    # otherwise become a dangling "_" at the end of the file name.
    title = f"{y_label} vs {x_label} {title_suffix}".strip()
    ax.set_title(title)

    os.makedirs("visual_outputs", exist_ok=True)
    path = f"visual_outputs/{title.replace(' ', '_').lower()}.png"
    fig.savefig(path)
    plt.close(fig)  # free the figure; many charts may be generated in a loop
    return path

# --- Step 4: Visualization accuracy (keyword overlap) ---
def viz_accuracy(query, spec):
    """Score how well the plotted y-columns match the words of the query.

    A y-column counts as a hit when its name, or any underscore-separated
    word of its name, occurs in the lower-cased query. Without the per-word
    check a column such as "moves_analyzed" could never match a natural
    language request like "show the moves per game".

    Args:
        query: The natural-language request.
        spec: Plot spec whose "y" is a column name or a list of column names.

    Returns:
        Fraction of y-columns mentioned in the query, rounded to 2 decimals
        (0.0 when the spec lists no y-columns).
    """
    q = query.lower()
    y_cols = spec["y"] if isinstance(spec["y"], list) else [spec["y"]]
    if not y_cols:
        # Nothing was plotted, so nothing can match; avoids dividing by zero.
        return 0.0

    def mentioned(column):
        name = column.lower()
        return name in q or any(word in q for word in name.split("_") if word)

    hits = sum(1 for column in y_cols if mentioned(column))
    return round(hits / len(y_cols), 2)

# --- Step 5: Visualization quality metric ---
def viz_quality(acc):
    """Map a visualization-accuracy score to a qualitative label.

    Thresholds are checked from highest to lowest and the first one that
    `acc` reaches decides the label; anything below 0.5 is "Poor".
    """
    grade_floors = (
        (0.9, "Very Good"),
        (0.75, "Good"),
        (0.5, "Fair"),
    )
    for floor, label in grade_floors:
        if acc >= floor:
            return label
    return "Poor"

# --- Step 6: Natural-language queries ---
viz_queries = [
    "Plot move-prediction accuracy by game number",
    "Show how loss throughout the game",
    "Graph the accuracy and loss throughout the game"
]

# --- Step 7: Run all queries and store full metrics ---
results = []
for q in viz_queries:
    # Latency covers spec parsing + chart rendering. perf_counter is a
    # monotonic clock, so the short interval cannot be skewed by system
    # clock adjustments the way time.time() can.
    start = time.perf_counter()
    spec = nl_to_plot_spec(q)
    path = plot_from_spec(spec)
    latency = round(time.perf_counter() - start, 2)

    # Scoring happens outside the timed region.
    acc = viz_accuracy(q, spec)
    quality = viz_quality(acc)

    # Same column set as the text/speech variants so the tables can be
    # concatenated later; fields that do not apply here are "N/A".
    record = {
        "Variant": "With_Visualization",
        "Question": q,
        "Gold Answer": "N/A",
        "Reply": "N/A",
        "Latency (s)": latency,
        "Accuracy": acc,
        "Visualization Quality": quality,
        "Speech Accuracy": "N/A",
        "Notes": "NL→Plot Spec → Matplotlib Chart",
    }
    results.append(record)
    print(f"\nQuery: {q}\n  * Spec: {spec}\n  * Chart saved: {path}")


# --- Step 8: Save summary CSV ---
df_visual = pd.DataFrame(results)
viz_path = "visual_outputs/visualization_full_metrics.csv"
df_visual.to_csv(viz_path, index=False)


# --- Step 9: Summary view ---
avg_latency = round(df_visual["Latency (s)"].mean(), 2)
avg_acc = round(df_visual["Accuracy"].mean(), 2)
print(f"\n\n\nAverage Visualization Latency: {avg_latency}s")
print(f"Average Visualization Accuracy: {avg_acc}\n\n")



### Build Ablation Table



In [None]:

# Stack the per-variant metric tables (text, speech, visualization) into one
# frame with a fresh 0..n-1 index.
variant_frames = [df_text, df_speech, df_visual]
df_full = pd.concat(variant_frames, ignore_index=True)

# Persist the combined table, then leave it as the cell's displayed value.
ablation = pd.DataFrame(df_full)
ablation.to_csv("week8_ablation_results.csv", index=False)
ablation
