In [None]:
!pip install pandas nltk rouge-score scikit-learn

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=5a0546956584c4920be1fa061ddba8af708526181a3d26d96d9b74839fed5d8d
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import Counter
import zipfile
import os

# --- Custom Functions for Metrics Calculation ---

def calculate_exact_match(ground_truth, predicted):
    """Return True when both strings are identical once outer whitespace is removed.

    The comparison is case-sensitive and does not touch inner whitespace.
    """
    expected = ground_truth.strip()
    actual = predicted.strip()
    return expected == actual

def calculate_bleu(ground_truth, predicted):
    """Score a prediction with a simplified, unigram-only BLEU.

    Both texts are lower-cased and split on whitespace. The score is the
    clipped unigram precision of the prediction, multiplied by a brevity
    penalty of exp(1 - ref_len / cand_len) when the prediction is shorter
    than the reference. An empty prediction scores 0.0.
    """
    ref_tokens = ground_truth.lower().split()
    cand_tokens = predicted.lower().split()

    # Nothing to score (and nothing to divide by) for an empty prediction.
    if len(cand_tokens) == 0:
        return 0.0

    ref_freq = Counter(ref_tokens)
    cand_freq = Counter(cand_tokens)

    # Clipping: a candidate word is credited at most as often as it occurs
    # in the reference.
    matched = 0
    for token, occurrences in cand_freq.items():
        matched += min(occurrences, ref_freq.get(token, 0))
    precision = matched / len(cand_tokens)

    # Predictions at least as long as the reference are not penalised.
    if len(cand_tokens) >= len(ref_tokens):
        return 1.0 * precision

    shortness_penalty = np.exp(1 - len(ref_tokens) / len(cand_tokens))
    return shortness_penalty * precision


def calculate_rouge(ground_truth, predicted):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F1 scores without external libraries.

    Both texts are lower-cased and split on whitespace exactly once; all three
    scores are derived from those token lists.

    Args:
        ground_truth: Reference text.
        predicted: Candidate text to evaluate against the reference.

    Returns:
        dict with keys 'rouge1_f1', 'rouge2_f1' and 'rougel_f1', each a float
        in [0, 1]. A score is 0.0 when either side has no n-grams of the
        required size (e.g. an empty text, or a single word for ROUGE-2).
    """

    def get_ngrams(words, n):
        # Multiset of all contiguous n-token windows; empty if len(words) < n.
        return Counter(tuple(words[i:i + n]) for i in range(len(words) - n + 1))

    def f1_from(matches, pred_total, ref_total):
        # Harmonic mean of precision (matches / pred_total) and recall
        # (matches / ref_total), with every zero denominator mapped to 0.0.
        if matches == 0 or pred_total == 0 or ref_total == 0:
            return 0.0
        precision = matches / pred_total
        recall = matches / ref_total
        return 2 * (precision * recall) / (precision + recall)

    def calculate_f1(pred_ngrams, ref_ngrams):
        # Counter intersection keeps the minimum count per n-gram, i.e. the
        # clipped overlap between prediction and reference.
        overlap = sum((pred_ngrams & ref_ngrams).values())
        return f1_from(overlap, sum(pred_ngrams.values()), sum(ref_ngrams.values()))

    def lcs_length(pred_words, ref_words):
        # Longest-common-subsequence length via the standard DP recurrence,
        # keeping only the previous row instead of the full table.
        prev_row = [0] * (len(ref_words) + 1)
        for pred_word in pred_words:
            curr_row = [0]
            for j, ref_word in enumerate(ref_words):
                if pred_word == ref_word:
                    curr_row.append(prev_row[j] + 1)
                else:
                    curr_row.append(max(curr_row[j], prev_row[j + 1]))
            prev_row = curr_row
        return prev_row[-1]

    def calculate_rouge_l(pred_words, ref_words):
        return f1_from(lcs_length(pred_words, ref_words), len(pred_words), len(ref_words))

    ground_truth_words = ground_truth.lower().split()
    predicted_words = predicted.lower().split()

    # Prediction first, reference second, matching the helpers' parameter
    # names. F1 is symmetric so the scores are the same either way, but this
    # keeps precision/recall correctly labelled inside the helpers.
    rouge1_f1 = calculate_f1(get_ngrams(predicted_words, 1), get_ngrams(ground_truth_words, 1))
    rouge2_f1 = calculate_f1(get_ngrams(predicted_words, 2), get_ngrams(ground_truth_words, 2))
    rougel_f1 = calculate_rouge_l(predicted_words, ground_truth_words)

    return {
        'rouge1_f1': rouge1_f1,
        'rouge2_f1': rouge2_f1,
        'rougel_f1': rougel_f1
    }


def calculate_cosine_similarity(ground_truth_list, predicted_list):
    """
    Return the TF-IDF cosine similarity of each ground-truth/prediction pair.

    A single TF-IDF vocabulary is fitted on both lists together, each list is
    then vectorised with it, and entry i of the result compares
    ground_truth_list[i] with predicted_list[i].
    """
    corpus = ground_truth_list + predicted_list
    tfidf = TfidfVectorizer(token_pattern=r'\b\w+\b')
    tfidf.fit(corpus)

    truth_matrix = tfidf.transform(ground_truth_list)
    guess_matrix = tfidf.transform(predicted_list)

    # Compare row against matching row only; one-row slices keep the inputs
    # two-dimensional, and the 1x1 result is unwrapped to a scalar.
    return [
        cosine_similarity(truth_matrix[row:row + 1], guess_matrix[row:row + 1])[0][0]
        for row in range(len(ground_truth_list))
    ]

# --- 1. Load the data ---
# Reads the evaluation results into `df`. The rest of the notebook reads the
# columns 'class', 'subject', 'Ground Truth Response' and 'Predicted Response'.
try:
    df = pd.read_csv("final_results.csv")
except FileNotFoundError:
    print("\nError: 'final_results.csv' not found. Please ensure the file is in the same directory as the script.")
    # Re-raise rather than calling exit(): in a notebook exit() takes the
    # kernel down, whereas the exception simply halts execution here with a
    # traceback and leaves the session usable.
    raise
else:
    print("\nData loaded successfully.")

# --- 2. Apply the metrics and store results ---
print("Calculating per-row metrics...")

# Normalise both text columns once so every metric sees the same input.
# Empty CSV cells are read as NaN, which would make the string-based metric
# functions fail on .strip()/.lower(); they are treated as empty strings.
ground_truth_text = df['Ground Truth Response'].fillna('').astype(str)
predicted_text = df['Predicted Response'].fillna('').astype(str)
response_pairs = list(zip(ground_truth_text, predicted_text))

metrics_df = df[['class', 'subject']].copy()
metrics_df['Exact_Match'] = [
    int(calculate_exact_match(truth, guess)) for truth, guess in response_pairs
]
metrics_df['BLEU'] = [calculate_bleu(truth, guess) for truth, guess in response_pairs]

rouge_scores = [calculate_rouge(truth, guess) for truth, guess in response_pairs]
# Reuse df's index so the column-wise concat lines rows up by label.
rouge_df = pd.DataFrame(rouge_scores, index=df.index)
metrics_df = pd.concat([metrics_df, rouge_df], axis=1)

metrics_df['Cosine_Similarity'] = calculate_cosine_similarity(
    ground_truth_text.tolist(),
    predicted_text.tolist()
)

# --- 3. Calculate and save overall metrics CSVs based on user request ---
metrics_columns = ['Exact_Match', 'BLEU', 'rouge1_f1', 'rouge2_f1', 'rougel_f1', 'Cosine_Similarity']
csv_files_to_zip = []

# Mean of every metric per (class, subject) pair, with the group keys turned
# back into ordinary columns.
grouped_by_class_subject = metrics_df.groupby(['class', 'subject'])
class_subject_metrics = grouped_by_class_subject[metrics_columns].mean().reset_index()

# One CSV per class, indexed by subject.
print("\nSaving metrics for each class (subject-wise breakdown)...")
for cls in class_subject_metrics['class'].unique():
    rows_for_class = class_subject_metrics['class'] == cls
    class_df = class_subject_metrics[rows_for_class].set_index('subject')
    filename = f"{cls}_metrics.csv"
    class_df.to_csv(filename)
    csv_files_to_zip.append(filename)
    print(f"  Saved: {filename}")

# One CSV with the per-subject means across all classes.
print("\nSaving overall metrics by subject...")
output_filename_subject = "overall_subject_metrics.csv"
subject_metrics = metrics_df.groupby('subject')[metrics_columns].mean()
subject_metrics.to_csv(output_filename_subject)
csv_files_to_zip.append(output_filename_subject)
print(f"  Saved: {output_filename_subject}")

print("\nAll requested aggregated metrics files have been generated.")

# --- 4. Zip the generated CSV files ---
# Bundle every CSV recorded in csv_files_to_zip into one deflate-compressed
# archive, stored under its bare file name.
zip_filename = "class_subject_metrics.zip"
print(f"\nZipping all generated CSV files into '{zip_filename}'...")
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
    for file in csv_files_to_zip:
        if not os.path.exists(file):
            # A missing file is reported and skipped rather than aborting the archive.
            print(f"Warning: File not found and skipped during zipping: {file}")
            continue
        zf.write(file, arcname=os.path.basename(file))
print(f"\nAll generated CSV files have been zipped into '{zip_filename}'.")

# --- 5. Display previews of the results ---
print("\n--- Example: Metrics for Class 1 (Subject-wise) ---")
# Only render the Class1 table when the data actually contains that class.
is_class1 = class_subject_metrics['class'] == 'Class1'
if is_class1.any():
    class1_metrics = class_subject_metrics[is_class1].set_index('subject')
    class1_table = class1_metrics.to_markdown(numalign="left", stralign="left", floatfmt=".2f")
    print(class1_table)
else:
    print("Class1 data not available in the input file for preview.")

print("\n--- Overall Metrics by Subject ---")
subject_table = subject_metrics.to_markdown(numalign="left", stralign="left", floatfmt=".2f")
print(subject_table)



Data loaded successfully.
Calculating per-row metrics...

Saving metrics for each class (subject-wise breakdown)...
  Saved: Class1_metrics.csv
  Saved: Class10_metrics.csv
  Saved: Class2_metrics.csv
  Saved: Class3_metrics.csv
  Saved: Class4_metrics.csv
  Saved: Class5_metrics.csv
  Saved: Class6_metrics.csv
  Saved: Class7_metrics.csv
  Saved: Class8_metrics.csv
  Saved: Class9_metrics.csv

Saving overall metrics by subject...
  Saved: overall_subject_metrics.csv

All requested aggregated metrics files have been generated.

Zipping all generated CSV files into 'class_subject_metrics.zip'...

All generated CSV files have been zipped into 'class_subject_metrics.zip'.

--- Example: Metrics for Class 1 (Subject-wise) ---
| subject   | class   | Exact_Match   | BLEU   | rouge1_f1   | rouge2_f1   | rougel_f1   | Cosine_Similarity   |
|:----------|:--------|:--------------|:-------|:------------|:------------|:------------|:--------------------|
| english   | Class1  | 0.40          | 0.