# Final Project Codebook - Evaluation

Aditya Kumar, Matthew Shull and Irina Lee

In [None]:
# Importing required libraries
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import torch
import json
import evaluate
from rouge_score import rouge_scorer

# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import for GPT-4 (pinned to the legacy 0.28 client, which provides openai.ChatCompletion)
!pip uninstall -q -y openai
!pip install -q openai==0.28.0
import openai
openai.api_key = "INSERT KEY HERE"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Defining the CSV paths
csv_files = [
    "/content/drive/My Drive/W266_Final Project/clean_batch_1_output.csv",
    "/content/drive/My Drive/W266_Final Project/clean_batch_2_output.csv",
    "/content/drive/My Drive/W266_Final Project/clean_batch_3_output.csv",
    "/content/drive/My Drive/W266_Final Project/clean_batch_4_output.csv"
]

# Read each batch CSV into its own DataFrame
dataframes = [pd.read_csv(file) for file in csv_files]

# Combine all DataFrames into one with a fresh 0..N-1 index.
# The result is bound to `combined_df` because that is the name the following
# cells read (property display, summary cleaning, and the later
# `sampled_df = combined_df.copy()`); binding it to any other name makes a
# fresh-kernel "Run All" fail with a NameError.
combined_df = pd.concat(dataframes, ignore_index=True)

# Save the combined DataFrame as a new CSV file (optional)
output_path = "/content/drive/My Drive/W266_Final Project/combined_batches.csv"
combined_df.to_csv(output_path, index=False)

print("Combined DataFrame created!")

Combined DataFrame created!


In [None]:
# Display dataframe properties: column index, per-column null counts,
# total number of cells, and (rows, columns) shape, separated by blank lines.
frame_properties = [
    ("Columns in combined_df:", combined_df.columns),
    ("Missing values:\n", combined_df.isnull().sum()),
    ("Combined Size: ", combined_df.size),
    ("Combined Shape: ", combined_df.shape),
]
for position, (label, value) in enumerate(frame_properties):
    if position:
        # Blank separator line between consecutive properties
        print()
    print(label, value)

Columns in combined_df: Index(['article_cleaned', 'highlights_cleaned', 'llama_summary'], dtype='object')

Missing values:
 article_cleaned       0
highlights_cleaned    0
llama_summary         0
dtype: int64

Combined Size:  6000

Combined Shape:  (2000, 3)


In [None]:
# Preview the first five rows of the combined frame
combined_df.head(n=5)

Unnamed: 0,article_cleaned,highlights_cleaned,llama_summary
0,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,Here is a 4-5 line summary of the content:\n\n...
1,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...,Here is a 4-5 line summary of the article:\n\n...
2,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,Here is a concise summary of the content in 4-...
3,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey . Most ...,"Approximately 250 Syrian citizens, mostly ethn..."
4,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,Here is a 4-5 line summary of the article:\n\n...


In [None]:
def clean_summary(summary):
    """
    Remove "Here is a ...:" lead-in phrases from a generated summary.

    Args:
        summary (str): Raw summary text.

    Returns:
        str: The text with every "Here is a<rest of the phrase>:" span (and
        any whitespace right after the colon) deleted, then stripped of
        leading and trailing whitespace.
    """
    # The pattern is not anchored, so a match anywhere in the text is removed.
    # "." does not match a newline, so the phrase and its colon must sit on
    # one line; the trailing \s* also consumes the blank lines that follow.
    intro_pattern = re.compile(r"Here is a.*?:\s*")
    without_intro = intro_pattern.sub("", summary)
    return without_intro.strip()

# Strip the lead-in phrase from every generated summary and overwrite the column
cleaned_summaries = combined_df["llama_summary"].apply(clean_summary)
combined_df["llama_summary"] = cleaned_summaries

print("Introductory lines removed from summaries.")

Introductory lines removed from summaries.


In [None]:
# Spot-check ten randomly chosen cleaned summaries (no seed is set, so the rows differ per run)
combined_df.loc[:, ['llama_summary']].sample(n=10)

Unnamed: 0,llama_summary
1726,"The lawyer representing Ethan Couch, a 16-year..."
850,A Chinese submarine accidentally damaged a US ...
1933,"A Texas man, George Rodriguez, who served 17 y..."
819,Covert scanners disguised as queue barriers wi...
464,"A British serviceman, Senior Aircraftsman Kini..."
1537,"Cassandra Morton, a 23-year-old black woman, w..."
893,"Daniel Parmertor, a 16-year-old student, was l..."
1547,"Roger Federer, the defending champion of the A..."
1026,"A 12-year-old boy, Tamir Rice, was shot and ki..."
1850,US President Barack Obama is hosting a summit ...


# Evaluation

## STS Score through Prompt and GPT-4

## Updated Prompt

In [None]:
# Set API key
# NOTE: this repeats the assignment made in the setup cell right after
# `import openai`. The string is a placeholder; replace it with a real key
# before running, and do not leave a real key in a shared copy of the notebook.
openai.api_key = "INSERT KEY HERE"

# Rubric categories the model must score; each one is expected under result["scores"].
_RUBRIC_CATEGORIES = (
    "coverage",
    "coherence_and_flow",
    "faithfulness",
    "paraphrasing",
    "language_and_style",
)


def _parse_json_reply(raw_text):
    """Parse the model reply as JSON, tolerating a Markdown fence or prose around the object."""
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        # Chat models sometimes wrap the object in ```json ... ``` or add a
        # sentence around it; retry on the outermost {...} span only.
        start, end = raw_text.find("{"), raw_text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(raw_text[start:end + 1])


def compute_sts_gpt4_client(reference, prediction):
    """
    Score a generated summary against a reference summary with an OpenAI chat model.

    The model ("gpt-4o-mini", temperature 0) is asked to grade five rubric
    categories from 1 to 5, combine them into a weighted score out of 100,
    and reply with a single JSON object.

    Args:
        reference (str): The ground truth summary.
        prediction (str): The generated summary to evaluate.

    Returns:
        tuple: (sts_score, result). `result` is the parsed JSON dict and
        `sts_score` is result["final_semantic_score"] divided by 20, i.e. the
        0-100 score rescaled to 0-5. Returns (None, None) when the API call
        fails, the reply cannot be parsed as JSON, or an expected key is
        missing; the error is printed instead of raised so that a run over
        many rows keeps going.
    """
    prompt = f"""
You are an expert evaluator. Your task is to evaluate the generated summary against the reference summary
using the following simplified rubric and return the results in JSON.

### Criteria and Weights:
1. Coverage (30%)
2. Coherence and Flow (20%)
3. Faithfulness (30%)
4. Paraphrasing (15%)
5. Language and Style (5%)

For each category, assign a score from 1 to 5. Then, weight them according to their percentages and produce a final semantic score out of 100.

Finally, provide brief feedback (1-2 lines) for each category.

### Output Format:
Return the following JSON object:

{{
  "scores": {{
    "coverage": <int between 1-5>,
    "coherence_and_flow": <int between 1-5>,
    "faithfulness": <int between 1-5>,
    "paraphrasing": <int between 1-5>,
    "language_and_style": <int between 1-5>
  }},
  "final_semantic_score": <float or int out of 100>,
  "feedback": {{
    "coverage": "<feedback>",
    "coherence_and_flow": "<feedback>",
    "faithfulness": "<feedback>",
    "paraphrasing": "<feedback>",
    "language_and_style": "<feedback>"
  }}
}}

Reference Summary: {reference}
Generated Summary: {prediction}

Please return only the JSON object and nothing else.
"""

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        raw_response = response.choices[0].message.content.strip()
        result = _parse_json_reply(raw_response)

        # Validate the reply shape up front: a missing key raises here and the
        # row is reported as failed, instead of breaking a later cell.
        for category in _RUBRIC_CATEGORIES:
            result["scores"][category]
        result["feedback"]
        final_score = result["final_semantic_score"]

        # Rescale the 0-100 final score to a 0-5 range
        sts_score = final_score / 20.0

        return sts_score, result

    except Exception as e:
        # Deliberate best-effort: report the failure and mark the row as missing.
        print(f"Error during GPT-4 API call: {e}")
        return None, None

# Confirms the cell ran and compute_sts_gpt4_client is defined; no model is
# loaded locally, since the function calls the OpenAI API when invoked.
print("Model has been Loaded")

Model has been Loaded


In [None]:
# Work on an independent deep copy so the evaluation columns added later land
# on `sampled_df` and leave `combined_df` untouched.
sampled_df = combined_df.copy(deep=True)

# Verify the new DataFrame (first five rows)
sampled_df.head()

Unnamed: 0,article_cleaned,highlights_cleaned,llama_summary
0,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,Here is a 4-5 line summary of the content:\n\n...
1,A couple who weighed a combined 32st were sham...,Couple started piling on pounds after the birt...,Here is a 4-5 line summary of the article:\n\n...
2,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,Here is a concise summary of the content in 4-...
3,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey . Most ...,"Approximately 250 Syrian citizens, mostly ethn..."
4,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,Here is a 4-5 line summary of the article:\n\n...


## Applying GPT-Model to sampled_df

In [None]:
# Evaluate every (reference highlight, generated summary) pair in row order.
# Each call returns a (sts_score, result) pair; failed calls return (None, None).
evaluations = [
    compute_sts_gpt4_client(reference, generated)
    for reference, generated in zip(
        sampled_df["highlights_cleaned"], sampled_df["llama_summary"]
    )
]

# Transpose the list of pairs into one tuple of scores and one tuple of results
sts_scores, gpt_results = zip(*evaluations)
sampled_df["sts_score"] = sts_scores
sampled_df["result"] = gpt_results

print("Evaluations are completed")

Evaluations are completed


## Extracting scores and feedback

In [None]:
# Rubric categories present under both result["scores"] and result["feedback"]
rubric_keys = [
    "coverage",
    "coherence_and_flow",
    "faithfulness",
    "paraphrasing",
    "language_and_style",
]


def _result_field(section, key):
    """Build an extractor returning result[section][key], or None when the result is empty/None."""
    return lambda r: r[section][key] if r else None


# Extract scores into their own columns (<category>_score)
for key in rubric_keys:
    sampled_df[f"{key}_score"] = sampled_df["result"].apply(_result_field("scores", key))

# Extract feedback into their own columns (feedback_<category>)
for key in rubric_keys:
    sampled_df[f"feedback_{key}"] = sampled_df["result"].apply(_result_field("feedback", key))

# Overall 0-100 score as reported by the model
sampled_df["final_semantic_score"] = sampled_df["result"].apply(
    lambda r: r["final_semantic_score"] if r else None
)

print("Extraction Completed")

Extraction Completed


In [None]:
# Check the total count of null values per column
null_counts = sampled_df.isnull().sum()
print(null_counts)

sampled_df.info()
print(sampled_df.describe())

#verify ranges
# between() is inclusive on both ends and evaluates to False for missing
# values, so a row without a score also trips the assertion.
rubric_score_columns = [
    "coverage_score",
    "coherence_and_flow_score",
    "faithfulness_score",
    "paraphrasing_score",
    "language_and_style_score",
]
for column in rubric_score_columns:
    assert sampled_df[column].between(1, 5).all(), f"{column} out of range (1-5)"

final_in_range = sampled_df["final_semantic_score"].between(1, 100).all()
assert final_in_range, "final_semantic_score out of range (1-100)"

article_cleaned                0
highlights_cleaned             0
llama_summary                  0
sts_score                      0
result                         0
coverage_score                 0
coherence_and_flow_score       0
faithfulness_score             0
paraphrasing_score             0
language_and_style_score       0
feedback_coverage              0
feedback_coherence_and_flow    0
feedback_faithfulness          0
feedback_paraphrasing          0
feedback_language_and_style    0
final_semantic_score           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   article_cleaned              2000 non-null   object 
 1   highlights_cleaned           2000 non-null   object 
 2   llama_summary                2000 non-null   object 
 3   sts_score                    2000 non-null   float64
 4  

# ROUGE Scores

In [None]:
# Convert sts_score and final_semantic_score to int
# NOTE(review): astype(int) drops the fractional part (e.g. 4.35 -> 4) and
# raises if a value is missing - confirm truncation is the intended rounding.
for score_column in ("sts_score", "final_semantic_score"):
    sampled_df[score_column] = sampled_df[score_column].astype(int)

# Compute ROUGE scores: unigram, bigram and longest-common-subsequence variants, with stemming
rouge_variants = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_variants, use_stemmer=True)

def compute_rouge_scores(row):
    """
    Compute the ROUGE-1, ROUGE-2 and ROUGE-L F-measures for one row.

    Args:
        row: Row-like mapping exposing "highlights_cleaned" (the reference)
            and "llama_summary" (the prediction).

    Returns:
        pd.Series: The three F-measures indexed by "rouge1", "rouge2", "rougeL".
    """
    # The module-level `scorer` takes the reference first, then the prediction
    rouge_result = scorer.score(row["highlights_cleaned"], row["llama_summary"])
    f_measures = {
        metric: rouge_result[metric].fmeasure
        for metric in ("rouge1", "rouge2", "rougeL")
    }
    return pd.Series(f_measures)

# Apply the function row by row, then store the three ROUGE columns on the frame
rouge_columns = ["rouge1", "rouge2", "rougeL"]
rouge_frame = sampled_df.apply(compute_rouge_scores, axis=1)
sampled_df[rouge_columns] = rouge_frame

print("ROUGE scores computed")
# Save new CSV (row index omitted)
evaluated_csv_path = "/content/drive/My Drive/W266_Final Project/evaluated_samples_updated.csv"
sampled_df.to_csv(evaluated_csv_path, index=False)

ROUGE scores computed


# *END OF EVALUATION NOTEBOOK*