In [40]:
import sys
import os
from src.utils.gpt import GPT
from src.utils.data_utils import cnn_news_loader, sample_cnn_news
import tiktoken
from dotenv import dotenv_values
import ollama

# Key/value settings read from the project's .env file; the OpenAI API key is
# looked up from this mapping when the GPT wrappers are created.
configs = dotenv_values("src/.env")

# Seed handed to the sampling helpers.
SEED = 10
# Number of news articles requested from the sampler (summarization part).
N_news = 1000
# Number of movie reviews requested from the sampler (sentiment part).
N_reviews = 1000
# Token budget passed as `max_tokens` to every summarization call.
MAX_TOKENS = 100

## Part I: Summarization

**Dataset:** 
* The CNN/Daily Mail dataset contains about 300k English-language news articles written by journalists at CNN and the Daily Mail. Each article is paired with a human-written summary, which serves as the ground truth. We sample a subset from the test split.
* The dataset can be downloaded from https://huggingface.co/datasets/abisee/cnn_dailymail/tree/main/3.0.0 as `test-00000-of-00001.parquet`. Save the file to the `data/cnn_news/test/` folder.


In [2]:
# Explore CNN news data: load the test split, then draw the working sample.

import pandas as pd

# Reads the parquet file saved under data/cnn_news/test/ via the project loader.
test_news = cnn_news_loader("data/cnn_news/test/test-00000-of-00001.parquet")
print("Total number of news articles:", len(test_news))

# Sample N_news news articles with SEED.
# NOTE(review): presumably a seeded random draw — the helper lives in
# src.utils.data_utils and is not shown here; confirm.
sampled_news = sample_cnn_news(test_news, N_news, SEED)
print("Sampled news articles:", len(sampled_news))

Total number of news articles: 11490
Sampled news articles: 1000


In [3]:
# Print the first 3 sampled articles next to their reference highlights

for position in range(3):
    print("News article:", position + 1)
    article_row = sampled_news.iloc[position]
    print("Article: ", article_row["article"])
    print("Highlights: ", article_row["highlights"])
    print("\n")

News article: 1
Article:  A convicted fraudster stole £17,000 from the City firm where she worked after she started embezzling money on her second day in the job. Gabrielle Yinka Saunders, 32, used company credit cards to pay for her £10,000 wedding, as well as a £5,000 honeymoon in the Seychelles. She was arrested at Heathrow when she returned from the honeymoon - but now she has been spared jail even though a court heard that she had been convicted of fraud twice previously. Just married: But Gabrielle Yinka Saunders, pictured during her wedding to David Osborne, stole £17,000 to pay for the ceremony and her luxury honeymoon . Fraud: Saunders, pictured outside the Old Bailey, admitted cheating her company out of thousands of pounds . Saunders, also known as Gabriella Osborne since getting married at Belair House in Dulwich last September, stole £35,000 from her previous employer PriceWaterhouseCoopers, where she worked as a tax accountant. After serving a jail sentence, she started a

In [4]:
# Mean length, in tokens, of the reference highlights

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")


def _n_tokens(text):
    """Return the number of tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


tokens = sampled_news["highlights"].apply(_n_tokens)
print(tokens.mean())

66.479


In [41]:
# Generate news summary for each news article in sampled_news by using GPT-3.5-turbo and GPT-4o-mini models
# Save the summary in a new column in the dataframe, named "summary_gpt3" and "summary_gpt4"

# Load GPT models
# The key comes from the .env values held in `configs`; a missing entry raises KeyError here.
api_key = configs["OPENAI_API_KEY"]
# One project-level GPT wrapper (src.utils.gpt.GPT) per OpenAI model name.
gpt3 = GPT(api_key,"gpt-3.5-turbo")
gpt4 = GPT(api_key,"gpt-4o-mini")

# Define the function to generate summary
def generate_summary_per_row(df, model, column_name, max_tokens):
    """Return a copy of ``df`` with one model-written summary per article.

    Every value of the "article" column is passed to ``model._summarize``
    together with ``max_tokens``; the results are stored under
    ``column_name``. The frame passed in is not modified.
    """
    def _summarise_article(article_text):
        return model._summarize(article_text, max_tokens)

    result = df.copy()
    result[column_name] = result["article"].apply(_summarise_article)
    return result


# Llama3 model
def summarize_text_llama3(model, text, max_tokens=200):
    """Ask an Ollama chat model to summarize ``text`` and return its reply.

    ``model`` is the Ollama model name. ``max_tokens`` is only mentioned in
    the prompt wording; no generation option is passed to enforce it.
    Returns the content string of the reply message.
    """
    system_turn = {'role': 'system', 'content': 'You are a helpful assistant.'}
    user_turn = {
        'role': 'user',
        'content': f'Summarize the following news article within {max_tokens} tokens:\n {text}.\nSummary:',
    }
    reply = ollama.chat(model=model, messages=[system_turn, user_turn])
    return reply['message']['content']


def classify_sentiment_llama3(model, text):
    """Ask an Ollama chat model for the sentiment of ``text``.

    ``model`` is the Ollama model name. The prompt requests a single word,
    "positive" or "negative", but the raw reply content is returned
    unchanged; any normalisation is left to the caller.
    """
    system_turn = {'role': 'system', 'content': 'You are a helpful assistant.'}
    user_turn = {
        'role': 'user',
        'content': f'Classify the sentiment of the following movie reviews and output the sentiment as a single word "positive" or "negative". Text:\n{text}',
    }
    reply = ollama.chat(model=model, messages=[system_turn, user_turn])
    return reply['message']['content']


def generate_summary_llama3_per_row(df, model, column_name, max_tokens):
    """Return a copy of ``df`` with an Ollama-written summary per article.

    Each value of the "article" column is sent through
    ``summarize_text_llama3`` with the given model name and ``max_tokens``;
    the replies are stored under ``column_name``. ``df`` itself is untouched.
    """
    def _summarise_article(article_text):
        return summarize_text_llama3(model, article_text, max_tokens)

    result = df.copy()
    result[column_name] = result["article"].apply(_summarise_article)
    return result

In [9]:
# Track the time taken to generate summary for each news article
# Each timer wraps the whole call, so the printed value is the total
# wall-clock time in seconds for the full sample, not a per-article figure.

import time

# $14.5 -> $13.74
# NOTE(review): presumably the API credit balance before and after this run — confirm.

# GPT-3.5-turbo
start_time_1 = time.time()
# Rebinds sampled_news to a copy that carries the new "summary_gpt3" column.
sampled_news = generate_summary_per_row(sampled_news, gpt3, "summary_gpt3", MAX_TOKENS)
end_time_1 = time.time()
print("Time taken to generate summary for each news article using GPT-3.5-turbo: ", end_time_1 - start_time_1)


# GPT-4o-mini
start_time_2 = time.time()
# Adds "summary_gpt4" on top of the frame produced above.
sampled_news = generate_summary_per_row(sampled_news, gpt4, "summary_gpt4", MAX_TOKENS)
end_time_2 = time.time()
print("Time taken to generate summary for each news article using GPT-4o-mini: ", end_time_2 - start_time_2)

# Persist both GPT summary columns; index=False leaves the DataFrame index out of the file.
sampled_news.to_csv('data/cnn_news/output/sampled_news_gpt.csv', index=False)

Time taken to generate summary for each news article using GPT-3.5-turbo:  1586.7160348892212
Time taken to generate summary for each news article using GPT-4o-mini:  1397.1406872272491


In [43]:
# Llama 3.1
# Relies on `time` having been imported by the GPT timing cell.
start_time_3 = time.time()
# "llama3.1" is the model name handed to ollama.chat; adds the "summary_llama3" column.
sampled_news = generate_summary_llama3_per_row(sampled_news, "llama3.1", "summary_llama3", MAX_TOKENS)
end_time_3 = time.time()
# Total wall-clock seconds for the whole sample.
print("Time taken to generate summary for each news article using Llama 3.1 ", end_time_3 - start_time_3)

# Persist the frame with all summary columns; index=False leaves the index out of the file.
sampled_news.to_csv('data/cnn_news/output/sampled_news_gpt_llama3.csv', index=False)

Time taken to generate summary for each news article using Llama 3.1  11266.976531028748


### Evaluation
* ROUGE scores: ROUGE-1 (unigram overlap), ROUGE-2 (bigram overlap) and ROUGE-L (longest common subsequence).
* BLEU score

In [44]:
from src.utils.evaluation import evaluate_summary_performance, calculate_accuracy

# The human-written highlights are passed first, the model summaries second.

# GPT-3.5-turbo
gpt3_news = sampled_news["highlights"].tolist()
gpt3_summary = sampled_news["summary_gpt3"].tolist()
gpt3_rouge = evaluate_summary_performance(gpt3_news, gpt3_summary)

# GPT-4o-mini
gpt4_news = sampled_news["highlights"].tolist()
gpt4_summary = sampled_news["summary_gpt4"].tolist()
gpt4_rouge = evaluate_summary_performance(gpt4_news, gpt4_summary)

# Report the scores, with a blank separator between the two models.
for report_position, (model_label, rouge_scores) in enumerate(
    [("GPT-3.5-turbo", gpt3_rouge), ("GPT-4o-mini", gpt4_rouge)]
):
    if report_position > 0:
        print("\n")
    print(model_label)
    print("ROUGE-1: ", rouge_scores['rouge1'])
    print("ROUGE-2: ", rouge_scores['rouge2'])
    print("ROUGE-L: ", rouge_scores['rougeL'])

GPT-3.5-turbo
ROUGE-1:  0.3430620294062374
ROUGE-2:  0.1038963261020401
ROUGE-L:  0.20981706125120478


GPT-4o-mini
ROUGE-1:  0.36672165373927307
ROUGE-2:  0.12355515098172702
ROUGE-L:  0.22207151243856477


In [45]:
# Llama3: score the generated summaries against the reference highlights

llama3_news = sampled_news["highlights"].tolist()
llama3_summary = sampled_news["summary_llama3"].tolist()
llama3_rouge = evaluate_summary_performance(llama3_news, llama3_summary)

print("Llama3")
for metric_label, metric_key in (
    ("ROUGE-1: ", 'rouge1'),
    ("ROUGE-2: ", 'rouge2'),
    ("ROUGE-L: ", 'rougeL'),
):
    print(metric_label, llama3_rouge[metric_key])


Llama3
ROUGE-1:  0.3851959270138329
ROUGE-2:  0.13993070303573268
ROUGE-L:  0.2345538448687383


Show randomly selected examples and generated summaries

In [53]:
# Pick 5 news articles at random (reproducibly)
# print the article, the actual highlights, the summary generated by GPT-3.5-turbo, GPT-4o-mini and Llama3

import random

# random.seed only seeds Python's own `random` module. DataFrame.sample does
# not draw from it, so the sample is pinned through `random_state` instead.
random.seed(SEED)
random_sample = sampled_news.sample(5, random_state=SEED)
for _, news_row in random_sample.iterrows():
    print("Article: ", news_row["article"])
    print("Highlights: ", news_row["highlights"])
    print("Summary GPT-3.5-turbo: ", news_row["summary_gpt3"])
    print("Summary GPT-4o-mini: ", news_row["summary_gpt4"])
    print("Summary Llama3: ", news_row["summary_llama3"])
    print("\n")

Article:  Ed Balls was today accused of 'letting the cat out of the bag' on tax rises after leaving the door open to trapping more middle-class workers in the 40p tax rate. In an interview the shadow chancellor repeatedly refused to rule out trying to balance the books by lowering the amount workers have to earn before they pay the higher income tax rate. The number of people paying 40p tax has already risen from about 2million to 5million in two decades, but the Tories have promised to lift the threshold if they are returned to power. Scroll down for video . Shadow chancellor Ed Balls, pictured giving a speech in Glasgow today, has repeatedly refused to rule out using the point at which the higher income tax rate kicks in to raise money to balance the books . The Shadow Chancellor was joined by the Scottish Labour Leader Jim Murphy at the Royal Concert hall in Glasgow today . Labour has vowed not to increase VAT or National Insurance as part of measures to bring down the deficit if th

## Part II: Sentiment Classification

**Dataset:** 

The IMDB movie reviews dataset contains 50k highly polar movie reviews, each labeled with a binary sentiment: positive or negative.

In [19]:
from src.utils.data_utils import imdb_review_loader, sample_movie_review

# Load the IMDB reviews; the loader returns the reviews and the list of sentiment classes.
# NOTE(review): the printed labels below say "news articles" although this
# cell handles movie reviews; the wording is kept so the recorded output still matches.
reviews, classes = imdb_review_loader("data/imdb_movie_reviews/IMDB_Dataset.csv")
print("Total number of news articles:", len(reviews))
print("Sentiment classes:", classes)

# Sample N_reviews movie reviews with SEED.
# NOTE(review): presumably a seeded random draw — the helper is not shown here; confirm.
sampled_reviews = sample_movie_review(reviews, N_reviews, SEED)
print("Sampled news articles:", len(sampled_reviews))

Total number of news articles: 50000
Sentiment classes: ['positive', 'negative']
Sampled news articles: 1000


In [29]:
# Class balance of the sample: number of reviews per ground-truth sentiment label.
sampled_reviews['sentiment'].value_counts()

sentiment
positive    510
negative    490
Name: count, dtype: int64

In [20]:
# Display the sampled reviews with their ground-truth sentiment (notebook rich display).
sampled_reviews

Unnamed: 0,review,sentiment
27632,"This is one of the very, very few films that a...",positive
36119,"Fred ""The Hammer"" Williamson delivers another ...",negative
4796,"When the trailer for Accepted first came up, m...",positive
3648,Buddy is an entertaining family film set in a ...,positive
24501,I thought the kids in the movie were great. I ...,positive
...,...,...
38289,I would rate this film high on my list of Ingr...,positive
12693,The main criticism of AT THE EARTH'S CORE is t...,positive
21049,It's obvious that the people who made 'Dead At...,negative
7348,Both Robert Duvall and Glenn Close played thei...,positive


In [74]:
# Re-create the GPT wrappers for the classification runs, with the same
# constructor arguments as in the summarization part; `api_key` comes from that earlier cell.
gpt3 = GPT(api_key,"gpt-3.5-turbo")
gpt4 = GPT(api_key,"gpt-4o-mini")

def parse_sentiment(x):
    """Normalise a free-text model reply to "positive", "negative" or "".

    Matching is case-insensitive and substring-based. When the reply mentions
    both labels, the one that appears first wins, so a reply such as
    "Negative, despite a few positive moments" is no longer read as positive
    just because "positive" is checked first. Anything that is not a string
    (for example ``None`` or ``NaN`` from a failed call) yields "".

    Args:
        x: Raw reply returned by a sentiment model.

    Returns:
        "positive", "negative", or "" when neither label can be found.
    """
    if not isinstance(x, str):
        return ""
    lowered = x.lower()
    # (position, label) for every label present; min() then picks the earliest mention.
    mentions = [
        (lowered.find(label), label)
        for label in ("positive", "negative")
        if label in lowered
    ]
    if not mentions:
        return ""
    return min(mentions)[1]

def classify_sentiment_per_row(df, model, column_name):
    """Return a copy of ``df`` with a parsed sentiment label per review.

    Each value of the "review" column is passed to
    ``model._classify_sentiment``; the raw reply is then reduced by
    ``parse_sentiment`` and stored under ``column_name``. The frame passed in
    is not modified.
    """
    result = df.copy()
    raw_replies = result["review"].apply(model._classify_sentiment)
    # Reduce each free-text reply to the label produced by parse_sentiment.
    result[column_name] = raw_replies.apply(parse_sentiment)
    return result

def classify_sentiment_llama3_per_row(df, model, column_name):
    """Return a copy of ``df`` with a parsed Ollama sentiment label per review.

    Each value of the "review" column is sent through
    ``classify_sentiment_llama3`` with the given model name; the raw reply is
    then reduced by ``parse_sentiment`` and stored under ``column_name``.
    ``df`` itself is left untouched.
    """
    def _classify_review(review_text):
        return classify_sentiment_llama3(model, review_text)

    result = df.copy()
    raw_replies = result["review"].apply(_classify_review)
    # Reduce each free-text reply to the label produced by parse_sentiment.
    result[column_name] = raw_replies.apply(parse_sentiment)
    return result

# GPT-3.5-turbo
# Each timer wraps the whole call, so the printed value is the total wall-clock
# time in seconds for the full sample.
# NOTE(review): start_time_3 / end_time_3 are also used by the Llama 3.1
# summarization cell and are overwritten here; the messages say "news article"
# although these are movie reviews.
start_time_3 = time.time()
# Rebinds sampled_reviews to a copy that carries the new "sentiment_gpt3" column.
sampled_reviews = classify_sentiment_per_row(sampled_reviews, gpt3, "sentiment_gpt3")
end_time_3 = time.time()
print("Time taken to classify sentiment for each news article using GPT-3.5-turbo: ", end_time_3 - start_time_3)


# GPT-4o-mini
start_time_4 = time.time()
# Adds "sentiment_gpt4" on top of the frame produced above.
sampled_reviews = classify_sentiment_per_row(sampled_reviews, gpt4, "sentiment_gpt4")
end_time_4 = time.time()
print("Time taken to classify sentiment for each news article using GPT-4o-mini: ", end_time_4 - start_time_4)

# Persist both GPT prediction columns; index=False leaves the DataFrame index out of the file.
sampled_reviews.to_csv('data/imdb_movie_reviews/output/sampled_reviews_gpt.csv', index=False)

Time taken to classify sentiment for each news article using GPT-3.5-turbo:  605.8195822238922
Time taken to classify sentiment for each news article using GPT-4o-mini:  1135.01295876503


In [61]:
# NOTE(review): this function is also defined in the GPT classification cell
# of this notebook with the same behavior; whichever cell ran last is in effect.
def classify_sentiment_llama3_per_row(df, model, column_name):
    """Return a copy of ``df`` with a parsed Ollama sentiment label per review.

    Each value of the "review" column is sent through
    ``classify_sentiment_llama3`` with the given model name; the raw reply is
    then reduced by ``parse_sentiment`` and stored under ``column_name``.
    ``df`` itself is left untouched.
    """
    def _classify_review(review_text):
        return classify_sentiment_llama3(model, review_text)

    result = df.copy()
    raw_replies = result["review"].apply(_classify_review)
    # Reduce each free-text reply to the label produced by parse_sentiment.
    result[column_name] = raw_replies.apply(parse_sentiment)
    return result

# Llama 3.1: the timer wraps the whole call, so the printed value is the total
# wall-clock time in seconds for the full sample.
start_time_6 = time.time()
# "llama3.1" is the model name handed to ollama.chat; adds the "sentiment_llama3" column.
sampled_reviews = classify_sentiment_llama3_per_row(sampled_reviews, "llama3.1", "sentiment_llama3")
end_time_6 = time.time()
print("Time taken to classify sentiment for each news article using Llama3.1: ", end_time_6 - start_time_6)

# Persist the frame with all prediction columns; index=False leaves the index out of the file.
sampled_reviews.to_csv('data/imdb_movie_reviews/output/sampled_reviews_gpt_llama3.csv', index=False)

Time taken to classify sentiment for each news article using Llama3.1:  2505.2234642505646


In [75]:
# Evaluate accuracy

# The ground-truth labels are shared by all three models, so they are extracted once.
ground_truth_sentiments = sampled_reviews["sentiment"].tolist()

gpt3_predicted_sentiments = sampled_reviews["sentiment_gpt3"].tolist()
gpt4_predicted_sentiments = sampled_reviews["sentiment_gpt4"].tolist()
llama3_predicted_sentiments = sampled_reviews["sentiment_llama3"].tolist()

# Ground truth first, predictions second.
gpt3_accuracy = calculate_accuracy(ground_truth_sentiments, gpt3_predicted_sentiments)
gpt4_accuracy = calculate_accuracy(ground_truth_sentiments, gpt4_predicted_sentiments)
llama3_accuracy = calculate_accuracy(ground_truth_sentiments, llama3_predicted_sentiments)


In [76]:
# Report the accuracy of each model on the sampled reviews.
print("GPT-3.5-turbo")
print("Accuracy: ", gpt3_accuracy)
print("\n")
# The remaining two models are printed back to back, without a separator.
for model_label, model_accuracy in (
    ("GPT-4o-mini", gpt4_accuracy),
    ("Llama3", llama3_accuracy),
):
    print(model_label)
    print("Accuracy: ", model_accuracy)

GPT-3.5-turbo
Accuracy:  0.923


GPT-4o-mini
Accuracy:  0.9
Llama3
Accuracy:  0.934


Check examples that Llama3 gets right but GPT-4o-mini gets wrong

In [77]:
# Reviews that GPT-4o-mini misclassified but Llama3 classified correctly.
gpt4_wrong = sampled_reviews["sentiment"] != sampled_reviews["sentiment_gpt4"]
llama3_right = sampled_reviews["sentiment"] == sampled_reviews["sentiment_llama3"]
candidates = sampled_reviews.loc[
    gpt4_wrong & llama3_right,
    ["review", "sentiment_gpt4", "sentiment_llama3"],
]

# Seeded so the same examples are shown on every run, and capped at the number
# of matching rows so that fewer than 3 matches does not raise a ValueError.
examples = candidates.sample(min(3, len(candidates)), random_state=SEED)


for _, review_row in examples.iterrows():
    print("Review: ", review_row["review"])
    print("Sentiment GPT-4o-mini: ", review_row["sentiment_gpt4"])
    print("Sentiment Llama3: ", review_row["sentiment_llama3"])
    print("\n")

Review:  The acting, other reviews notwithstanding, was remarkably well-done. Brad Pitt handles the role of an annoying, obnoxious Austrian climber quite well. Other acting is fine. The story could have been riveting, but somehow, it misses - one never really understands or cares for the characters shown, and so the story, which could have been quite dramatic, fails to draw in this audience.<br /><br />Beautiful scenery and cinematography, a remarkably dramatic true story, important events that shaped the world that we live in - but I could not, try as I might, involve myself in this story. As an unabashed Brad Pitt fan (I consider him one of the top 5 actors of his generation), I expected to *love* this flick - and yet, it left me cold.<br /><br />It could be a failing within myself, but I tend to point toward the creative end of this movie - direction, scriptwriting, production, editing - somehow, they lost me. It's a shame, because it could have been wonderful.<br /><br />Good actin