In [32]:
from llm import llm_response
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [155]:
old_prompt = """You are a helpful classifier that classifies the given Movie Review into Pathos, Ethos, Logos, or None.
    *** Only classify a review as None if the review is either:
    - Not in English
    - Review does not contain enough context to evalue upon (e.g. if the review is too short)

    Difference Between Pathos, Ethos, and Logos:
    1. Pathos (Emotional Appeal)
    • Appeals to emotions and feelings.
    • Used to evoke sympathy, excitement, nostalgia, or any emotional response from the audience.
    • Example: A movie review that describes how a film made the reviewer cry, laugh, or feel deeply connected to the characters.
    2. Ethos (Credibility/Authority)
    • Establishes the credibility or expertise of the reviewer.
    • Uses personal experience, professional background, or external validation to justify opinions.
    • Example: A review written by a well-known film critic or someone with experience in filmmaking.
    3. Logos (Logical Appeal)
    • Uses logic, facts, and reasoning to support an argument.
    • May include comparisons, statistics, or objective analysis.
    • Example: A review discussing the film’s cinematography, screenplay structure, or historical accuracy with evidence.

    ***IMPORTANT***
    Only return one the following 4: Pathos, Ethos, Logos, None
    Do NOT return any other response other than the 4.
"""

prompt = """You are a helpful classifier that classifies the given Movie Review into Pathos, Ethos, or Logos.
    Difference Between Pathos, Ethos, and Logos:
    1. Pathos (Emotional Appeal)
    • Appeals to emotions and feelings.
    • Used to evoke sympathy, excitement, nostalgia, or any emotional response from the audience.
    • Example: A movie review that describes how a film made the reviewer cry, laugh, or feel deeply connected to the characters.
    2. Ethos (Credibility/Authority)
    • Establishes the credibility or expertise of the reviewer - do not mistaken the reviewer for a character in the film or the director of the film.
    • Uses personal experience, professional background, or external validation to justify opinions.
    • Example: A review written by a well-known film critic or someone with experience in filmmaking.
    3. Logos (Logical Appeal)
    • Uses logic, facts, and reasoning to support an argument.
    • May include comparisons, statistics, or objective analysis.
    • Example: A review discussing the film’s cinematography, screenplay structure, or historical accuracy with evidence.

    Example:
    Ethos 
    “As an acclaimed film critic with over 20 years of experience, I can confidently say that The Godfather remains one of the most masterfully directed films in cinematic history. Coppola’s meticulous storytelling and Brando’s legendary performance solidify its place as a timeless classic.”

    Pathos 
    “With breathtaking cinematography and a heart-wrenching score, Schindler’s List leaves an unforgettable impact that lingers long after the credits roll. The film’s raw portrayal of human suffering is both devastating and deeply moving.”

    Logos
    “With a 98% rating on Rotten Tomatoes and three Academy Awards, Parasite is a masterclass in social commentary and suspense. Bong Joon-ho’s use of symbolism and tight narrative structure makes this a must-watch for any serious film enthusiast.”

    ***IMPORTANT***
    Only return one the following 3: Pathos, Ethos, Logos
    Do NOT return any other response other than the 3.
"""

In [None]:
# ### TEST
# review = "Grounded in extensive research and practical effects, Oppenheimer is a cinematic triumph that upholds the ethical responsibility of historical storytelling while offering a visually stunning and intellectually profound experience."
# question = f"Classify the following movie review: {review}"
# response = llm_response(prompt, question)
# print(response)

In [143]:
# ### TEST
# corpora_directory = "../modified_corpus_batches/json/"
# corpora_batches = {
#     "batch_1": "batch_1.json"
# }
# corpora_data = {
#     "batch_1": None
# }
# annotations = []

# for batch_name, corpora in corpora_batches.items():
#     count = 1 # TEST
#     data = pd.read_json(corpora_directory + corpora)
#     reviews = data["Review"]
#     print(f"Annotating {batch_name}...")
    
#     for review in tqdm(reviews, desc="Processing"):
#         if count == 0: break # TEST
#         question = f"Classify the following movie review: {review}"
#         annotations.append({
#             "Annotation": llm_response(prompt, question)
#         })
#         count-=1 # TEST

#     corpora_data[batch_name] = data.copy()
    
#     for annotation in annotations:
#         corpora_data[batch_name]["Annotation"] = annotation["Annotation"]

In [158]:
### Annotating batch_1 individually to ensure proper annotation is completed by gpt

corpora_directory = "../modified_corpus_batches/json/"
corpora_batches = {
    "batch_1": "batch_1.json",
    "batch_2": "batch_2.json",
    "batch_3": "batch_3.json",
    "batch_4": "batch_4.json"
}
corpora_data = {
    "batch_1": None,
    "batch_2": None,
    "batch_3": None,
    "batch_4": None
}
all_annotations = {}

for batch_name, corpora in corpora_batches.items():
    annotations = []
    data = pd.read_json(corpora_directory + corpora)
    reviews = data["Review"]
    print(f"Annotating {batch_name}...")
    
    for review in tqdm(reviews, desc="Processing"):
        question = f"Classify the following movie review: {review}"
        response = llm_response(prompt, question)
        annotations.append({
            "Annotation": response
        })

    all_annotations[batch_name] = annotations
    corpora_data[batch_name] = data.copy()
    
    for index, annotation in enumerate(annotations):
        corpora_data[batch_name].at[index, "Annotation"] = annotation["Annotation"]
    
    corpora_data[batch_name].to_json(corpora_directory + batch_name + "_gpt_annotated.json", orient="records", indent=2)
    

Annotating batch_1...


Processing:   0%|          | 0/250 [00:00<?, ?it/s]

Annotating batch_2...


Processing:   0%|          | 0/250 [00:00<?, ?it/s]

Annotating batch_3...


Processing:   0%|          | 0/250 [00:00<?, ?it/s]

Annotating batch_4...


Processing:   0%|          | 0/251 [00:00<?, ?it/s]

In [164]:
corpora_data["batch_3"][:25]

Unnamed: 0,Reviewer,Profile_URL,Review,Annotation
0,Jenni Kaye,https://letterboxd.com/hatchetface/,I’m that Leo meme pointing at every new famili...,Pathos
1,cinemadelrey,https://letterboxd.com/cinemadelrey/,"Wow, i'm speechless!And Cilian Murphy's perfor...",Pathos
2,vitor,https://letterboxd.com/favzscream/,"ABSOLUTE CINEMA.que filme foda, as 3hrs passar...",Pathos
3,aditi,https://letterboxd.com/dianyugen/,One of the best movies of all time. Cillian Mu...,Pathos
4,teamgal,https://letterboxd.com/teamgal/,Death by sound design.,Logos
5,DolanDark,https://letterboxd.com/dolandark/,Can't believe they actually put Gangnam Style ...,Logos
6,the sam,https://letterboxd.com/the_sam/,I know for a fact me and my father have a grea...,Pathos
7,Anton Joska,https://letterboxd.com/antonj/,Well Done Christopher Nolan! Well done!,Ethos
8,Joshy,https://letterboxd.com/jozhster/,Visually gorgeous with Nolan’s now signature p...,Logos
9,Allan Arkush,https://letterboxd.com/aarkush/,"Deeply felt, every second was compelling. I co...",Pathos


In [166]:
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import LabelEncoder

annotation_mapping = {
    "Pathos": 0,
    "Ethos": 1,
    "Logos": 2,
    "Missing": 3
}

batches = ["batch_1", "batch_2", "batch_3", "batch_4"]

batch_1_df_david = pd.read_excel("../modified_corpus_batches/xlsx/" + "batch_1.xlsx", sheet_name="David")
batch_2_df_david = pd.read_excel("../modified_corpus_batches/xlsx/" + "batch_2.xlsx", sheet_name="David")
batch_1_df_daoming = pd.read_excel("../modified_corpus_batches/xlsx/" + "batch_1.xlsx", sheet_name="Daoming")
batch_2_df_daoming = pd.read_excel("../modified_corpus_batches/xlsx/" + "batch_2.xlsx", sheet_name="Daoming")
david_annotation_1 = batch_1_df_david["Annotation"].fillna("Missing")
david_annotation_2 = batch_2_df_david["Annotation"].fillna("Missing")
daoming_annotation_1 = batch_1_df_daoming["Annotation"].fillna("Missing")
daoming_annotation_2 = batch_2_df_daoming["Annotation"].fillna("Missing")
david_encoded_1 = david_annotation_1.map(annotation_mapping)
david_encoded_2 = david_annotation_2.map(annotation_mapping)
daoming_encoded_1 = daoming_annotation_1.map(annotation_mapping)
daoming_encoded_2 = daoming_annotation_2.map(annotation_mapping)

for batch in batches:
    gpt_annotation = corpora_data[batch]["Annotation"][:25].replace("None", "Missing")
    gpt_encoded = gpt_annotation.map(annotation_mapping)

    if batch.endswith("1"):
        david_gpt_kappa_score = cohen_kappa_score(david_encoded_1, gpt_encoded)
        daoming_gpt_kappa_score = cohen_kappa_score(daoming_encoded_1, gpt_encoded)
        print(f"{batch} cohen's kappa between David and GPT: {david_gpt_kappa_score}")
        print(f"{batch} cohen's kappa between Daoming and GPT: {daoming_gpt_kappa_score}")
    if batch.endswith("2"):
        david_gpt_kappa_score = cohen_kappa_score(david_encoded_2, gpt_encoded)
        daoming_gpt_kappa_score = cohen_kappa_score(daoming_encoded_2, gpt_encoded)
        print(f"{batch} cohen's kappa between David and GPT: {david_gpt_kappa_score}")
        print(f"{batch} cohen's kappa between Daoming and GPT: {daoming_gpt_kappa_score}")

# predefined mapping
# label_encoder = LabelEncoder()
# david_encoded = label_encoder.fit_transform(david_annotation)
# daoming_encoded = label_encoder.transform(daoming_annotation)
# gpt_encoded = label_encoder.transform(gpt_annotation)

batch_1 cohen's kappa between David and GPT: 0.526813880126183
batch_1 cohen's kappa between Daoming and GPT: 0.23469387755102045
batch_2 cohen's kappa between David and GPT: 0.1071428571428572
batch_2 cohen's kappa between Daoming and GPT: 0.16666666666666674


In [65]:
corpora_directory = "../modified_corpus_batches/json/"
corpora_batches = {
    "batch_2": "batch_2.json",
    "batch_3": "batch_3.json",
    "batch_4": "batch_4.json"
}
corpora_data = {
    "batch_2": None,
    "batch_3": None,
    "batch_4": None
}
all_annotations = {}

for batch_name, corpora in corpora_batches.items():
    annotations = []
    data = pd.read_json(corpora_directory + corpora)
    reviews = data["Review"]
    print(f"Annotating {batch_name}...")
    
    for review in tqdm(reviews, desc="Processing"):
        question = f"Classify the following movie review: {review}"
        annotations.append({
            "Annotation": llm_response(prompt, question)
        })

    all_annotations[batch_name] = annotations
    corpora_data[batch_name] = data.copy()
    
    for annotation in annotations:
        corpora_data[batch_name]["Annotation"] = annotation["Annotation"]
    

Annotating batch_2...


Processing:   0%|          | 0/250 [00:00<?, ?it/s]

Annotating batch_3...


Processing:   0%|          | 0/250 [00:00<?, ?it/s]

Annotating batch_4...


Processing:   0%|          | 0/251 [00:00<?, ?it/s]

In [70]:
corpora_data["batch_4"]["Annotation"]

0      None
1      None
2      None
3      None
4      None
       ... 
246    None
247    None
248    None
249    None
250    None
Name: Annotation, Length: 251, dtype: object