In [31]:
import pandas as pd
import spacy
import random
from textblob import TextBlob
from nltk.corpus import wordnet

# Fetch the NLTK resources this notebook relies on
import nltk
for _nltk_resource in ('wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

# spaCy pipeline used for POS tags and named entities
nlp = spacy.load("en_core_web_sm")

# Two-row toy dataframe for trying out the inaccuracy functions
data = dict(
    summary_sentence=[
        "Uncle Pros is helping Laurella, his sick niece, by borrowing a cradle for her newborn baby.",
        "Laurella explains that her husband, Consadine, is a poor provider and that he often goes off to make his fortune elsewhere.",
    ],
    text_chunk=["", ""],
)

df = pd.DataFrame(data)

# Function to find the antonym of an adjective
def get_antonym(word):
    """Return the first WordNet antonym found for ``word``.

    Synsets and their lemmas are scanned in the order WordNet yields them;
    the name of the first antonym of the first lemma that has any is
    returned. If no lemma has an antonym, ``word`` is returned unchanged.
    """
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            opposites = lemma.antonyms()
            if opposites:
                return opposites[0].name()
    return word

# Function to introduce inaccuracies using NLP
def introduce_inaccuracies(summary, choice="negate a verb"):
    """Return ``summary`` with one deliberate inaccuracy injected.

    Args:
        summary: The sentence to corrupt.
        choice: Which modification to apply; must be one of the strings in
            ``modifications`` below. Defaults to "negate a verb". Pass
            ``None`` to pick a modification at random.

    Returns:
        The modified sentence. When the chosen modification finds nothing it
        can change (no verb, no integer quantity, fewer than two people, ...)
        a generic contradicting sentence is appended instead.

    Raises:
        ValueError: If ``choice`` is neither ``None`` nor a known modification.
    """
    import re  # local import: the notebook's import cell does not provide it

    # List of possible modifications
    modifications = [
        "negate a verb",
        "change a named entity",
        "replace an adjective with its opposite",
        "change a quantity",
        "swap character roles",
        "introduce a factual error",
        "add a misleading detail",
        "remove a key detail"
    ]

    if choice is None:
        choice = random.choice(modifications)
    elif choice not in modifications:
        raise ValueError("Unknown modification: {!r}".format(choice))

    doc = nlp(summary)

    # Negate a verb in the summary
    if choice == "negate a verb":
        for token in doc:
            if token.pos_ == "VERB":
                # Insert at the token's character offset so only this verb is
                # negated; str.replace would also hit the same letters inside
                # other words ("likes" in "dislikes").
                return summary[:token.idx] + "not " + summary[token.idx:]

    # Change a named entity in the summary
    elif choice == "change a named entity":
        for ent in doc.ents:
            if ent.label_ in ("PERSON", "ORG", "GPE"):
                return _replace_whole_word(summary, ent.text, "someone else")

    # Replace an adjective with its opposite
    elif choice == "replace an adjective with its opposite":
        # TextBlob is only needed for this branch, so it is built here.
        for word, pos in TextBlob(summary).tags:
            if pos == 'JJ':  # Adjective
                antonym = get_antonym(word)
                if antonym == word:
                    # No antonym known: try the next adjective instead of
                    # returning the summary unchanged.
                    continue
                # WordNet joins multi-word lemma names with underscores.
                modified = _replace_whole_word(summary, word, antonym.replace("_", " "))
                if modified != summary:
                    return modified

    # Change a quantity in the summary
    elif choice == "change a quantity":
        for token in doc:
            if token.pos_ == "NUM":
                try:
                    value = int(token.text)
                except ValueError:
                    # Spelled-out or formatted numbers ("Two", "1,000", "3.5")
                    continue
                end = token.idx + len(token.text)
                return summary[:token.idx] + str(value + random.randint(1, 10)) + summary[end:]

    # Swap character roles in the summary
    elif choice == "swap character roles":
        characters = []
        for ent in doc.ents:
            # Keep distinct names only: swapping a name with itself is a no-op.
            if ent.label_ == "PERSON" and ent.text not in characters:
                characters.append(ent.text)
        if len(characters) >= 2:
            first, second = characters[0], characters[1]
            swap = {first: second, second: first}
            # Longest name first so a name containing the other is matched in
            # full; a single pass avoids any placeholder string.
            alternatives = "|".join(
                re.escape(name) for name in sorted(swap, key=len, reverse=True)
            )
            pattern = r"(?<!\w)(?:" + alternatives + r")(?!\w)"
            return re.sub(pattern, lambda match: swap[match.group(0)], summary)

    # Introduce a factual error in the summary
    elif choice == "introduce a factual error":
        return summary + " However, it is later revealed that this is incorrect."

    # Add a misleading detail
    elif choice == "add a misleading detail":
        misleading_details = [
            " Despite this, the situation was actually very different.",
            " Interestingly, this contradicts what was later found.",
            " In reality, things were quite the opposite."
        ]
        return summary + random.choice(misleading_details)

    # Remove a key detail from the summary
    elif choice == "remove a key detail":
        tokens = summary.split()
        if len(tokens) > 5:
            return " ".join(tokens[:-5]) + "..."

    # Default case: append an inaccuracy if no specific modification is made
    return summary + " However, this is later proven to be false."


def _replace_whole_word(text, old, new):
    """Replace every standalone occurrence of ``old`` in ``text`` with ``new``.

    Unlike ``str.replace``, occurrences embedded in a longer word are left
    alone (e.g. "Ann" inside "Annabel").
    """
    import re  # local import: the notebook's import cell does not provide it

    pattern = r"(?<!\w)" + re.escape(old) + r"(?!\w)"
    # A callable replacement keeps any backslashes in ``new`` literal.
    return re.sub(pattern, lambda _match: new, text)

# Corrupt every summary sentence, writing the result back into the same column
df['summary_sentence'] = df['summary_sentence'].map(introduce_inaccuracies)

# Show the dataframe after modification
df

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jonathanhu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jonathanhu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,summary_sentence,text_chunk
0,"Uncle Pros is not helping Laurella, his sick n...",
1,"Laurella not explains that her husband, Consad...",


In [78]:
# Load the mapped summaries; this rebinds `df`, replacing the toy dataframe built earlier.
# Later cells read its `book_num` and `summary_sentence` columns.
df = pd.read_csv("mapped_summaries_l3.csv")

In [82]:
def get_first_10_summary_sentences(group):
    """Return the leading rows of ``group`` (at most 10), in their existing order."""
    return group.head(10)

# Apply the function to each group and reset the index.
# The columns are selected explicitly (grouping column included) because
# DataFrameGroupBy.apply operating on the grouping column is deprecated since
# pandas 2.2; without the selection, `book_num` would be lost after
# reset_index(drop=True) once pandas excludes groupings from apply.
filtered_df = (
    df.groupby('book_num')[df.columns.tolist()]
    .apply(get_first_10_summary_sentences)
    .reset_index(drop=True)
)


In [83]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Initialize the model and tokenizer
# `model_name` is the checkpoint identifier passed to both from_pretrained calls.
# NOTE(review): judging by its name the checkpoint generates negated claims — confirm against the model card.
model_name = 'minwhoo/bart-base-negative-claim-generation'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [85]:
import torch
from tqdm import tqdm

# Function to transform a sentence using the model
def transform_sentences(sentences, num_beams=5, max_input_tokens=1024, **generate_kwargs):
    """Rewrite a batch of sentences with the seq2seq ``model``.

    Args:
        sentences: Iterable of input strings.
        num_beams: Beam width passed to ``model.generate``.
        max_input_tokens: Inputs longer than this many tokens are truncated
            by the tokenizer.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``model.generate`` (for example ``max_new_tokens``). When none
            are given, the model's own generation defaults apply.

    Returns:
        A list with one decoded string per input sentence; an empty list
        when ``sentences`` is empty.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to tokenize; skip the tokenizer and model entirely.
        return []

    batch = tokenizer(
        sentences,
        max_length=max_input_tokens,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    # Keep the inputs on whatever device the model lives on.
    batch = batch.to(model.device)
    with torch.no_grad():
        out = model.generate(
            batch['input_ids'],
            # The batch is padded, so pass the mask explicitly rather than
            # relying on generate() to infer it from pad tokens.
            attention_mask=batch['attention_mask'],
            num_beams=num_beams,
            **generate_kwargs
        )
    return tokenizer.batch_decode(out, skip_special_tokens=True)

# Process sentences in batches
max_tokens = 1024  # NOTE(review): not read by this loop
batch_size = 10 # Adjust this batch size as needed
modified_sentences = []

# tqdm takes the number of steps from the range itself, so the progress total
# stays an integer and follows batch_size instead of a hard-coded 10.
for start_idx in tqdm(range(0, len(filtered_df), batch_size), desc="Processing summary claims"):
    # .iloc makes the slice explicitly positional.
    batch_sentences = filtered_df['summary_sentence'].iloc[start_idx:start_idx + batch_size].tolist()
    modified_sentences.extend(transform_sentences(batch_sentences))

# Add the modified sentences to the dataframe.
# This overwrites the original sentences, so re-running this cell feeds the
# already-generated text back into the model.
filtered_df['summary_sentence'] = modified_sentences

Processing summary claims: 100%|██████████| 150/150.0 [02:51<00:00,  1.14s/it]


In [87]:
# Keep only rows whose generated sentence ends with a period; rebinds `df` again.
# NOTE(review): presumably this discards generations that were cut off mid-sentence — confirm.
df = filtered_df[filtered_df['summary_sentence'].str.endswith('.')]


In [90]:
# Write the filtered rows to disk without the dataframe index.
df.to_csv("negative_claims.csv", index=False)

In [91]:
# Display the filtered dataframe that was just saved.
df

Unnamed: 0,model_size,book_num,summary_sentence_num,summary_sentence,text_chunk
2,175b,0,2,Pros takes the cradle outside to get the first...,"rich, broken light from the cavernous fireplac..."
4,175b,0,4,Pros finishes repair the cradle and removes th...,"she wants--ain't ye, Pretty?""\n\nAnd, having m..."
5,175b,0,5,"Laurella decide to name the baby Johnnie, afte...",hit the name that should 'a' went with the clo...
7,175b,0,7,"Johnnie's mother, Laurella, is absent from the...","walk. Her mother would get up too, and that wa..."
8,175b,0,8,Laurella reluctantly refuses to let Johnnie go...,"and able.""\n\nAnd sighingly--yet light-hearted..."
...,...,...,...,...,...
1491,175b,266,1,"In England, there is a lack of order and prote...","rustic mire, snuffed about by pigs, and rooste..."
1492,175b,266,2,A man walks up the Dover road on a Saturday mo...,\nII. The Mail\n\n\nIt was the Dover road that...
1494,175b,266,4,"Suddenly, the guard hears a horse coming at a ...","\n""I say a horse at a canter coming up, Joe.""\..."
1498,175b,266,8,The messenger assures himself that it would be...,"muffled again.\n\n""No, Jerry, no!"" said the me..."
