In [53]:
import csv
import re

# Function to preprocess and write to CSV
def _split_question_answer(pair):
    """Split one question/answer chunk into (question, answer_lead_in).

    Double quotes and trailing commas are removed first. The chunk is then cut
    at the FIRST '?' if it contains one, otherwise at the first '.'; the
    punctuation mark stays attached to the question. Cutting only at the first
    occurrence lets the answer lead-in contain further '?' or '.' characters
    without breaking the two-way split.

    Raises:
        ValueError: if the chunk contains neither '?' nor '.'.
    """
    pair_cleaned = pair.replace('"', '').rstrip(',')
    # '?' takes priority over '.', so a question mark always ends the question
    # even when a period appears somewhere in the chunk.
    for mark in ('?', '.'):
        if mark in pair_cleaned:
            question, answer_lead_in = pair_cleaned.split(mark, 1)
            return question.strip() + mark, answer_lead_in.strip()
    raise ValueError(f"No punctuation for splitting found in: '{pair_cleaned}'")


def preprocess_to_csv(input_file, output_csv):
    """Convert a text file of quoted question/answer pairs into a CSV file.

    Each non-blank input line may start with a number prefix such as "12. ",
    which is removed. The rest of the line is split on the '", "' separator
    into chunks, and every chunk is split into a question and an answer
    lead-in. All resulting fields of one line are written as a single CSV row
    beneath a fixed four-column header.

    Args:
        input_file: Path of the text file to read.
        output_csv: Path of the CSV file to create (overwritten if present).

    Raises:
        ValueError: if a chunk contains neither '?' nor '.'. The offending line
            number and text are printed before the exception propagates.
    """
    with open(input_file, 'r') as infile, open(output_csv, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        # Write headers to the CSV file
        headers = ['Hallucination Question', 'Hallucination Answer', 'Factual Question', 'Factual Answer']
        writer.writerow(headers)
        # Process each line in the input file
        for line_number, line in enumerate(infile, 1):
            try:
                # Remove the leading "<number>. " prefix and surrounding whitespace
                line_cleaned = re.sub(r'^\d+\.\s*', '', line).strip()
                if not line_cleaned:
                    # Blank (or number-only) lines carry no pairs; skip them
                    # instead of aborting the whole conversion.
                    continue
                split_results = []
                for pair in line_cleaned.split('", "'):
                    split_results.extend(_split_question_answer(pair))
                # Write the results to the CSV
                writer.writerow(split_results)
            except Exception:
                print(f"Failed processing line {line_number}: {line.strip()}")
                # Bare raise keeps the original traceback intact
                raise

# Source text file and destination CSV for the preprocessing step
input_file = 'raw_output.txt'  # Change this to your actual text file's name
output_csv = 'hallucination_prompts.csv'

# Run the conversion: parse input_file and write the resulting rows to output_csv
preprocess_to_csv(input_file=input_file, output_csv=output_csv)

In [40]:
# pandas has to be imported before the first pd.* call; with the import placed
# after pd.set_option this cell raises NameError on a fresh kernel.
import pandas as pd

# Do not truncate long text columns when displaying frames
pd.set_option('display.max_colwidth', None)

# Load the CSV written by the preprocessing step and preview the first rows
data = pd.read_csv(output_csv)
data.head(5)

Unnamed: 0,Hallucination Question,Hallucination Answer,Factual Question,Factual Answer
0,What are the primary ingredients in a Grinmoss Potion?,The primary ingredients in a Grinmoss Potion are,What are the primary ingredients in a Caesar Salad?,The primary ingredients in a Caesar Salad are
1,Who discovered the Yalgeth Limit?,The Yalgeth Limit was discovered by,Who discovered penicillin?,Penicillin was discovered by
2,What is the lifespan of a Bluvian Fernmite?,The lifespan of a Bluvian Fernmite is,What is the lifespan of a Galapagos Tortoise?,The lifespan of a Galapagos Tortoise is
3,Can you list the benefits of daily exposure to Dawnton Rays?,The benefits of daily exposure to Dawnton Rays are,Can you list the benefits of daily exposure to sunlight?,The benefits of daily exposure to sunlight are
4,Describe the plot of The Voyager of Drims.,The Voyager of Drims is about,Describe the plot of The Catcher in the Rye.,The Catcher in the Rye is about


In [50]:
def check(q):
    """Return False if `q` contains any flagged substring, True otherwise.

    The flagged substrings are 'nonexistent', 'non-existent', 'fictitious'
    and 'fake'. Matching is a plain, case-sensitive substring test.
    """
    for flagged in ('nonexistent', 'non-existent', 'fictitious', 'fake'):
        if flagged in q:
            return False
    return True
    
# Evaluate check() on every hallucination question; True marks a row to keep
keep_mask = list(map(check, data['Hallucination Question'].tolist()))
# Keep only the rows whose mask entry is True
data = data.loc[keep_mask]

In [54]:
# Preview the first five rows of the filtered frame
data.head(n=5)

Unnamed: 0,Hallucination Question,Hallucination Answer,Factual Question,Factual Answer
0,What are the primary ingredients in a Grinmoss Potion?,The primary ingredients in a Grinmoss Potion are,What are the primary ingredients in a Caesar Salad?,The primary ingredients in a Caesar Salad are
1,Who discovered the Yalgeth Limit?,The Yalgeth Limit was discovered by,Who discovered penicillin?,Penicillin was discovered by
2,What is the lifespan of a Bluvian Fernmite?,The lifespan of a Bluvian Fernmite is,What is the lifespan of a Galapagos Tortoise?,The lifespan of a Galapagos Tortoise is
3,Can you list the benefits of daily exposure to Dawnton Rays?,The benefits of daily exposure to Dawnton Rays are,Can you list the benefits of daily exposure to sunlight?,The benefits of daily exposure to sunlight are
4,Describe the plot of The Voyager of Drims.,The Voyager of Drims is about,Describe the plot of The Catcher in the Rye.,The Catcher in the Rye is about


In [55]:
# Overwrite the CSV with the filtered rows, omitting the DataFrame index column
data.to_csv(path_or_buf=output_csv, index=False)