In [121]:
import csv
import itertools
import random
from datetime import datetime, timedelta

In [122]:
# Define templates for Toxic 2 sentences (26 templates)
# Each template has a {person} and an {insult} placeholder that is filled in
# with str.format() when the comments are generated.
# NOTE(review): a few templates use {insult} in adjective position
# ("how {insult} you can be", "how {insult} you truly are", "a {insult} demeanor")
# while the `insult` entries are noun phrases — confirm the resulting wording is intended.
toxic_2_templates = [
    "{person}, it is quite impressive how consistently you manage to be such a {insult}.",
    "You truly outdid yourself this time, {person}. What a {insult} you are.",
    "Honestly, {person}, you must take pride in being such a {insult}.",
    "It is a remarkable talent, {person}, to be such a {insult} and remain unaware.",
    "Every time {person} speaks, I cannot help but think, 'What a {insult}.'",
    "Why does {person} always choose to behave like a {insult}? It is baffling.",
    "I did not think it was possible, but {person} proves that being a {insult} can be a full-time occupation.",
    "{person}, keep it up! Your dedication to being a {insult} is truly something remarkable.",
    "It is astonishing how {person} manages to maintain such a {insult} demeanor every single day.",
    "I admire {person}'s commitment to being a {insult} at all times.",
    "You know, {person}, being a {insult} seems to be your specialty.",
    "It is almost impressive how {person} always finds a way to be a {insult}.",
    "Do not worry, {person}, we all need a {insult} in our lives from time to time.",
    "I cannot believe how much of a {insult} you continue to be, {person}.",
    "Why does {person} even try? It would be better to embrace being a {insult}.",
    "{person}, do you ever get tired of being such a {insult}?",
    "Every encounter with {person} serves as a reminder of what a {insult} looks like.",
    "It is fascinating how {person} keeps redefining what it means to be a {insult}.",
    "Seriously, {person}, is being a {insult} a goal of yours?",
    "{person}, I genuinely wonder how you manage to be such a {insult}.",
    "You are setting new records for being a {insult}, {person}.",
    "If being a {insult} were an Olympic sport, you would surely win gold, {person}.",
    "I cannot help but laugh when I see how {insult} you can be, {person}.",
    "Keep being yourself, {person}, but please try not to be such a {insult}.",
    "It is not easy being a {insult}, but {person} makes it seem effortless.",
    "{person}, if only you were aware of how {insult} you truly are.",
]

# Define potential toxic insults (14 insults)
# Each entry is substituted for the {insult} placeholder of a template.
insult = [
    "clown",
    "joke",
    "failure",
    "mess",
    "drama queen",
    "whiner",
    "loser",
    "nobody",
    "faux expert",
    "underachiever",
    "disappointment",
    "simpleton",
    "nuisance",
    "drama magnet",
]

# Define potential persons (13 entries)
# Each entry is substituted for the {person} placeholder of a template.
# NOTE(review): the pronoun entries ("he", "she") are also used in direct-address
# and possessive slots (e.g. "{person}, ..." and "{person}'s") — confirm this is intended.
person = [
    "he",
    "she",
    "my colleague",
    "that guy",
    "that girl",
    "the manager",
    "my friend",
    "the client",
    "the supervisor",
    "the director",
    "my boss",
    "that customer",
    "the intern",
]

In [123]:
# Generate the Cartesian product of all combinations


def generate_deterministic_toxic_2_comments(
    limit=4495, templates=None, insults=None, persons=None
):
    """Render template x insult x person combinations in a fixed order.

    The combinations are produced by ``itertools.product`` with the template
    as the slowest-varying component and the person as the fastest-varying
    one, so the output order is fully deterministic.

    Args:
        limit: Maximum number of comments to return. Defaults to 4495, the
            size used by this notebook. Pass ``None`` to return every
            combination.
        templates: Format strings containing ``{person}`` and ``{insult}``
            placeholders. Defaults to the module-level ``toxic_2_templates``.
        insults: Values for ``{insult}``. Defaults to the module-level
            ``insult`` list.
        persons: Values for ``{person}``. Defaults to the module-level
            ``person`` list.

    Returns:
        A list with the first ``limit`` rendered comments (fewer if there are
        not that many combinations).
    """
    # Resolve the defaults at call time so the function picks up the lists as
    # they are currently defined in the notebook.
    if templates is None:
        templates = toxic_2_templates
    if insults is None:
        insults = insult
    if persons is None:
        persons = person

    combinations = itertools.product(templates, insults, persons)
    rendered = (
        template.format(insult=insult_word, person=person_word)
        for template, insult_word, person_word in combinations
    )

    # Truncation keeps the head of the product, so whatever is dropped comes
    # from the last templates. islice stops early instead of rendering every
    # combination and slicing afterwards.
    return list(itertools.islice(rendered, limit))

In [124]:
# Build a random comment id (7 lowercase-alphanumeric characters)


def generate_unique_comment_id():
    """Return a random 7-character id drawn from ``a-z`` and ``0-9``.

    Characters are sampled independently with replacement. Despite the name,
    this function does not track previously returned ids, so uniqueness across
    calls is not enforced here.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    picked = random.choices(alphabet, k=7)
    return "".join(picked)

In [125]:
# Build a random link id ("t3_" followed by 7 lowercase-alphanumeric characters)


def generate_unique_link_id():
    """Return ``"t3_"`` plus 7 random characters drawn from ``a-z`` and ``0-9``.

    Characters are sampled independently with replacement. Despite the name,
    this function does not track previously returned ids, so uniqueness across
    calls is not enforced here.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    suffix = "".join(random.choices(alphabet, k=7))
    return "t3_" + suffix

In [126]:
# Pick a random date and time within a calendar year (2023 by default)


def generate_random_timestamp(year=2023):
    """Return a random moment of ``year`` formatted as ``MM/DD/YYYY HH:MM:SS AM/PM``.

    The day is drawn uniformly from January 1 through December 31 (inclusive)
    and the hour, minute and second are each drawn independently.

    Args:
        year: Calendar year the timestamp falls in.

    Returns:
        The timestamp as a string in ``"%m/%d/%Y %I:%M:%S %p"`` format.
    """
    first_day = datetime(year, 1, 1)
    # Number of days between Jan 1 and Dec 31, so leap years get 365 offsets.
    last_day_offset = (datetime(year, 12, 31) - first_day).days

    # Draw order (day, hour, minute, second) is kept fixed so seeded runs
    # consume the random stream the same way.
    day_offset = random.randint(0, last_day_offset)
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)

    moment = first_day + timedelta(
        days=day_offset, hours=hour, minutes=minute, seconds=second
    )
    return moment.strftime("%m/%d/%Y %I:%M:%S %p")

In [127]:
# Build a random username made of 6 lowercase letters


def generate_username():
    """Return a random 6-character username drawn from ``a-z``.

    Letters are sampled independently with replacement.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    picked = random.choices(letters, k=6)
    return "".join(picked)

In [128]:
# Generate the deterministic comments
# (generate_deterministic_toxic_2_comments returns at most 4495 rows, not 5000)
toxic_2_data = generate_deterministic_toxic_2_comments()

In [129]:
# CSV file columns, in output order.
# This list is passed as `fieldnames` to csv.DictWriter, so every row dict
# must use exactly these keys.
csv_columns = [
    "text",
    "timestamp",
    "username",
    "link",
    "link_id",
    "parent_id",
    "id",
    "subreddit_id",
    "moderation",
    "year",
    "concatenated_count",
    "complete_thread",
    "gold_label",
    "generated_data",
]

In [130]:
# Constant values written unchanged into every generated row
subreddit_id = "t5_2qh8c"
# Kept as a string that looks like a Python dict literal, not as an actual dict
moderation = "{'controversiality': 0, 'collapsed_reason_code': None, 'collapsed': False, 'collapsed_reason': None}"
# Stored as a string.
# NOTE(review): generate_random_timestamp() defaults to year=2023 — confirm the
# timestamp column is generated for this same year.
year = "2022"
concatenated_count = 1
complete_thread = True
gold_label = "Toxic 2"
# Marks the row as synthetic (generated by this notebook)
generated_data = True

In [131]:
# Prepare rows for CSV: one row per generated comment
csv_data = []
# Comment ids handed out so far; the id generator is purely random, so
# duplicates are rejected here to keep the "id" column unique.
seen_comment_ids = set()

# Iterate over the comments directly instead of a hard-coded row count, so the
# loop always matches the size of toxic_2_data.
for text in toxic_2_data:
    # Use the configured `year` constant so the timestamp and the "year"
    # column of the same row cannot disagree.
    timestamp = generate_random_timestamp(int(year))
    username = generate_username()
    link_id = generate_unique_link_id()
    parent_id = link_id  # Same within each row

    comment_id = generate_unique_comment_id()
    while comment_id in seen_comment_ids:
        comment_id = generate_unique_comment_id()
    seen_comment_ids.add(comment_id)

    link = f"/r/singapore/comments/13c3mt8/Strict_appearance_and_grooming_standards_for_Singapore_Airlines_cabin_crew/{comment_id}/"

    # Create a row for the CSV; keys must match csv_columns
    row = {
        "text": text,
        "timestamp": timestamp,
        "username": username,
        "link": link,
        "link_id": link_id,
        "parent_id": parent_id,
        "id": comment_id,
        "subreddit_id": subreddit_id,
        "moderation": moderation,
        "year": year,
        "concatenated_count": concatenated_count,
        "complete_thread": complete_thread,
        "gold_label": gold_label,
        "generated_data": generated_data,
    }

    csv_data.append(row)

In [132]:
# Randomise the row order in place before exporting
random.shuffle(csv_data)

# Export the shuffled rows, header first, to a UTF-8 encoded CSV file
csv_file = "toxic_2_comments.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as out_file:
    dict_writer = csv.DictWriter(out_file, fieldnames=csv_columns)
    dict_writer.writeheader()
    for record in csv_data:
        dict_writer.writerow(record)

print(f"CSV file '{csv_file}' generated successfully.")

CSV file 'toxic_2_comments.csv' generated successfully.
