In [40]:
import csv
import itertools
import random
from datetime import datetime, timedelta

In [41]:
# Define templates for Toxic 3 sentences (21 templates).
# Each template is filled with str.format using the named placeholders
# {person}, {insult} and {threat}. Every template uses {person} and {threat};
# only some of them use {insult}.
toxic_3_templates = [
    "{person}, you are such a {insult}. Just wait until I {threat}.",
    "Honestly, nobody likes {person}. I will {threat} if you keep acting this way.",
    "{person}, you really need to stop being such a {insult}, or I might just {threat}.",
    "{person}, you are the worst! I swear I will {threat} if you do not change your behavior.",
    "{person}, you make me sick. If you do not back off, I will {threat}.",
    "{person}, you are a total {insult}. You better watch out because I could easily {threat}.",
    "{person}, you think you are so clever, but you are just a {insult}. I might {threat} if you keep this up.",
    "Do not test me, {person}. You might just find that I will {threat} at any moment.",
    "{person}, you are pathetic. If you keep talking like that, I will {threat}.",
    "{person}, you should be careful. I can easily {threat} if you continue this nonsense.",
    "{person}, you deserve every bit of this. I will {threat} if you do not stop.",
    # NOTE(review): "how {insult} you are" reads as if {insult} were an adjective.
    "It is infuriating how {insult} you are, {person}. You should really fear that I might {threat}.",
    "I cannot stand you, {person}. Do not be surprised if I decide to {threat}.",
    "Your stupidity knows no bounds, {person}. I might just {threat} one day.",
    "{person}, you are such a {insult}. Just remember, I could always {threat} if you keep this up.",
    "{person}, you think you can get away with this? I will {threat} if you continue.",
    "It is amazing how annoying you are, {person}. I may {threat} before this is over.",
    "Your presence is a joke, {person}. I will {threat} if you do not leave me alone.",
    "{person}, you are a complete {insult}. I would not hesitate to {threat} if this continues.",
    "{person}, I have had enough of your nonsense. You should know that I might {threat} soon.",
    # NOTE(review): unlike the templates above, this one presents {threat} as
    # something the addressee should do rather than something the speaker will
    # do. Confirm this template is intended.
    "Maybe you should just {threat}, {person}, it would be better for everyone.",
]

# Define potential persons (5 persons); each entry is substituted for {person}.
# NOTE(review): several entries are third-person descriptions, while most
# templates address {person} directly ("{person}, you are ..."). Confirm the
# resulting phrasing is intended.
person = ["this idiot", "that coward", "you", "the rude coworker", "the drama queen"]

# Define potential insults (10 insults); each entry is substituted for {insult}.
# NOTE(review): the templates place these after "a" / "such a", so the entry
# "idiot" is rendered as "a idiot". Confirm whether that is acceptable for
# the generated text.
insult = [
    "loser",
    "idiot",
    "fool",
    "coward",
    "jerk",
    "hypocrite",
    "drama queen",
    "failure",
    "nobody",
    "clown",
]

# Define potential threats (15 threats including self-harm or targeted harassment).
# Each entry is substituted for {threat} and is phrased as a verb phrase so it
# can follow "I will", "I might", and similar lead-ins in the templates.
threat = [
    "make your life miserable",
    "come after you",
    "report you",
    "expose you",
    "ruin you",
    "take you down",
    "hurt you",
    "make you regret this",
    "show you what pain feels like",
    "get revenge on you",
    "make you wish you were never born",
    "destroy your reputation",
    "ensure you feel real fear",
    "break you down",
    "just end it all",
]

In [42]:
# Generate the Cartesian product of all combinations


def generate_deterministic_toxic_3_comments(limit=10000):
    """Build comments by filling every template with every word combination.

    Walks the Cartesian product of ``toxic_3_templates``, ``insult``,
    ``person`` and ``threat`` (in that order, so the template varies slowest
    and the threat varies fastest) and formats each template with the
    matching values.

    Args:
        limit: Maximum number of comments to return. Defaults to 10000.
            ``None`` returns the whole product. Must not be negative.

    Returns:
        list[str]: The first ``limit`` formatted comments, in product order.
        The list is shorter than ``limit`` if the product is smaller.

    Note:
        A template without an ``{insult}`` placeholder yields the same
        sentence once per insult, so the result can contain repeated
        sentences. Because truncation follows product order, templates late
        in ``toxic_3_templates`` are the first to be dropped when the product
        is larger than ``limit``.
    """
    combinations = itertools.product(toxic_3_templates, insult, person, threat)

    # Stop after `limit` items rather than formatting the whole product and
    # slicing afterwards. The loop names are kept distinct from the
    # module-level lists so they do not shadow them.
    return [
        template.format(insult=insult_word, person=person_name, threat=threat_phrase)
        for template, insult_word, person_name, threat_phrase in itertools.islice(
            combinations, limit
        )
    ]

In [43]:
# Generate a unique ID (e.g., comment id)


def generate_unique_comment_id():
    """Return a random 7-character ID made of lowercase letters and digits.

    Characters are drawn independently with replacement. Nothing here tracks
    previously returned values, so uniqueness across calls is not enforced.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    picked_chars = random.choices(alphabet, k=7)
    return "".join(picked_chars)

In [44]:
# Generate a unique link ID (e.g., "t3_" followed by 7 characters)


def generate_unique_link_id():
    """Return a random link ID: the prefix "t3_" plus 7 random characters.

    The suffix is drawn with replacement from lowercase letters and digits.
    Uniqueness across calls is not enforced by this function.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    suffix = "".join(random.choices(alphabet, k=7))
    return f"t3_{suffix}"

In [45]:
# Generate a random timestamp (date and time) in 2023


def generate_random_timestamp(year=2023):
    """Return a random date-time within ``year`` as a formatted string.

    A day offset from January 1 (up to and including December 31) is drawn
    first, followed by an hour, a minute and a second.

    Args:
        year: Calendar year the timestamp falls in. Defaults to 2023.

    Returns:
        str: The moment formatted as ``MM/DD/YYYY HH:MM:SS AM/PM``
        (12-hour clock, ``%m/%d/%Y %I:%M:%S %p``).
    """
    first_day = datetime(year, 1, 1)
    last_day = datetime(year, 12, 31)
    span_in_days = (last_day - first_day).days

    # Draw order matters for reproducibility under a seed: day, hour, minute, second.
    day_offset = random.randint(0, span_in_days)
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)

    moment = first_day + timedelta(
        days=day_offset, hours=hour, minutes=minute, seconds=second
    )
    return moment.strftime("%m/%d/%Y %I:%M:%S %p")

In [46]:
# Generate a random 6-letter username (alphabets only)


def generate_username():
    """Return a random 6-character username of lowercase ASCII letters.

    Letters are drawn independently with replacement.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    chosen = random.choices(letters, k=6)
    return "".join(chosen)

In [47]:
# Generate the deterministic comments (the generator caps the list at 10000 entries)
toxic_3_data = generate_deterministic_toxic_3_comments()

In [48]:
# CSV file columns, in the order they appear in the output file.
# This list is passed to csv.DictWriter as `fieldnames`, so the keys of every
# row dict must be drawn from it.
csv_columns = [
    "text",
    "timestamp",
    "username",
    "link",
    "link_id",
    "parent_id",
    "id",
    "subreddit_id",
    "moderation",
    "year",
    "concatenated_count",
    "complete_thread",
    "gold_label",
    "generated_data",
]

In [49]:
# Constant values: these are written unchanged into every generated row.
subreddit_id = "t5_2qh8c"
# Stored as a string containing a Python-dict-style literal (single quotes,
# None/False). It is not valid JSON.
moderation = "{'controversiality': 0, 'collapsed_reason_code': None, 'collapsed': False, 'collapsed_reason': None}"
# Kept as a string. It matches the default year of generate_random_timestamp(),
# which is called without arguments when rows are built; keep the two in sync.
year = "2023"
concatenated_count = 1
complete_thread = True
# Label assigned to every row in this file.
gold_label = "Toxic 3"
# NOTE(review): presumably marks the row as synthetic rather than collected - confirm.
generated_data = True

In [50]:
# Prepare rows for CSV: one row per generated comment.
# Iterating over toxic_3_data directly (instead of a hard-coded index range)
# keeps this cell from raising IndexError if the comment list is ever shorter.
csv_data = []
seen_comment_ids = set()
for text in toxic_3_data:
    # Call order is kept fixed so a seeded run yields the same rows.
    timestamp = generate_random_timestamp()
    username = generate_username()
    link_id = generate_unique_link_id()
    parent_id = link_id  # The row's parent is set to its own link ID
    comment_id = generate_unique_comment_id()

    # The ID generator draws at random and does not track earlier results;
    # redraw on a collision so the "id" column has no duplicates.
    while comment_id in seen_comment_ids:
        comment_id = generate_unique_comment_id()
    seen_comment_ids.add(comment_id)

    link = f"/r/singapore/comments/13c3mt8/Strict_appearance_and_grooming_standards_for_Singapore_Airlines_cabin_crew/{comment_id}/"

    # Create a row for the CSV; keys must match csv_columns.
    row = {
        "text": text,
        "timestamp": timestamp,
        "username": username,
        "link": link,
        "link_id": link_id,
        "parent_id": parent_id,
        "id": comment_id,
        "subreddit_id": subreddit_id,
        "moderation": moderation,
        "year": year,
        "concatenated_count": concatenated_count,
        "complete_thread": complete_thread,
        "gold_label": gold_label,
        "generated_data": generated_data,
    }

    csv_data.append(row)

In [51]:
# Randomise the row order in place, then keep only the first 9860 rows.
random.shuffle(csv_data)
selected_data = csv_data[:9860]


# Write the retained rows to disk, header line first.
csv_file = "toxic_3_comments.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    for record in selected_data:
        writer.writerow(record)

print(f"CSV file '{csv_file}' generated successfully.")

CSV file 'toxic_3_comments.csv' generated successfully.
