In [173]:
import csv
import itertools
import random
from datetime import datetime, timedelta

In [174]:
# Define templates for Toxic 1 sentences (31 templates)
# Each entry is a str.format template; the generator fills the {person} and
# {complaint} placeholders from the `person` and `complaint` lists.
toxic_1_templates = [
    "This is so {complaint}, I do not get why {person} even bothers.",
    "Honestly, {person} is just making it {complaint}, and it is frustrating.",
    "It is ridiculous how {person} always finds a way to make things {complaint}.",
    "Why does {person} always have to make everything so {complaint}?",
    "This situation with {person} is just {complaint}, and I cannot understand it.",
    # No {person} placeholder here: this template formats to the same sentence
    # for every person value.
    "The way things are going makes everything feel {complaint} lately.",
    "It is hard to believe that {person} thinks this is acceptable when it is {complaint}.",
    "Every time I see {person}, it feels like everything is just {complaint}.",
    "Why must {person} insist on making things so {complaint}?",
    "Honestly, I find it {complaint} how {person} handles situations.",
    "It feels like {person} is trying to make everything {complaint}.",
    "Why is it that {person} always makes a simple task feel {complaint}?",
    "The way {person} approaches this is just {complaint}.",
    "Can someone explain why {person} thinks this is not {complaint}?",
    "It is disappointing to see how {person} manages to make everything {complaint}.",
    "I do not understand how {person} can be so {complaint}.",
    "It baffles me that {person} has not figured out that this is {complaint}.",
    "Every time {person} is involved, it just becomes {complaint}.",
    "I do not get how {person} can make things feel so {complaint}.",
    "Why does it seem like {person} thrives on making things {complaint}?",
    "This situation just goes to show how {person} makes it {complaint}.",
    "How is it possible that {person} is unaware of how {complaint} things are?",
    "It is beyond me why {person} chooses to act this way, given how {complaint} it is.",
    "I cannot fathom why {person} is okay with this being {complaint}.",
    "It is frustrating that {person} has not recognized this as {complaint}.",
    "Why does {person} think it is fine to make everything so {complaint}?",
    "I wish {person} would see how {complaint} their actions are.",
    "The level of {complaint} in {person}'s approach is astonishing.",
    "Can {person} not see how {complaint} their actions are?",
    "Every interaction with {person} feels like a step toward {complaint}.",
    "Why does {person} seem to enjoy making things {complaint}?",
]

# Define potential complaints (30 complaints)
# Values substituted for the {complaint} placeholder of each template.
# Most entries are adjectives; "a waste of time" and "nonsense" are noun
# phrases, so they read less naturally in templates of the form "so {complaint}".
complaint = [
    "pointless",
    "a waste of time",
    "unnecessary",
    "ridiculous",
    "confusing",
    "overly complicated",
    "frustrating",
    "annoying",
    "inconvenient",
    "meaningless",
    "unproductive",
    "unhelpful",
    "time-consuming",
    "disorganized",
    "difficult",
    "exhausting",
    "misguided",
    "infuriating",
    "disappointing",
    "stupid",
    "nonsense",
    "tedious",
    "unreasonable",
    "unacceptable",
    "insulting",
    "overwhelming",
    "unfair",
    "unjustified",
    "excessive",
    "unwarranted",
]

# Define potential persons (6 persons)
# Values substituted for the {person} placeholder of each template.
# All entries are lowercase, including where a template puts {person}
# at a position that follows sentence-initial words.
person = [
    "he",
    "she",
    "this person",
    "my colleague",
    "that guy",
    "my friend",
]

In [175]:
# Generate the Cartesian product of all combinations


def generate_deterministic_toxic_1_comments(
    templates=None, persons=None, complaints=None, limit=5040
):
    """Render every template/person/complaint combination into a comment.

    Combinations are enumerated with ``itertools.product`` (templates vary
    slowest, complaints fastest), so the output order is deterministic.
    A template that omits a placeholder renders the same sentence for several
    combinations; such repeats are dropped, keeping the first occurrence.

    Args:
        templates: ``str.format`` templates using ``{person}`` and/or
            ``{complaint}``. Defaults to the notebook's ``toxic_1_templates``.
        persons: Values for ``{person}``. Defaults to the notebook's ``person``.
        complaints: Values for ``{complaint}``. Defaults to the notebook's
            ``complaint``.
        limit: Maximum number of comments to return (non-negative int), or
            ``None`` for all of them. Defaults to 5040.

    Returns:
        A list of at most ``limit`` distinct comment strings.
    """
    # Defaults are resolved at call time so the function always sees the
    # current notebook-level lists.
    if templates is None:
        templates = toxic_1_templates
    if persons is None:
        persons = person
    if complaints is None:
        complaints = complaint

    # Loop names differ from the module-level `person` / `complaint` lists to
    # avoid shadowing them.
    rendered = (
        template.format(person=who, complaint=what)
        for template, who, what in itertools.product(templates, persons, complaints)
    )

    # dict.fromkeys removes repeated sentences while preserving first-seen order.
    distinct = dict.fromkeys(rendered)
    return list(itertools.islice(distinct, limit))

In [176]:
# Generate a random 7-character comment ID (lowercase letters and digits; the function itself does not check uniqueness)


def generate_unique_comment_id():
    """Return a random 7-character ID made of lowercase letters and digits.

    Characters are drawn independently with replacement; nothing here tracks
    previously returned IDs, so two calls may return the same value.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    drawn = random.choices(alphabet, k=7)
    return "".join(drawn)

In [177]:
# Generate a random link ID ("t3_" followed by 7 characters)


def generate_unique_link_id():
    """Return "t3_" followed by 7 random lowercase letters/digits.

    The suffix is drawn with replacement and is not checked against earlier
    results, so repeated values are possible.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    suffix = "".join(random.choices(alphabet, k=7))
    return f"t3_{suffix}"

In [178]:
# Generate a random timestamp (date and time) in 2023


def generate_random_timestamp(year=2023):
    """Return a random moment within ``year`` as "MM/DD/YYYY hh:MM:SS AM/PM".

    Args:
        year: Calendar year to draw from (default 2023).

    Returns:
        The timestamp formatted with ``"%m/%d/%Y %I:%M:%S %p"``.
    """
    first_day = datetime(year, 1, 1)
    last_day = datetime(year, 12, 31)
    day_span = (last_day - first_day).days

    # Draw order is day, hour, minute, second; keeping it fixed means a seeded
    # RNG yields the same timestamp sequence.
    day_offset = random.randint(0, day_span)
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)

    # One timedelta carries both the day offset and the time of day.
    moment = first_day + timedelta(
        days=day_offset, hours=hour, minutes=minute, seconds=second
    )
    return moment.strftime("%m/%d/%Y %I:%M:%S %p")

In [179]:
# Generate a random 6-letter username (alphabets only)


def generate_username():
    """Return a random username of 6 lowercase ASCII letters."""
    letters = "abcdefghijklmnopqrstuvwxyz"
    drawn = random.choices(letters, k=6)
    return "".join(drawn)

In [180]:
# Generate the deterministic comment texts (the generator returns at most 5040)
toxic_1_data = generate_deterministic_toxic_1_comments()

In [181]:
# CSV file columns
# Passed to csv.DictWriter as `fieldnames`, so this order is the column order
# of the written file and every row dict must use exactly these keys.
csv_columns = [
    "text",
    "timestamp",
    "username",
    "link",
    "link_id",
    "parent_id",
    "id",
    "subreddit_id",
    "moderation",
    "year",
    "concatenated_count",
    "complete_thread",
    "gold_label",
    "generated_data",
]

In [182]:
# Constant values
# Each of these is written unchanged into the same-named column of every row.
subreddit_id = "t5_2qh8c"
# Stored as a string that spells out a dict, not as a dict object.
moderation = "{'controversiality': 0, 'collapsed_reason_code': None, 'collapsed': False, 'collapsed_reason': None}"
# Kept as a string, not an int.
year = "2021"
concatenated_count = 1
complete_thread = True
gold_label = "Toxic 1"
# Marks every row as generated data.
generated_data = True

In [183]:
# Prepare rows for CSV
# Seed the module-level RNG so a fresh Restart & Run All regenerates the same
# ids, usernames and timestamps (and, via the same RNG, the later shuffle).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

csv_data = []
used_comment_ids = set()

# Iterate over the comments themselves instead of a hard-coded row count, so a
# shorter toxic_1_data cannot raise IndexError and a longer one is not truncated.
for text in toxic_1_data:
    # Draw the timestamp from `year` so the "timestamp" and "year" columns
    # cannot disagree (the generator's own default year is 2023).
    timestamp = generate_random_timestamp(int(year))
    username = generate_username()
    link_id = generate_unique_link_id()
    parent_id = link_id  # Same within each row

    # Random ids can repeat; redraw until unused so "id" is unique per row.
    comment_id = generate_unique_comment_id()
    while comment_id in used_comment_ids:
        comment_id = generate_unique_comment_id()
    used_comment_ids.add(comment_id)

    link = f"/r/singapore/comments/13c3mt8/Strict_appearance_and_grooming_standards_for_Singapore_Airlines_cabin_crew/{comment_id}/"

    # Create a row for the CSV; keys must match csv_columns.
    row = {
        "text": text,
        "timestamp": timestamp,
        "username": username,
        "link": link,
        "link_id": link_id,
        "parent_id": parent_id,
        "id": comment_id,
        "subreddit_id": subreddit_id,
        "moderation": moderation,
        "year": year,
        "concatenated_count": concatenated_count,
        "complete_thread": complete_thread,
        "gold_label": gold_label,
        "generated_data": generated_data,
    }

    csv_data.append(row)

In [184]:
# Randomise the row order in place before export
random.shuffle(csv_data)

# Export the shuffled rows: header line first, then one line per row
csv_file = "toxic_1_comments.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    for record in csv_data:
        writer.writerow(record)

print(f"CSV file '{csv_file}' generated successfully.")

CSV file 'toxic_1_comments.csv' generated successfully.
