In [98]:
import csv
import itertools
import random
from datetime import datetime, timedelta

In [99]:
# Sentence templates for the "Toxic 1" label (6 templates).
# Each template is filled in with str.format(person=..., complaint=...).
# NOTE(review): the last template has no {person} placeholder, so str.format
# ignores the person argument and the template yields the same sentence for
# every person -- confirm this duplication is intended.
toxic_1_templates = [
    "This is so {complaint}, I don't get why {person} even bothers.",
    "Honestly, {person} is just {complaint}, and it is frustrating.",
    "It is ridiculous how {complaint} {person} is acting.",
    "Why is {person} always so {complaint}?",
    "This situation with {person} is just {complaint}, such a waste of time.",
    "The way things are going is leading to a lot of {complaint} lately.",
]

# Values substituted for the {complaint} placeholder (30 entries).
# NOTE(review): most entries are adjectives, but "a waste of time" and
# "nonsense" are noun phrases, so templates such as "This is so {complaint}"
# read awkwardly with them -- confirm whether that is acceptable.
complaint = [
    "pointless",
    "a waste of time",
    "useless",
    "frustrating",
    "annoying",
    "irrelevant",
    "unnecessary",
    "ridiculous",
    "boring",
    "meaningless",
    "unproductive",
    "exhausting",
    "infuriating",
    "unhelpful",
    "time-consuming",
    "confusing",
    "unfair",
    "overwhelming",
    "disappointing",
    "stupid",
    "nonsense",
    "tedious",
    "unreasonable",
    "unacceptable",
    "insulting",
    "difficult",
    "disorganized",
    "poorly executed",
    "misguided",
    "inconsistent",
]

# Values substituted for the {person} placeholder (28 unique entries).
# Entries must stay unique: a repeated person produces repeated sentences.
# 6 templates x 28 persons x 30 complaints = 5040 combinations, which is the
# number of rows the notebook generates downstream.
# (A second "that team member" entry used to inflate the list to 29 items.)
person = [
    "he",
    "she",
    "this person",
    "my colleague",
    "that guy",
    "that girl",
    "the manager",
    "my friend",
    "the client",
    "the supervisor",
    "the director",
    "the employee",
    "the staff",
    "my boss",
    "that team member",
    "the intern",
    "that customer",
    "that consultant",
    "the analyst",
    "the assistant",
    "the team leader",
    "someone",
    "this colleague",
    "that associate",
    "the individual",
    "that worker",
    "the team member",
    "the participant",
]

In [100]:
# Generate comments from the Cartesian product of templates, persons and complaints


def generate_deterministic_toxic_1_comments(
    limit=5040, templates=None, persons=None, complaints=None
):
    """Build comments from every (template, person, complaint) combination.

    Combinations are produced by itertools.product, so the template varies
    slowest and the complaint varies fastest. The order is fully determined
    by the input lists; no randomness is involved.

    Args:
        limit: Maximum number of comments to return (default 5040). Pass
            None to return every combination.
        templates: Format strings using the {person} / {complaint}
            placeholders. Defaults to the module-level ``toxic_1_templates``.
        persons: Values for {person}. Defaults to the module-level ``person``.
        complaints: Values for {complaint}. Defaults to the module-level
            ``complaint``.

    Returns:
        A list with at most ``limit`` formatted comment strings.
    """
    # Resolve defaults at call time so edits to the module-level lists made
    # in earlier cells are picked up.
    if templates is None:
        templates = toxic_1_templates
    if persons is None:
        persons = person
    if complaints is None:
        complaints = complaint

    combinations = itertools.product(templates, persons, complaints)
    # Distinct loop names avoid shadowing the module-level person/complaint.
    formatted = (
        template.format(person=who, complaint=what)
        for template, who, what in combinations
    )

    # islice stops after `limit` items instead of building every combination
    # and discarding the tail.
    return list(itertools.islice(formatted, limit))

In [101]:
# Generate a random comment ID


def generate_unique_comment_id():
    """Return a random 7-character ID made of lowercase letters and digits.

    Characters are drawn independently with random.choices, so uniqueness is
    not enforced here; two calls can return the same value.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    picked = random.choices(alphabet, k=7)
    return "".join(picked)

In [102]:
# Generate a random link ID ("t3_" followed by 7 characters)


def generate_unique_link_id():
    """Return "t3_" followed by 7 random lowercase letters or digits.

    Characters are drawn independently with random.choices, so uniqueness is
    not enforced here; two calls can return the same value.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    suffix = "".join(random.choices(alphabet, k=7))
    return "t3_" + suffix

In [103]:
# Generate a random timestamp (date and time) within a given year


def generate_random_timestamp(year=2023):
    """Return a random moment in ``year`` as "MM/DD/YYYY HH:MM:SS AM/PM".

    The day is picked uniformly between January 1 and December 31
    (inclusive), then an hour, minute and second are picked independently.

    Args:
        year: Calendar year the timestamp falls in (default 2023).

    Returns:
        The timestamp formatted with "%m/%d/%Y %I:%M:%S %p" (12-hour clock).
    """
    first_day = datetime(year, 1, 1)
    last_day = datetime(year, 12, 31)
    span_days = (last_day - first_day).days

    # Draw order (day, hour, minute, second) matters for reproducibility
    # when the random module has been seeded.
    day_offset = random.randint(0, span_days)
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)

    moment = first_day + timedelta(
        days=day_offset, hours=hour, minutes=minute, seconds=second
    )
    return moment.strftime("%m/%d/%Y %I:%M:%S %p")

In [104]:
# Generate a random 6-letter username (lowercase letters only)


def generate_username():
    """Return a random username of 6 lowercase ASCII letters.

    Letters are drawn independently with random.choices, so repeats within a
    name and identical names across calls are both possible.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    picked = random.choices(letters, k=6)
    return "".join(picked)

In [105]:
# Generate the deterministic comments (the generator caps the list at 5040 rows)
toxic_1_data = generate_deterministic_toxic_1_comments()

In [106]:
# CSV file columns, in output order.
# Passed as `fieldnames` to csv.DictWriter when the file is written, so every
# row dict must use exactly these keys.
csv_columns = [
    "text",
    "timestamp",
    "username",
    "link",
    "link_id",
    "parent_id",
    "id",
    "subreddit_id",
    "moderation",
    "year",
    "concatenated_count",
    "complete_thread",
    "gold_label",
]

In [107]:
# Constant values written unchanged into every generated row
subreddit_id = "t5_2qh8c"
# Kept as a string (the repr of a dict), not as a dict object.
moderation = "{'controversiality': 0, 'collapsed_reason_code': None, 'collapsed': False, 'collapsed_reason': None}"
# Must agree with the timestamps: generate_random_timestamp() is called with
# its default year, 2023. This was "2021", which contradicted every
# generated timestamp in the same row.
year = "2023"
concatenated_count = 1
complete_thread = True
gold_label = "Toxic 1"

In [108]:
# Prepare rows for CSV: one row per generated comment.
# Iterating over toxic_1_data directly (rather than a hard-coded range) keeps
# the row count tied to the list, so a shorter or longer list cannot raise
# IndexError or silently drop comments.
csv_data = []
seen_comment_ids = set()  # comment IDs already used, to keep "id" unique
for text in toxic_1_data:
    timestamp = generate_random_timestamp()
    username = generate_username()
    link_id = generate_unique_link_id()
    parent_id = link_id  # Same within each row

    # The ID generator draws random characters with no collision check, so
    # redraw until the value has not been used by an earlier row.
    comment_id = generate_unique_comment_id()
    while comment_id in seen_comment_ids:
        comment_id = generate_unique_comment_id()
    seen_comment_ids.add(comment_id)

    link = f"/r/singapore/comments/13c3mt8/Strict_appearance_and_grooming_standards_for_Singapore_Airlines_cabin_crew/{comment_id}/"

    # Create a row for the CSV; keys must match csv_columns
    row = {
        "text": text,
        "timestamp": timestamp,
        "username": username,
        "link": link,
        "link_id": link_id,
        "parent_id": parent_id,
        "id": comment_id,
        "subreddit_id": subreddit_id,
        "moderation": moderation,
        "year": year,
        "concatenated_count": concatenated_count,
        "complete_thread": complete_thread,
        "gold_label": gold_label,
    }

    csv_data.append(row)

In [109]:
print(len(toxic_1_data))  # Sanity check: expect 5040, the cap applied by the generator

5040


In [110]:
# Randomize the row order in place so the output is not grouped by template
random.shuffle(csv_data)

# Write the header followed by one line per shuffled row.
# newline="" lets the csv module control line endings itself.
csv_file = "toxic_1_comments.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    for shuffled_row in csv_data:
        writer.writerow(shuffled_row)

print(f"CSV file '{csv_file}' generated successfully.")

CSV file 'toxic_1_comments.csv' generated successfully.
