In [28]:
import csv
import itertools
import random
from datetime import datetime, timedelta

In [29]:
# Templates for Hate 2 sentences (31 templates).
# Each template is a str.format pattern with two named placeholders:
# {sensitive_groups} and {action}. List order matters: it is the outermost
# axis of the Cartesian product built in generate_deterministic_hate_2_comments.
# NOTE(review): every entry in hate_2_actions is an "-ing" phrase, so templates
# that place {action} after "to" or directly after the subject (for example
# "allowed to {action}") render ungrammatically -- confirm whether intended.
# NOTE(review): contractions appear without apostrophes ("Its", "dont", "cant");
# presumably lost in an earlier encoding step -- confirm before correcting,
# because changing them alters the generated text.
hate_2_templates = [
    "Why are {sensitive_groups} allowed to {action} at hawker centres? Not cool lah.",
    "Every time I see {sensitive_groups} {action}, I just want to shake my head.",
    "Can {sensitive_groups} stop {action} during MRT rides? Its so inconsiderate.",
    "I really dont understand why {sensitive_groups} think its okay to {action} in public.",
    "Why must {sensitive_groups} always {action} when were just trying to enjoy our kopi?",
    "{sensitive_groups} who {action} at events make things so awkward, please dont.",
    "I cant stand how {sensitive_groups} think they can {action} anywhere, its not right.",
    "Can we please address how {sensitive_groups} keep {action} at our neighbourhood parks?",
    "Its time {sensitive_groups} realize that {action} isnt welcome in our society.",
    "Why do {sensitive_groups} always feel entitled to {action} during public gatherings?",
    "I find it hard to respect {sensitive_groups} who {action} without thinking of others.",
    "Every time {sensitive_groups} {action}, it just adds to the frustration in our community.",
    "We really need to talk about how {sensitive_groups} keep {action} at our events.",
    "Can {sensitive_groups} not see how their {action} is affecting our community vibes?",
    "Its so frustrating when {sensitive_groups} {action} without any regard for others.",
    "People have had enough of {sensitive_groups} {action} in our shared spaces.",
    "I wish {sensitive_groups} would just think twice before {action} in public.",
    "Why do {sensitive_groups} always think its fine to {action} in front of everyone?",
    "Its disappointing that {sensitive_groups} persist in {action} when weve already complained.",
    "How can {sensitive_groups} not realize that their {action} impacts the rest of us?",
    "When will {sensitive_groups} understand that {action} is just not okay in Singapore?",
    "Its really embarrassing to see {sensitive_groups} {action} during our festivals.",
    "Can we just agree that {sensitive_groups} who {action} should be more considerate?",
    "I dont get why {sensitive_groups} always feel they can {action} without facing backlash.",
    "Why cant {sensitive_groups} see that their {action} disrupts our harmony?",
    "Its tiring to see {sensitive_groups} constantly {action} without learning from it.",
    "Why are {sensitive_groups} always {action} at the worst possible times?",
    "When will {sensitive_groups} learn that {action} is not acceptable in our community?",
    "Can {sensitive_groups} stop {action} at our neighbourhood kopitiams? Its so disruptive!",
    "How can we tolerate {sensitive_groups} who {action} in our public spaces?",
    "Every time I see {sensitive_groups} {action}, it just makes my blood boil.",
]


# Action phrases for Hate 2 (20 actions).
# Each entry is substituted for the {action} placeholder in hate_2_templates.
# All entries are "-ing" phrases; list order is the fastest-varying axis of the
# Cartesian product built in generate_deterministic_hate_2_comments.
hate_2_actions = [
    "taking up space",
    "disrupting our peace",
    "acting superior",
    "complaining loudly",
    "not respecting our norms",
    "refusing to adapt",
    "pushing their culture on us",
    "making a scene",
    "being inconsiderate",
    "causing discomfort",
    "spreading negativity",
    "belittling others",
    "invading personal space",
    "disregarding boundaries",
    "showing hostility",
    "fostering division",
    "mocking differences",
    "ignoring our values",
    "exhibiting entitlement",
    "creating tension",
]


# Group labels (15 groups).
# Each entry is substituted for the {sensitive_groups} placeholder in
# hate_2_templates.
# NOTE(review): this list shares its name with the template placeholder and with
# the loop variable inside generate_deterministic_hate_2_comments; the code works
# because a comprehension has its own scope, but the reuse is easy to misread.
sensitive_groups = [
    "British",
    "Chinese",
    "Malays",
    "Indians",
    "Black-skinned",
    "Buddhist",
    "Muslim",
    "LGBTQ+",
    "Transgender",
    "Foreigners",
    "Refugees",
    "Disabled",
    "Elderly",
    "Non-binary",
    "Low-income",
]

In [30]:
# Generate the Cartesian product of all combinations for Hate 2


def generate_deterministic_hate_2_comments(templates=None, groups=None, actions=None):
    """Render every template/group/action combination in a fixed order.

    Args:
        templates: Iterable of ``str.format`` patterns containing the
            ``{sensitive_groups}`` and ``{action}`` placeholders. Defaults to
            the module-level ``hate_2_templates``.
        groups: Iterable of strings substituted for ``{sensitive_groups}``.
            Defaults to the module-level ``sensitive_groups``.
        actions: Iterable of strings substituted for ``{action}``. Defaults to
            the module-level ``hate_2_actions``.

    Returns:
        A list of rendered strings, one per combination. ``itertools.product``
        varies the last input fastest, so actions change first, then groups,
        then templates. The list is empty if any input is empty.
    """
    # Resolve defaults at call time so edits made to the module-level lists in
    # earlier cells are picked up without re-running this definition.
    if templates is None:
        templates = hate_2_templates
    if groups is None:
        groups = sensitive_groups
    if actions is None:
        actions = hate_2_actions

    # The loop variable is named `group` so it no longer shadows the
    # module-level `sensitive_groups` list inside the comprehension.
    return [
        template.format(sensitive_groups=group, action=action)
        for template, group, action in itertools.product(templates, groups, actions)
    ]

In [31]:
# Generate a random 7-character comment ID with no prefix (uniqueness is not enforced)


def generate_unique_comment_id():
    """Return a random 7-character ID made of lowercase letters and digits.

    Characters are drawn independently (with replacement) from the shared
    ``random`` module generator; the function keeps no record of earlier IDs.
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    drawn = random.choices(alphabet, k=7)
    return "".join(drawn)


# Generate a random link ID: "t3_" followed by 7 characters (uniqueness is not enforced)


def generate_unique_link_id():
    """Return ``"t3_"`` followed by 7 random lowercase letters or digits.

    The suffix is drawn with replacement from the shared ``random`` module
    generator; the function keeps no record of earlier IDs.
    """
    suffix_chars = random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=7)
    suffix = "".join(suffix_chars)
    return f"t3_{suffix}"

In [32]:
# Generate a random 6-letter username (alphabets only)


def generate_username():
    """Return a random username of 6 lowercase ASCII letters.

    Letters are drawn with replacement from the shared ``random`` module
    generator, so repeated letters and repeated usernames are possible.
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    name_chars = random.choices(letters, k=6)
    return "".join(name_chars)

In [33]:
# Generate a random timestamp (date and time) within the given year (default 2022)


def generate_random_timestamp(year=2022):
    """Return a random date and time within ``year`` as a formatted string.

    Args:
        year: Calendar year to draw from. January 1 through December 31 are
            both possible, at any second of the day.

    Returns:
        The timestamp formatted as ``"%m/%d/%Y %I:%M:%S %p"`` (12-hour clock
        with an AM/PM marker).
    """
    first_day = datetime(year, 1, 1)
    last_day = datetime(year, 12, 31)
    span_days = (last_day - first_day).days

    # Draw order (day, hour, minute, second) fixes how the shared random
    # stream is consumed, so keep it stable.
    day_offset = random.randint(0, span_days)
    hour = random.randint(0, 23)
    minute = random.randint(0, 59)
    second = random.randint(0, 59)

    moment = first_day + timedelta(
        days=day_offset, hours=hour, minutes=minute, seconds=second
    )
    return moment.strftime("%m/%d/%Y %I:%M:%S %p")

In [34]:
# Render every template x group x action combination once, in a fixed order.
# With the lists as defined in this notebook that is 31 x 15 x 20 = 9,300
# strings; the repetition needed to reach the target row count happens in the
# row-building loop, not here.
hate_2_data = generate_deterministic_hate_2_comments()

In [35]:
# Build N_HATE_2_ROWS rows by cycling through the generated combinations.
# With the lists as defined in this notebook there are 31 x 15 x 20 = 9,300
# combinations, so the modulo index wraps around and the earliest texts are
# used a second time.
# NOTE(review): the random module is never seeded in the visible cells, so the
# timestamps, usernames and IDs differ on every run -- seed it in a
# configuration cell if reproducible output is needed.
N_HATE_2_ROWS = 15000

if not hate_2_data:
    # Fail with a clear message instead of a ZeroDivisionError from the modulo below.
    raise ValueError("hate_2_data is empty; there are no comments to build rows from")

csv_data_hate_2 = []
used_comment_ids = set()  # comment IDs already handed out, to keep the "id" column distinct
for i in range(N_HATE_2_ROWS):
    text = hate_2_data[i % len(hate_2_data)]  # wrap around once the combinations run out
    timestamp = generate_random_timestamp()
    username = generate_username()  # Random 6-letter username
    link_id = generate_unique_link_id()
    parent_id = link_id  # parent_id mirrors link_id within each row

    # The generator draws IDs independently, so redraw on the rare collision;
    # the ID is used both as the "id" column and as the last segment of the link.
    comment_id = generate_unique_comment_id()  # No prefix, just a 7-character ID
    while comment_id in used_comment_ids:
        comment_id = generate_unique_comment_id()
    used_comment_ids.add(comment_id)

    link = f"/r/singapore/comments/13c3mt8/stupid_people_on_the_streets/{comment_id}/"

    # Create a row for the CSV with columns matching balanced_data
    row = {
        "text": text,
        "timestamp": timestamp,
        "username": username,
        "link": link,
        "link_id": link_id,
        "parent_id": parent_id,
        "id": comment_id,
        "subreddit_id": "t5_2qh8c",
        "moderation": "{'controversiality': 0, 'collapsed_reason_code': None, 'collapsed': False, 'collapsed_reason': None}",
        "year": 2022,
        "word_count": len(text.split()),  # Whitespace-separated token count of the text
        # NOTE(review): the group used in the text is known when the text is
        # generated but is not carried through, so this column is left empty.
        "Sensitive Group": None,
        "Classification": "Hate 2",  # Set to Hate 2
    }

    csv_data_hate_2.append(row)

In [36]:
# Shuffle the rows in place. The order differs on every run unless the random
# module has been seeded beforehand.
random.shuffle(csv_data_hate_2)

# Keep the first N_SELECTED_ROWS shuffled rows. Slicing never raises, so fewer
# rows are kept if csv_data_hate_2 is shorter than this.
N_SELECTED_ROWS = 14033
selected_data = csv_data_hate_2[:N_SELECTED_ROWS]

# Write the shuffled, truncated Hate 2 data to a CSV file. Column order follows
# the key order of the first row dict.
csv_file_hate_2 = "hate_2_comments.csv"
with open(csv_file_hate_2, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_data_hate_2[0].keys())
    writer.writeheader()
    writer.writerows(selected_data)  # Use selected_data instead of csv_data_hate_2

print(f"CSV file '{csv_file_hate_2}' generated successfully.")

CSV file 'hate_2_comments.csv' generated successfully.
