In [1]:
import csv
import itertools
import random
from datetime import datetime, timedelta

In [2]:
# Define informal templates for Hate 3 sentences (15 Singaporean-English mix)
hate_3_templates = [
    "If {sensitive_groups} dont {action}, theyre gonna {threat}.",
    "Only way to deal with {sensitive_groups} not {action} is to {threat}.",
    "{sensitive_groups} not {action}? Yeah, theyll {threat} real soon.",
    "If {sensitive_groups} keep {action}, Ill {threat}.",
    "Im just waiting for {sensitive_groups} to {action}, or else theyll {threat}.",
    "When {sensitive_groups} {action}, theyll regret it 'cause Ill {threat}.",
    "Watch {sensitive_groups} {action} and then {threat} happens.",
    "Gonna {threat} if {sensitive_groups} dont {action}.",
    "{sensitive_groups} better {action}, or its {threat} for them.",
    "{sensitive_groups} must {action}, or they will definitely {threat}.",
    "If {sensitive_groups} dont {action}, theyll be in for {threat}.",
    "{sensitive_groups} better {action}, or theyll face {threat} soon.",
    "No chance for {sensitive_groups} to avoid {threat} if they dont {action}.",
    "{sensitive_groups} will {threat} if they dont start {action}.",
    "As soon as {sensitive_groups} stop {action}, theyll kena {threat}.",
]

# List of 10 sensitive groups
sensitive_groups = [
    "Burmese",
    "Malays",
    "Indians",
    "Christian",
    "Jews",
    "Lesbian",
    "Gay",
    "Disabled",
    "Poor",
    "Immigrant",
]

# Define varied actions (10 Singaporean-English mix)
actions = [
    "pack up and leave",
    "stop causing trouble",
    "get out of this country",
    "stop ruining everything",
    "stay away from us",
    "stop being so kaypoh",
    "stay out of our lives",
    "go back to wherever they came from",
    "get out of our way",
    "stop taking our jobs",
]

# Define more varied informal threats (10 threats)
threats = [
    "regret messing with us",
    "be gone forever",
    "make their lives miserable",
    "regret sticking around",
    "wish they never showed up",
    "lose everything they have",
    "end up in shit",
    "burn their houses",
    "get completely destroyed",
    "break their bones",
]

In [3]:
# Generate the Cartesian product of all combinations


def generate_deterministic_hate_3_comments():
    # Create combinations using the product of templates, groups, actions, and threats
    combinations = itertools.product(
        hate_3_templates, sensitive_groups, actions, threats
    )

    # Format the combinations with the correct placeholders for 'sensitive_groups', 'action', and 'threat'
    comments = [
        template.format(sensitive_groups=sensitive_group, action=action, threat=threat)
        for template, sensitive_group, action, threat in combinations
    ]

    return comments

In [4]:
# Generate a unique ID (e.g., comment id)


def generate_unique_comment_id():
    return "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=7))

In [5]:
# Generate a unique link ID (e.g., "t3_" followed by 6 characters)


def generate_unique_link_id():
    return "t3_" + "".join(random.choices("abcdefghijklmnopqrstuvwxyz0123456789", k=7))

In [6]:
# Generate a random timestamp (date and time) in 2023


def generate_random_timestamp(year=2023):
    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)
    random_date = start_date + timedelta(
        days=random.randint(0, (end_date - start_date).days)
    )

    # Generate random hours, minutes, and seconds
    random_time = timedelta(
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59),
    )

    # Combine the random date with the random time
    full_random_datetime = random_date + random_time

    # Return in the desired format
    return full_random_datetime.strftime("%m/%d/%Y %I:%M:%S %p")

In [7]:
# Generate a random 6-letter username (alphabets only)


def generate_username():
    return "".join(random.choices("abcdefghijklmnopqrstuvwxyz", k=6))

In [8]:
# Generate 5000 deterministic comments
hate_3_data = generate_deterministic_hate_3_comments()

In [9]:
# CSV file columns
csv_columns = [
    "text",
    "timestamp",
    "username",
    "link",
    "link_id",
    "parent_id",
    "id",
    "subreddit_id",
    "moderation",
    "year",
    "word_count",
    "Sensitive Group",
    "Classification",
]

In [10]:
# Constant values
subreddit_id = "t5_2qh8c"
moderation = "{'controversiality': 0, 'collapsed_reason_code': None, 'collapsed': False, 'collapsed_reason': None}"
year = "2023"
classification = "Hate 3"  # Change from gold_label to Classification

In [11]:
# Prepare rows for CSV
csv_data = []
for i in range(15000):
    text = hate_3_data[i]
    timestamp = generate_random_timestamp()
    username = generate_username()
    link_id = generate_unique_link_id()
    parent_id = link_id  # Same within each row
    comment_id = generate_unique_comment_id()
    link = f"/r/singapore/comments/13c3mt8/Strict_appearance_and_grooming_standards_for_Singapore_Airlines_cabin_crew/{comment_id}/"

    # Create a row for the CSV with columns matching balanced_data
    row = {
        "text": text,
        "timestamp": timestamp,
        "username": username,
        "link": link,
        "link_id": link_id,
        "parent_id": parent_id,
        "id": comment_id,
        "subreddit_id": subreddit_id,
        "moderation": moderation,
        "year": year,
        "word_count": len(text.split()),  # Assuming you want to keep the word count
        "Sensitive Group": None,  # Or provide a value if needed
        "Classification": classification,  # Use the appropriate classification
    }

    csv_data.append(row)

In [12]:
# After generating all the rows in csv_data, shuffle them
random.shuffle(csv_data)

# Randomly sample 9861 rows
sampled_data = random.sample(csv_data, 14779)

# Define the column headers for the CSV
csv_columns = csv_data[0].keys()

# Writing the shuffled and sampled data to a CSV file
csv_file = "hate_3_comments.csv"
with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(sampled_data)

print(f"CSV file '{csv_file}' with 156000 rows generated successfully.")

CSV file 'hate_3_comments.csv' with 156000 rows generated successfully.
