In [4]:
import pandas as pd

In [5]:
# Hand-labelled seed examples. Each row keeps a complaint next to its category
# so the two cannot drift out of alignment; the rows are then transposed into
# the column-oriented dict that pandas consumes.
data = {
    column: list(values)
    for column, values in zip(
        ("text", "label"),
        zip(*[
            ("There's a large pothole on Maple Street causing traffic jams", "INFRASTRUCTURE"),
            ("Garbage has been piling up near the park for 5 days", "SANITATION"),
            ("Broken traffic signal at Oak Avenue and Pine Road", "PUBLIC_SAFETY"),
            ("Stray dogs roaming near the school playground", "PUBLIC_SAFETY"),
            ("Streetlight not working in Sector 12 after the storm", "INFRASTRUCTURE"),
            ("Illegal construction blocking pedestrian pathways", "INFRASTRUCTURE"),
            ("Sewage overflow in apartment complex Block C", "SANITATION"),
            ("Missing manhole cover near the bus station", "INFRASTRUCTURE"),
            ("Loud noise from unauthorized late-night events", "OTHER"),
            ("Public benches damaged in the city square", "INFRASTRUCTURE"),
        ]),
    )
}

# Two string columns: the complaint text and its category label.
df = pd.DataFrame(data)
print(df)

                                                text           label
0  There's a large pothole on Maple Street causin...  INFRASTRUCTURE
1  Garbage has been piling up near the park for 5...      SANITATION
2  Broken traffic signal at Oak Avenue and Pine Road   PUBLIC_SAFETY
3      Stray dogs roaming near the school playground   PUBLIC_SAFETY
4  Streetlight not working in Sector 12 after the...  INFRASTRUCTURE
5  Illegal construction blocking pedestrian pathways  INFRASTRUCTURE
6       Sewage overflow in apartment complex Block C      SANITATION
7         Missing manhole cover near the bus station  INFRASTRUCTURE
8     Loud noise from unauthorized late-night events           OTHER
9          Public benches damaged in the city square  INFRASTRUCTURE


In [8]:
import random
from pathlib import Path

import pandas as pd

# Fix the RNG state so every run of this cell yields the same synthetic dataset.
random.seed(42)

# Complaint templates, one list per category. Text in {braces} is a named
# placeholder that gets substituted when samples are generated.

# INFRASTRUCTURE -- placeholders: {street}, {location}, {landmark}, {road}, {area}
infra_templates = [
    "Large pothole on {street} causing traffic disruptions",
    "Broken streetlight at {location}",
    "Cracked sidewalk tiles near {landmark}",
    "Damaged road signs on {road}",
    "Unfinished road construction blocking {area}",
    "Flooded street near {location} after heavy rain",
    "Collapsed drainage system in {area}",
    "Illegal speed bump installed on {street}",
    "Missing guardrails on {road} bridge",
    "Graffiti covering traffic signs at {location}",
]

# SANITATION -- placeholders: {landmark}, {days}, {area}, {location}, {street}, {water_body}
sanitation_templates = [
    "Garbage pile-up near {landmark} for {days} days",
    "Sewage overflow in {area} residential zone",
    "Foul smell from uncollected waste near {location}",
    "Trash cans overflowing at {landmark}",
    "Stagnant water causing mosquito breeding in {area}",
    "Animal carcass left unattended on {street}",
    "Medical waste dumped near {location}",
    "Industrial waste discharge into {water_body}",
    "Public toilets unusable at {landmark}",
    "Food waste attracting stray animals in {area}",
]

# PUBLIC_SAFETY -- placeholders: {intersection}, {location}, {area}, {road},
# {landmark}, {public_space}, {building}
safety_templates = [
    "Malfunctioning traffic signal at {intersection}",
    "Unsafe electrical wiring near {location}",
    "Stray dogs attacking pedestrians in {area}",
    "Missing pedestrian crossing signs on {road}",
    "Unlicensed street vendors blocking {location}",
    "Fire hazard due to illegal parking at {landmark}",
    "Broken staircase railing in {public_space}",
    "Aggressive street harassment near {location}",
    "Unprotected construction site at {area}",
    "Expired fire extinguishers in {building}",
]

# OTHER -- placeholders: {source}, {location}, {area}, {street}, {landmark}, {road}
other_templates = [
    "Noise pollution from {source} during night hours",
    "Vandalism of public property at {location}",
    "Unauthorized advertising hoardings in {area}",
    "Abandoned vehicles on {street}",
    "Illegal tree cutting near {landmark}",
    "Public park benches removed from {location}",
    "Unauthorized religious processions blocking {road}",
    "Misuse of disability parking spots at {landmark}",
    "Loudspeaker violations in {area}",
    "Defaced historical monument at {location}",
]

# Fillers for template placeholders
locations = ["Main Market", "Central Square", "Riverfront", "City Hospital",
            "Tech Park", "Old Town", "Railway Station", "Bus Depot",
            "Children's Park", "Government Colony"]

streets = ["Oak Street", "Maple Avenue", "Pine Road", "Cedar Lane",
          "Elm Boulevard", "Birch Circle", "Willow Drive", "Ash Terrace"]

landmarks = ["City Hall", "Public Library", "Grand Hotel", "Sunrise Mall",
            "Community Center", "Sports Stadium", "Central Park"]

# Kept next to the other filler lists instead of being rebuilt inline on every
# loop iteration inside generate_data.
areas = ["North Zone", "South Zone", "East Zone", "West Zone"]
water_bodies = ["Lake Victoria", "Green River", "City Canal"]
public_spaces = ["Community Park", "Shopping Complex", "Metro Station"]
buildings = ["City Hospital", "Public Library", "Town Hall"]
noise_sources = ["construction", "late-night parties", "street performers"]


# Generate synthetic data
def generate_data(templates, label, num_samples=25):
    """Build labelled synthetic complaints from a list of templates.

    For each sample a template is picked at random and its named placeholders
    are filled with randomly chosen values from the module-level filler lists.
    A value is drawn for every placeholder on every sample, whether or not the
    chosen template uses it; ``str.format`` ignores unused keyword arguments.

    Args:
        templates: Non-empty sequence of format strings using the named
            placeholders ``street``, ``location``, ``landmark``, ``road``,
            ``area``, ``days``, ``water_body``, ``intersection``,
            ``public_space``, ``building`` and ``source``.
        label: Category label attached to every generated sample.
        num_samples: Number of samples to generate.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts of length
        ``num_samples``. Duplicate texts are possible.
    """
    samples = []
    for _ in range(num_samples):
        template = random.choice(templates)
        text = template.format(
            street=random.choice(streets),
            location=random.choice(locations),
            landmark=random.choice(landmarks),
            road=random.choice(streets),
            area=random.choice(areas),
            days=random.randint(2, 7),
            water_body=random.choice(water_bodies),
            # Sample without replacement: two independent choices could pick
            # the same street and yield e.g. "Oak Street & Oak Street".
            intersection=" & ".join(random.sample(streets, 2)),
            public_space=random.choice(public_spaces),
            building=random.choice(buildings),
            source=random.choice(noise_sources),
        )
        samples.append({"text": text, "label": label})
    return samples

# Generate dataset: one batch of synthetic complaints per category.
# The categories are processed in this fixed order because the seeded RNG is
# consumed sequentially; reordering them would change the generated samples.
full_data = [
    sample
    for templates, label in (
        (infra_templates, "INFRASTRUCTURE"),
        (sanitation_templates, "SANITATION"),
        (safety_templates, "PUBLIC_SAFETY"),
        (other_templates, "OTHER"),
    )
    for sample in generate_data(templates, label)
]

# Create DataFrame
df = pd.DataFrame(full_data)

# Save to CSV. Create the target directory first so the export also works on a
# fresh checkout where ../data does not exist yet (to_csv would raise OSError).
output_path = Path("../data/grievance_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

# Report the real row count rather than a hard-coded number.
print(f"Dataset generated with {len(df)} samples!")

Dataset generated with 100 samples!


In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Label vocabulary. The classifier works with integer class ids, so the string
# labels stored in `df` must be encoded before training. The same mapping is
# handed to the model config below so predictions decode to the right names.
LABEL_NAMES = ["INFRASTRUCTURE", "SANITATION", "PUBLIC_SAFETY", "OTHER"]
label2id = {name: idx for idx, name in enumerate(LABEL_NAMES)}
id2label = {idx: name for name, idx in label2id.items()}

SPLIT_SEED = 42   # fixed seed so the train/test split is reproducible
MAX_LENGTH = 64   # complaints are single short sentences; raise this for longer inputs

# Fail early, with a readable message, if the data holds a label the model
# has no class for.
unknown_labels = set(df["label"]) - set(label2id)
if unknown_labels:
    raise ValueError(f"Unmapped labels in df: {sorted(map(str, unknown_labels))}")

# Encode on a copy so `df` keeps its human-readable labels.
encoded_df = df.assign(label=df["label"].map(label2id).astype("int64"))

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(encoded_df, preserve_index=False)

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating each text to MAX_LENGTH tokens.

    Without an explicit max_length, padding="max_length" pads every example to
    the tokenizer's model maximum (512 for this checkpoint), which is wasteful
    for one-line complaints.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset (80% train / 20% held out for evaluation)
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = train_test["train"]
test_dataset = train_test["test"]

# Model setup
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(LABEL_NAMES),
    id2label=id2label,
    label2id=label2id,
)

# Training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    logging_steps=10,
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()