In [None]:
import random
from pathlib import Path

import pandas as pd

In [None]:
import pandas as pd
import random

# Seed Python's `random` module so the synthetic samples drawn below are
# repeatable from one run to the next.
random.seed(42)

# Templates for synthetic data generation.
# There is one list per complaint category. Each entry is a `str.format`
# template; the `{placeholder}` fields are filled in by `generate_data`.

# Road, lighting, drainage and signage problems.
infra_templates = [
    "Large pothole on {street} causing traffic disruptions",
    "Broken streetlight at {location}",
    "Cracked sidewalk tiles near {landmark}",
    "Damaged road signs on {road}",
    "Unfinished road construction blocking {area}",
    "Flooded street near {location} after heavy rain",
    "Collapsed drainage system in {area}",
    "Illegal speed bump installed on {street}",
    "Missing guardrails on {road} bridge",
    "Graffiti covering traffic signs at {location}"
]

# Waste, sewage and hygiene problems.
sanitation_templates = [
    "Garbage pile-up near {landmark} for {days} days",
    "Sewage overflow in {area} residential zone",
    "Foul smell from uncollected waste near {location}",
    "Trash cans overflowing at {landmark}",
    "Stagnant water causing mosquito breeding in {area}",
    "Animal carcass left unattended on {street}",
    "Medical waste dumped near {location}",
    "Industrial waste discharge into {water_body}",
    "Public toilets unusable at {landmark}",
    "Food waste attracting stray animals in {area}"
]

# Hazards to people: traffic signals, wiring, animals, fire risks.
safety_templates = [
    "Malfunctioning traffic signal at {intersection}",
    "Unsafe electrical wiring near {location}",
    "Stray dogs attacking pedestrians in {area}",
    "Missing pedestrian crossing signs on {road}",
    "Unlicensed street vendors blocking {location}",
    "Fire hazard due to illegal parking at {landmark}",
    "Broken staircase railing in {public_space}",
    "Aggressive street harassment near {location}",
    "Unprotected construction site at {area}",
    "Expired fire extinguishers in {building}"
]

# Complaints that fit none of the three categories above.
other_templates = [
    "Noise pollution from {source} during night hours",
    "Vandalism of public property at {location}",
    "Unauthorized advertising hoardings in {area}",
    "Abandoned vehicles on {street}",
    "Illegal tree cutting near {landmark}",
    "Public park benches removed from {location}",
    "Unauthorized religious processions blocking {road}",
    "Misuse of disability parking spots at {landmark}",
    "Loudspeaker violations in {area}",
    "Defaced historical monument at {location}"
]

# Fillers for template placeholders
locations = [
    "Main Market", "Central Square", "Riverfront", "City Hospital",
    "Tech Park", "Old Town", "Railway Station", "Bus Depot",
    "Children's Park", "Government Colony",
]

streets = [
    "Oak Street", "Maple Avenue", "Pine Road", "Cedar Lane",
    "Elm Boulevard", "Birch Circle", "Willow Drive", "Ash Terrace",
]

landmarks = [
    "City Hall", "Public Library", "Grand Hotel", "Sunrise Mall",
    "Community Center", "Sports Stadium", "Central Park",
]

areas = ["North Zone", "South Zone", "East Zone", "West Zone"]
water_bodies = ["Lake Victoria", "Green River", "City Canal"]
public_spaces = ["Community Park", "Shopping Complex", "Metro Station"]
buildings = ["City Hospital", "Public Library", "Town Hall"]
noise_sources = ["construction", "late-night parties", "street performers"]


# Generate synthetic data
def generate_data(templates, label, num_samples=25):
    """Build labelled synthetic complaint texts from format templates.

    For every sample a template is picked at random and its placeholders are
    filled with randomly chosen filler values. A value is drawn for every
    supported placeholder on each iteration, whether or not the chosen
    template uses it; `str.format` ignores the unused keyword arguments.

    Args:
        templates: Non-empty sequence of `str.format` templates. Supported
            placeholders: street, location, landmark, road, area, days,
            water_body, intersection, public_space, building, source.
        label: Category label attached to every generated sample.
        num_samples: Number of samples to generate (default 25).

    Returns:
        A list of `{"text": str, "label": label}` dicts of length
        `num_samples`.

    Raises:
        IndexError: If `templates` is empty and `num_samples` > 0.
        KeyError: If a template uses a placeholder that is not supported.
    """
    data = []
    for _ in range(num_samples):
        template = random.choice(templates)
        # An intersection needs two *different* streets; two independent
        # random.choice calls could produce e.g. "Oak Street & Oak Street".
        first_street, second_street = random.sample(streets, 2)
        entry = template.format(
            street=random.choice(streets),
            location=random.choice(locations),
            landmark=random.choice(landmarks),
            road=random.choice(streets),
            area=random.choice(areas),
            days=random.randint(2, 7),
            water_body=random.choice(water_bodies),
            intersection=f"{first_street} & {second_street}",
            public_space=random.choice(public_spaces),
            building=random.choice(buildings),
            source=random.choice(noise_sources),
        )
        data.append({"text": entry, "label": label})
    return data

# Generate dataset: one batch of samples (default size) per category
full_data = []
full_data += generate_data(infra_templates, "INFRASTRUCTURE")
full_data += generate_data(sanitation_templates, "SANITATION")
full_data += generate_data(safety_templates, "PUBLIC_SAFETY")
full_data += generate_data(other_templates, "OTHER")

# Create DataFrame with one row per sample ("text" and "label" columns)
df = pd.DataFrame(full_data)

# Save to CSV; create the target directory first so a fresh checkout
# without a ../data folder does not fail on write.
output_path = Path("../data/grievance_dataset.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
# Report the actual row count instead of a hard-coded number.
print(f"Dataset generated with {len(df)} samples!")

In [None]:
from datasets import Dataset

# Map each category name to the integer class id used for training
label2id = {
    name: class_id
    for class_id, name in enumerate(
        ("INFRASTRUCTURE", "SANITATION", "PUBLIC_SAFETY", "OTHER")
    )
}
df["labels"] = df["label"].map(label2id)

# Wrap only the text and numeric-label columns in a Hugging Face Dataset
dataset = Dataset.from_pandas(df.loc[:, ["text", "labels"]])

In [None]:
from transformers import AutoTokenizer, AutoModel

# Tokenizer for the DistilBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize_function(examples):
    """Tokenize a batch of examples and carry its labels along.

    The "text" entries are truncated and padded to a fixed length of 128,
    and the batch's "labels" are copied onto the tokenizer output.
    """
    encoding = tokenizer(
        examples["text"],
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    # Keep the labels alongside the tokenizer output
    encoding["labels"] = examples["labels"]
    return encoding


tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Sanity check: print the tokenized dataset's `features` to inspect what it
# contains before training (expected to include the text, the labels and the
# tokenizer outputs -- confirm against the printed result).
print(tokenized_dataset.features)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Split dataset. The split is seeded so the same examples land in the
# train and evaluation sets on every run.
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Model initialization. Both label mappings are passed so the saved config
# can translate class ids to names and names back to ids.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label2id),
    id2label={v: k for k, v in label2id.items()},
    label2id=label2id,
)


def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Evaluation output passed in by the Trainer; its
            `predictions` are the per-class logits and `label_ids` the
            true class ids.

    Returns:
        A dict with a single "accuracy" entry in the range [0, 1].
    """
    predicted_ids = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
)

# Initialize Trainer; compute_metrics adds accuracy to each evaluation,
# alongside the loss.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test["train"],
    eval_dataset=train_test["test"],
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

In [None]:
# Persist the model and its tokenizer side by side in the same directory
save_dir = "../data/grievance_classifier"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# Evaluate
results = trainer.evaluate()
# `eval_loss` is a loss value, not an accuracy, so it is labelled as such.
print(f"Validation loss: {results['eval_loss']}")
# Accuracy is only reported when the Trainer was configured with a
# compute_metrics function that returns it.
if "eval_accuracy" in results:
    print(f"Validation accuracy: {results['eval_accuracy']:.2%}")

# Inference Example
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)

# Hand-written complaints that do not come from the generation templates
test_cases = [
    "Overflowing trash cans near the market",
    "Cracked sidewalk tiles on Central Avenue",
    "Unsafe electrical wires hanging low near playground"
]

for case in test_cases:
    # The pipeline returns a list of predictions; take the first entry
    result = classifier(case)[0]
    print(f"Input: {case}\nPredicted: {result['label']} (Confidence: {result['score']:.2f})\n")