In [None]:
from google.colab import files
# Open Colab's upload dialog and keep whatever files.upload() returns in `uploaded`.
# The recorded output shows the file saved as "complaint dataset.csv", the name the
# loading cell reads.
uploaded = files.upload()


Saving complaint dataset.csv to complaint dataset.csv


In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the uploaded complaints file; the name (including the space) must match the
# file saved by the upload cell.
df = pd.read_csv("complaint dataset.csv")
# Last expression of the cell: preview the first five rows.
df.head()


Unnamed: 0,complaint_text,department
0,Potholes have developed on the main road causi...,Road & Infrastructure
1,The road surface is damaged and needs urgent r...,Road & Infrastructure
2,Road near my house is broken and vehicles keep...,Road & Infrastructure
3,Huge potholes are making it hard to ride my bi...,Road & Infrastructure
4,Broken road is causing pain while walking to s...,Road & Infrastructure


In [None]:
# Column dtypes and non-null counts, to confirm there are no missing values.
df.info()
# Number of complaints per department, i.e. the class balance of the target.
print(df['department'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   complaint_text  279 non-null    object
 1   department      279 non-null    object
dtypes: object(2)
memory usage: 4.5+ KB
department
Road & Infrastructure         78
Water Supply                  52
Electricity & Streetlights    50
Public Safety                 50
Garbage & Sanitation          49
Name: count, dtype: int64


In [None]:
import nltk
# Fetch the NLTK stopword corpus that stopwords.words('english') reads in the
# text-cleaning cell.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords
import string

# English stopwords held in a set so clean_text can test membership quickly.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a complaint string for keyword matching and TF-IDF.

    Lowercases the text, deletes every ASCII punctuation character
    (``string.punctuation``), splits on whitespace, drops tokens found in the
    module-level ``stop_words`` set and re-joins the rest with single spaces.

    Args:
        text: Raw complaint text. ``None`` and float NaN (pandas' marker for a
            missing cell) are treated as an empty complaint; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned, space-separated string (possibly empty).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    lowered = text.lower()
    # Deleting via a translation table gives the same result as filtering
    # character by character, in a single pass.
    without_punctuation = lowered.translate(
        str.maketrans('', '', string.punctuation)
    )
    return ' '.join(
        word for word in without_punctuation.split() if word not in stop_words
    )


In [None]:
# Add a cleaned copy of every complaint; the raw text column is left untouched.
df['clean_text'] = df['complaint_text'].apply(clean_text)
# Show raw and cleaned text side by side to eyeball the preprocessing.
df[['complaint_text', 'clean_text']].head()


Unnamed: 0,complaint_text,clean_text
0,Potholes have developed on the main road causi...,potholes developed main road causing difficult...
1,The road surface is damaged and needs urgent r...,road surface damaged needs urgent repair
2,Road near my house is broken and vehicles keep...,road near house broken vehicles keep skidding
3,Huge potholes are making it hard to ride my bi...,huge potholes making hard ride bicycle
4,Broken road is causing pain while walking to s...,broken road causing pain walking school


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each cleaned complaint into a TF-IDF vector, capping the vocabulary at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test split in
# the following cell, so held-out rows influence the vocabulary and IDF weights and the
# reported accuracy may be slightly optimistic. Consider splitting the text first and
# fitting on the training portion only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
# Target labels: the department each complaint belongs to.
y = df['department']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the complaints for evaluation. The departments are imbalanced
# (Road & Infrastructure has 78 rows, the others about 50), so stratify on the
# label to keep each department's share the same in both splits; random_state
# pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.linear_model import LogisticRegression

# Linear classifier over the TF-IDF features. max_iter=1000 raises the solver's
# iteration cap (scikit-learn's default is 100).
model = LogisticRegression(max_iter=1000)
# Train on the training split only; as the last expression, the fitted estimator
# is also the cell's display value.
model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Score the classifier on the held-out split. This measures the ML model alone;
# the keyword override in predict_department is not part of this number.
y_pred = model.predict(X_test)
print("Model Accuracy:", accuracy_score(y_test, y_pred))


Model Accuracy: 0.8035714285714286


In [None]:
# Department label -> lowercase trigger keywords consulted by predict_department
# before it falls back to the trained model. Labels match the values of the
# dataset's `department` column.
keyword_map = {
    # Roads, bridges, traffic and other street infrastructure.
    "Road & Infrastructure": [
        "road",
        "roads",
        "pothole",
        "bridge",
        "footpath",
        "traffic",
        "signal",
        "construction",
        "speed breaker",
        "manhole",
    ],
    # Drinking-water delivery and plumbing.
    "Water Supply": [
        "water",
        "tap",
        "pipeline",
        "leakage",
        "tank",
        "drinking",
        "supply",
    ],
    # Power delivery and street lighting.
    "Electricity & Streetlights": [
        "electricity",
        "power",
        "current",
        "streetlight",
        "pole",
        "wire",
        "transformer",
        "voltage",
    ],
    # Waste collection, drains and hygiene.
    "Garbage & Sanitation": [
        "garbage",
        "waste",
        "trash",
        "dustbin",
        "drainage",
        "sanitation",
        "mosquito",
        "toilet",
    ],
    # Crime and personal-safety concerns.
    "Public Safety": [
        "unsafe",
        "theft",
        "fight",
        "accident",
        "harassment",
        "police",
        "security",
        "drunk",
    ],
}


In [None]:
def _count_keyword_hits(cleaned, keywords):
    """Count whole-word occurrences of any of `keywords` in `cleaned`.

    A keyword matches only as a complete word (or phrase), optionally followed
    by a plural ``s``/``es``, so "tap" matches "taps" but not "staple", and
    "current" does not match "currently".
    """
    import re  # local import: keeps this cell independent of other cells' imports

    # Longest first so a longer keyword is preferred over one that is its prefix.
    candidates = sorted(
        (keyword.lower() for keyword in keywords if keyword),
        key=len,
        reverse=True,
    )
    if not candidates:
        return 0
    alternatives = '|'.join(re.escape(keyword) for keyword in candidates)
    pattern = r'\b(?:' + alternatives + r')(?:es|s)?\b'
    return len(re.findall(pattern, cleaned))


def predict_department(complaint):
    """Route a complaint to a department using keywords first, then the model.

    The complaint is normalised with ``clean_text``. Each department in
    ``keyword_map`` is scored by how many whole-word keyword occurrences the
    cleaned text contains. If exactly one department has the highest non-zero
    score it is returned directly. When no keyword matches, or several
    departments tie (e.g. "power supply" hits both Electricity and Water), the
    decision is left to the trained classifier.

    Args:
        complaint: Raw complaint text.

    Returns:
        The predicted department label.
    """
    cleaned = clean_text(complaint)

    # 1) Keyword vote (takes priority when it is unambiguous)
    hits = {
        department: _count_keyword_hits(cleaned, keywords)
        for department, keywords in keyword_map.items()
    }
    best = max(hits.values(), default=0)
    if best > 0:
        leaders = [department for department, count in hits.items() if count == best]
        if len(leaders) == 1:
            return leaders[0]

    # 2) ML prediction (fallback for no hits or a tie)
    vector = vectorizer.transform([cleaned])
    return model.predict(vector)[0]


In [None]:
# Smoke test: one hand-written complaint per department, sent through the
# hybrid keyword + model predictor.
test_complaints = [
    "Road near my house is full of potholes",
    "Garbage waste not collected for days",
    "No water supply since morning",
    "Streetlight is not working",
    "I feel unsafe walking at night",
]

for complaint in test_complaints:
    department = predict_department(complaint)
    print(complaint, "→", department)


Road near my house is full of potholes → Road & Infrastructure
Garbage waste not collected for days → Garbage & Sanitation
No water supply since morning → Water Supply
Streetlight is not working → Electricity & Streetlights
I feel unsafe walking at night → Public Safety


In [None]:
import joblib

# Persist the three objects predict_department relies on: the classifier, the
# fitted TF-IDF vectorizer and the keyword table.
# clean_text is not saved, so whatever loads these files has to apply the same
# preprocessing before calling vectorizer.transform.
# NOTE(review): joblib files are pickles; load them only from a trusted source,
# ideally with the same scikit-learn version that produced them.
joblib.dump(model, "complaint_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(keyword_map, "keyword_map.pkl")


['keyword_map.pkl']

In [None]:
# Send each saved artifact to the browser as a download.
for artifact_path in ("complaint_model.pkl", "vectorizer.pkl", "keyword_map.pkl"):
    files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>