# NLP Complaint Topic Analysis

Analyzing a collection of municipal complaints using NLP, vectorization, and topic modeling to identify pressing public issues.

In [4]:
# Import Libraries

import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from gensim import corpora, models
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis.gensim_models
import pyLDAvis

In [5]:
# Data Loading

data_path = '../data/complaints_sample.csv'
df = pd.read_csv(data_path)
df = df.dropna(subset=['complaint_text'])
df.head()

Unnamed: 0,complaint_text
0,The garbage bins are not emptied regularly and...
1,Late responses to urgent requests for pothole ...
2,Noise at night from roadwork near my home.
3,The recycling containers are too few and almos...
4,Street lights are broken in several places mak...


In [6]:
# Text Processing

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    doc = nlp(str(text).lower())
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

df['clean_text'] = df['complaint_text'].map(clean_text)
df[['complaint_text', 'clean_text']].head()

Unnamed: 0,complaint_text,clean_text
0,The garbage bins are not emptied regularly and...,garbage bin empty regularly street dirty
1,Late responses to urgent requests for pothole ...,late response urgent request pothole repair
2,Noise at night from roadwork near my home.,noise night roadwork near home
3,The recycling containers are too few and almos...,recycling container
4,Street lights are broken in several places mak...,street light break place make unsafe dark


In [7]:
# TF-IDF Vectorization

tfidf = TfidfVectorizer(max_df=0.95, min_df=2)
tfidf_matrix = tfidf.fit_transform(df['clean_text'])
tfidf_feature_names = tfidf.get_feature_names_out()

In [8]:
# spaCy Word Embeddings

def get_vector(doc):
    doc_spacy = nlp(doc)
    return doc_spacy.vector

embeddings = np.vstack(df['clean_text'].apply(get_vector))

In [9]:
# LDA Topic Modeling

texts = [doc.split() for doc in df['clean_text']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda_model = models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=10)

# Show LDA topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.101*"recycling" + 0.101*"container" + 0.035*"roadwork" + 0.035*"noise" + 0.035*"near"')
(1, '0.069*"street" + 0.041*"bin" + 0.041*"empty" + 0.041*"garbage" + 0.041*"regularly"')


In [10]:
# NMF Topic Modeling

n_topics = 2
nmf = NMF(n_components=n_topics, random_state=42)
nmf_W = nmf.fit_transform(tfidf_matrix)
nmf_H = nmf.components_

# Show NMF topics
for idx, topic in enumerate(nmf_H):
    print("NMF Topic #%d: %s" % (idx, " ".join([tfidf_feature_names[i] for i in topic.argsort()[:-6:-1]])))

NMF Topic #0: street
NMF Topic #1: street


In [11]:
# Visualizing LDA Topics

pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(lda_vis, '../results/lda_topic_visualization.html')

## Results and Discussion

Both LDA and NMF identified recurring themes in the complaints, including waste management, road repairs, noise issues, and public lighting. TF-IDF highlighted distinctive terms, while embeddings confirmed semantic groupings. Data preparation and cleaning were essential for meaningful topic discovery.