
# UrbanTech Feedback Classification Lab (Standalone Notebook)

This notebook is **self-contained** and includes all code directly (no references to other notebooks).
It implements an end-to-end Naive Bayes classifier for UrbanTech user feedback:

1) Data Loading & Exploration  
2) Text Preprocessing  
3) Feature Extraction (BoW & TF-IDF)  
4) Modeling with Naive Bayes + Top Words  
5) Evaluation (reports + confusion matrices)  
6) Model Improvement (GridSearchCV)  
7) Prediction Wrapper & Artifacts

**CSV search order:** `/mnt/data/urban_feedback.csv`, `/Users/karlkurzius/Downloads/urban_feedback.csv`, `./urban_feedback.csv`.


In [None]:

#!/usr/bin/env python3
# Step 0: Imports & utilities
import os, re, json, string
from typing import List, Optional, Dict, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import (classification_report, confusion_matrix, accuracy_score,
                             precision_recall_fscore_support)
import joblib

plt.rcParams["figure.figsize"] = (7,5)

def _ensure_dir(p: str):
    """Make sure directory ``p`` exists (creating parents as needed) and return ``p``."""
    target = p
    # exist_ok=True turns an already-present directory into a no-op.
    os.makedirs(target, exist_ok=True)
    return target

def save_fig(name: str):
    """Write the current matplotlib figure to ``./figures/<name>``.

    The ``./figures`` directory is created on demand, the layout is tightened
    before saving, and the destination path is printed afterwards.
    """
    target = os.path.join(_ensure_dir("./figures"), name)
    plt.tight_layout()
    plt.savefig(target, dpi=140)
    print(f"[Saved figure] {target}")


## Step 1: Data Loading & Exploration

In [None]:

# Locate and load dataset: the first candidate path that exists is used.
csv_candidates = [
    "/mnt/data/urban_feedback.csv",
    "/Users/karlkurzius/Downloads/urban_feedback.csv",
    "./urban_feedback.csv",
]
csv_path = next(
    (candidate for candidate in csv_candidates if os.path.exists(candidate)),
    None,
)
if csv_path is None:
    raise FileNotFoundError("Dataset not found. Expected at /mnt/data/urban_feedback.csv or /Users/karlkurzius/Downloads/urban_feedback.csv or ./urban_feedback.csv")

df = pd.read_csv(csv_path)

# Auto-detect text and label columns from their (case-insensitive) names.
def _name_matches(col, exact_names, substrings) -> bool:
    """Return True if ``col`` equals one of ``exact_names`` or contains one of ``substrings``.

    The column label is converted with ``str()`` first so that non-string
    column names (e.g. integers) do not raise on ``.lower()``.
    """
    lowered = str(col).lower()
    return lowered in exact_names or any(part in lowered for part in substrings)

text_col_candidates = [
    c for c in df.columns
    if _name_matches(c, ["text", "message", "feedback", "comment", "body", "review"],
                     ["text", "feedback", "message"])
]
if not text_col_candidates:
    # Fall back to the first object-dtype column.
    obj_cols = [c for c in df.columns if df[c].dtype == "O"]
    if not obj_cols:
        raise ValueError("No obvious text column found; rename your text column to 'text' or similar.")
    text_col_candidates = obj_cols[:1]

TEXT_COL = text_col_candidates[0]

# The label column must differ from the text column: a name such as
# "feedback_label" matches both heuristics, and the "last column" fallback
# could otherwise land on the text column itself.
label_col_candidates = [
    c for c in df.columns
    if c != TEXT_COL
    and _name_matches(c, ["label", "category", "department", "target", "class"],
                      ["category", "label", "dept"])
]
if not label_col_candidates:
    # Fall back to the last column that is not the text column.
    label_col_candidates = [c for c in df.columns if c != TEXT_COL][-1:]
if not label_col_candidates:
    raise ValueError("No label column found; the dataset needs a label column separate from the text column.")

LABEL_COL = label_col_candidates[0]
print(f"Using text column {TEXT_COL!r} and label column {LABEL_COL!r}")

# Rows missing either the text or the label cannot be used.
df = df.dropna(subset=[TEXT_COL, LABEL_COL]).copy()

display(df.head(10))

# Label distribution and message lengths
label_counts = df[LABEL_COL].value_counts().sort_values(ascending=False)

# Character and whitespace-token counts of each message (coerced to str).
df["_char_len"] = df[TEXT_COL].astype(str).str.len()
df["_word_len"] = df[TEXT_COL].astype(str).str.split().apply(len)

# One-row overview of the dataset.
summary_stats = pd.DataFrame([{
    "count": len(df),
    "unique_labels": df[LABEL_COL].nunique(),
    "avg_chars": df["_char_len"].mean(),
    "median_chars": df["_char_len"].median(),
    "avg_words": df["_word_len"].mean(),
    "median_words": df["_word_len"].median(),
}])
display(summary_stats)

label_table = label_counts.rename_axis(LABEL_COL).reset_index(name="count")
display(label_table)

# Visualizations
# Bar chart of how many messages each category has.
fig_cat, ax_cat = plt.subplots()
ax_cat.bar(label_counts.index.astype(str), label_counts.values)
ax_cat.set_title("Category Distribution")
ax_cat.set_xlabel("Category")
ax_cat.set_ylabel("Count")
plt.xticks(rotation=30, ha="right")
save_fig("category_distribution.png")
plt.show()

# Histogram of message lengths measured in words.
fig_len, ax_len = plt.subplots()
ax_len.hist(df["_word_len"], bins=30)
ax_len.set_title("Message Word Length Distribution")
ax_len.set_xlabel("Words per message")
ax_len.set_ylabel("Frequency")
save_fig("message_word_length_hist.png")
plt.show()


## Step 2: Text Preprocessing

In [None]:

# Translation table mapping every ASCII punctuation code point to None (deletion).
_PUNCT_TBL = {ord(ch): None for ch in string.punctuation}

# scikit-learn's English stop words plus a few domain-specific terms.
DEFAULT_STOPWORDS = set(ENGLISH_STOP_WORDS)
DEFAULT_STOPWORDS.update([
    "app", "apps", "ut", "urbantech", "transit",
    "route", "routes", "train", "bus", "buses",
])

# A run of ASCII letters, optionally followed by an apostrophe and more letters.
TOKEN_PATTERN = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")

def naive_lemmatize(token: str) -> str:
    """Strip common English suffixes from ``token`` using fixed rules.

    Rules, applied in order:
      1. ``...ies`` (length > 4) becomes ``...y`` and the result is returned.
      2. ``...sses`` / ``...shes`` / ``...ches`` lose the final ``es`` and
         the result is returned.
      3. A trailing ``s`` is dropped when length > 3 and the word does not
         end in ``ss``.
      4. On the result of step 3: ``...ing`` (length > 5) is removed, and a
         stem then ending in ``y`` gets that ``y`` replaced by ``i``;
         otherwise ``...ed`` (length > 4) is removed.
    """
    if len(token) > 4 and token.endswith("ies"):
        return token[:-3] + "y"

    for suffix in ("sses", "shes", "ches"):
        if token.endswith(suffix):
            return token[:-2]

    is_plural = len(token) > 3 and token.endswith("s") and not token.endswith("ss")
    stem = token[:-1] if is_plural else token

    # Length checks below apply to the stem after the optional "s" removal.
    if len(stem) > 5 and stem.endswith("ing"):
        stem = stem[:-3]
        return stem[:-1] + "i" if stem.endswith("y") else stem
    if len(stem) > 4 and stem.endswith("ed"):
        return stem[:-2]
    return stem

# Punctuation handling for preprocess_text: apostrophes are deleted so that
# contractions stay a single token ("don't" -> "dont"); every other ASCII
# punctuation mark becomes a space so that neighbouring words are not fused
# into one token ("late,again" -> "late again" rather than "lateagain").
_PUNCT_SPLIT_TBL = {ord(ch): (None if ch == "'" else " ") for ch in string.punctuation}

def preprocess_text(text: str, stopwords: Optional[set]=None, lemmatize: bool=True) -> str:
    """Normalize one message into a space-separated string of cleaned tokens.

    Steps: lowercase, neutralize punctuation, extract alphabetic tokens with
    ``TOKEN_PATTERN``, drop stop words, optionally apply ``naive_lemmatize``,
    and drop tokens that are stop words after lemmatization.

    Args:
        text: Raw message. Non-string values are converted with ``str()``;
            ``None`` and float NaN yield an empty string.
        stopwords: Tokens to discard. ``None`` selects ``DEFAULT_STOPWORDS``.
        lemmatize: Whether to apply ``naive_lemmatize`` to each kept token.

    Returns:
        The cleaned tokens joined by single spaces (possibly ``""``).
    """
    # Missing values carry no text; avoid emitting a literal "nan" token.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower().translate(_PUNCT_SPLIT_TBL)
    sw = DEFAULT_STOPWORDS if stopwords is None else stopwords

    out: List[str] = []
    for tok in TOKEN_PATTERN.findall(text):
        if tok in sw:
            continue
        tok2 = naive_lemmatize(tok) if lemmatize else tok
        # Re-check after lemmatization: the stem itself may be a stop word.
        if tok2 and tok2 not in sw and tok2.isalpha():
            out.append(tok2)
    return " ".join(out)

# Preview preprocessing: first 8 messages next to their cleaned form.
_preview_texts = df[TEXT_COL].astype(str).head(8).tolist()
preview = pd.DataFrame({
    "original": _preview_texts,
    "preprocessed": list(map(preprocess_text, _preview_texts)),
})
display(preview)


## Step 3: Feature Extraction (BoW & TF-IDF)

In [None]:

RANDOM_STATE = 42
TEST_SIZE = 0.2

# Features: cleaned message text. Targets: labels coerced to strings.
X = df[TEXT_COL].astype(str).apply(preprocess_text)
y = df[LABEL_COL].astype(str)

# Stratified split so both partitions follow the label distribution of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

def nonempty(series: pd.Series) -> pd.Series:
    """Replace blank or non-string entries of ``series`` with ``"placeholdertoken"``.

    An entry is kept only if it is a ``str`` containing at least one
    non-whitespace character; everything else becomes the placeholder.
    """
    def _fill(value):
        if isinstance(value, str) and value.strip():
            return value
        return "placeholdertoken"

    return series.apply(_fill)

# Guarantee that every document contains at least one token.
X_train_ne = nonempty(X_train)
X_test_ne = nonempty(X_test)

# Both vectorizers share the same settings; the text is already lowercased
# by preprocessing, hence lowercase=False.
_vectorizer_settings = dict(
    min_df=1,
    max_df=1.0,
    ngram_range=(1, 1),
    lowercase=False,
    token_pattern=r"(?u)\b\w+\b",
)
bow_vectorizer = CountVectorizer(**_vectorizer_settings)
tfidf_vectorizer = TfidfVectorizer(**_vectorizer_settings)

# Fit on the training split only; the test split is transformed with the
# vocabulary learned from training.
X_train_bow = bow_vectorizer.fit_transform(X_train_ne)
X_test_bow = bow_vectorizer.transform(X_test_ne)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_ne)
X_test_tfidf = tfidf_vectorizer.transform(X_test_ne)


## Step 4: Model Building with Naive Bayes + Top Words

In [None]:

# Baseline classifiers (alpha=1.0), one per feature representation.
nb_bow = MultinomialNB(alpha=1.0)
nb_bow.fit(X_train_bow, y_train)

nb_tfidf = MultinomialNB(alpha=1.0)
nb_tfidf.fit(X_train_tfidf, y_train)

def top_words_per_class(vectorizer, clf: "MultinomialNB", k=12) -> pd.DataFrame:
    """List the ``k`` features with the highest log-probability for each class.

    Args:
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        clf: Fitted classifier exposing ``classes_`` and ``feature_log_prob_``
            (one row of per-feature log-probabilities per class).
        k: Number of words to return per class. Values <= 0 yield no rows;
            values larger than the vocabulary return every feature.

    Returns:
        DataFrame with columns ``class``, ``word`` and ``log_prob``; rows are
        grouped by class in ``clf.classes_`` order and sorted by descending
        log-probability within each class.
    """
    feats = np.asarray(vectorizer.get_feature_names_out())
    k = max(int(k), 0)
    rows = []
    for cls, lp in zip(clf.classes_, clf.feature_log_prob_):
        # Reverse first, then truncate: `[-k:]` would return *all* features
        # for k == 0 because `-0` is the start of the array.
        idx = np.argsort(lp)[::-1][:k]
        rows.extend(
            {"class": cls, "word": w, "log_prob": float(s)}
            for w, s in zip(feats[idx], lp[idx])
        )
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(rows, columns=["class", "word", "log_prob"])

# Top 12 words per class for the BoW model, then for the TF-IDF model.
for _vec, _model in ((bow_vectorizer, nb_bow), (tfidf_vectorizer, nb_tfidf)):
    display(top_words_per_class(_vec, _model, k=12).groupby("class").head(12))


## Step 5: Evaluation (reports + confusion matrices)

In [None]:

def evaluate(clf, Xte, Yte, title: str, fig_name: str):
    """Print test metrics for ``clf`` and plot and save its confusion matrix.

    Args:
        clf: Fitted estimator with a ``predict`` method.
        Xte: Test features accepted by ``clf.predict``.
        Yte: True labels for ``Xte`` (Series, array or list).
        title: Heading used in the printed output and the figure title.
        fig_name: File name passed to ``save_fig``.
    """
    preds = clf.predict(Xte)
    acc = accuracy_score(Yte, preds)
    print(f"\n== {title} ==\nAccuracy: {acc:.3f}\n")
    print(classification_report(Yte, preds, digits=3))

    # Use every label that appears in the truth OR the predictions. Taking
    # only the true labels would silently drop predictions of a class that
    # happens to be absent from the test split.
    labels = np.union1d(np.asarray(Yte), np.asarray(preds)).tolist()
    cm = confusion_matrix(Yte, preds, labels=labels)

    plt.figure()
    plt.imshow(cm, interpolation="nearest")
    plt.title(f"Confusion Matrix - {title}")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks(ticks=range(len(labels)), labels=labels, rotation=30, ha="right")
    plt.yticks(ticks=range(len(labels)), labels=labels)
    # Annotate each cell with its count (row i = true label, column j = predicted).
    for (i, j), v in np.ndenumerate(cm):
        plt.text(j, i, str(v), ha="center", va="center")
    save_fig(fig_name); plt.show()

# Evaluate both baselines on the held-out test split.
_baseline_runs = [
    (nb_bow, X_test_bow, "NB + BoW (baseline)", "cm_nb_bow_baseline.png"),
    (nb_tfidf, X_test_tfidf, "NB + TF-IDF (baseline)", "cm_nb_tfidf_baseline.png"),
]
for _model, _features, _title, _fig_name in _baseline_runs:
    evaluate(_model, _features, y_test, _title, _fig_name)


## Step 6: Model Improvement (GridSearchCV)

In [None]:

def build_pipeline(kind="bow"):
    """Build a two-step pipeline: vectorizer (step ``"vec"``) then MultinomialNB (step ``"nb"``).

    ``kind == "bow"`` selects ``CountVectorizer``; any other value selects
    ``TfidfVectorizer``. Both use ``lowercase=False`` and the same token pattern.
    """
    vectorizer_cls = CountVectorizer if kind == "bow" else TfidfVectorizer
    steps = [
        ("vec", vectorizer_cls(lowercase=False, token_pattern=r"(?u)\b\w+\b")),
        ("nb", MultinomialNB()),
    ]
    return Pipeline(steps)

# Hyper-parameter grid shared by both pipelines ("vec__" / "nb__" prefixes
# address the pipeline steps of the same name).
param_grid = dict(
    vec__min_df=[1, 2],
    vec__max_df=[0.9, 1.0],
    vec__ngram_range=[(1, 1), (1, 2)],
    nb__alpha=[0.5, 1.0, 1.5],
)

pip_bow = build_pipeline("bow")
pip_tfidf = build_pipeline("tfidf")

# 3-fold cross-validated search scored by macro-averaged F1, using all cores.
_search_settings = dict(cv=3, n_jobs=-1, scoring="f1_macro")
gscv_bow = GridSearchCV(pip_bow, param_grid, **_search_settings)
gscv_tfidf = GridSearchCV(pip_tfidf, param_grid, **_search_settings)

gscv_bow.fit(X_train_ne, y_train)
gscv_tfidf.fit(X_train_ne, y_train)

best_bow = gscv_bow.best_estimator_
best_tfidf = gscv_tfidf.best_estimator_

print("Best BoW params:", gscv_bow.best_params_)
print("Best TF-IDF params:", gscv_tfidf.best_params_)

# Test-set reports for the tuned pipelines. classification_report is already
# imported with the other sklearn.metrics helpers in the imports cell, so it
# is not re-imported here.
best_bow_preds = best_bow.predict(X_test_ne)
best_tfidf_preds = best_tfidf.predict(X_test_ne)

rep_bow_text = classification_report(y_test, best_bow_preds, digits=3)
rep_tfidf_text = classification_report(y_test, best_tfidf_preds, digits=3)

print("\nBest BoW Classification Report:\n", rep_bow_text)
print("\nBest TF-IDF Classification Report:\n", rep_tfidf_text)

# Choose best by macro-F1 parsed from the text table
def macro_f1(report_text: str) -> float:
    """Extract the macro-averaged F1 score from a text classification report.

    The first line containing ``"macro avg"`` is split on whitespace and its
    second-to-last field (the f1-score column, just before support) is parsed.

    Args:
        report_text: Text produced by ``classification_report``.

    Returns:
        The parsed macro F1, or ``0.0`` when no ``"macro avg"`` line exists or
        the field is not a number.
    """
    for line in report_text.splitlines():
        if "macro avg" not in line:
            continue
        fields = line.split()
        try:
            return float(fields[-2])
        except (IndexError, ValueError):
            # Only parsing problems are treated as "no score"; anything else
            # (e.g. KeyboardInterrupt) must propagate.
            return 0.0
    return 0.0

# Macro-F1 of each tuned pipeline, read back from its text report.
f1_bow, f1_tfidf = (macro_f1(report) for report in (rep_bow_text, rep_tfidf_text))

# Ties go to TF-IDF.
# NOTE(review): both reports were computed on the held-out test split, so the
# winner's test metrics are slightly optimistic; consider comparing
# gscv_bow.best_score_ and gscv_tfidf.best_score_ (cross-validated) instead.
if f1_tfidf >= f1_bow:
    chosen, best_pipeline = "TF-IDF", best_tfidf
else:
    chosen, best_pipeline = "BoW", best_bow
print(f"\nChosen best model: {chosen}")


## Step 7: Prediction Wrapper & Artifacts

In [None]:

class UrbanTechFeedbackClassifier:
    """Convenience wrapper: raw feedback text in, label and probabilities out.

    Applies ``preprocess_text`` to each message and feeds the result to a
    fitted scikit-learn pipeline whose final step supports ``predict_proba``.
    """

    # Stand-in document for messages that are empty after preprocessing; the
    # same token replaces empty documents in the training data.
    PLACEHOLDER = "placeholdertoken"

    def __init__(self, pipeline: Pipeline):
        """Store the fitted pipeline used for inference."""
        self.pipeline = pipeline

    def preprocess(self, texts: List[str]) -> List[str]:
        """Return ``preprocess_text(t)`` for every message in ``texts``."""
        return [preprocess_text(t) for t in texts]

    def predict(self, texts: List[str]):
        """Classify one or more raw messages.

        Args:
            texts: Iterable of messages. A single ``str`` is treated as one
                message rather than being iterated character by character.

        Returns:
            One dict per message with keys ``text``, ``predicted_label``,
            ``confidence`` (highest class probability) and ``probabilities``
            (class -> probability). Empty input returns ``[]``.
        """
        if isinstance(texts, str):
            texts = [texts]
        texts = list(texts)
        if not texts:
            # The pipeline rejects zero-sample input; nothing to predict anyway.
            return []

        clean = self.preprocess(texts)
        clean_ne = [c if c.strip() else self.PLACEHOLDER for c in clean]

        # One inference pass: the label is the class with the highest
        # probability, so a separate predict() call is not needed.
        probs = self.pipeline.predict_proba(clean_ne)
        classes = self.pipeline.classes_

        out = []
        for t, p in zip(texts, probs):
            best = int(np.argmax(p))
            out.append({
                "text": t,
                "predicted_label": classes[best],
                "confidence": float(p[best]),
                "probabilities": {cls: float(prob) for cls, prob in zip(classes, p)}
            })
        return out

wrapper = UrbanTechFeedbackClassifier(best_pipeline)

# A few hand-written messages to sanity-check the wrapper end to end.
examples = [
    "The map sent me to the wrong platform and the transfer time was off.",
    "The app keeps freezing when I try to buy a ticket.",
    "The train delay information was outdated by 20 minutes."
]
example_predictions = wrapper.predict(examples)
pd.DataFrame(example_predictions)


In [None]:

# Save artifacts
# Write to /mnt/data when it exists (same location as before); otherwise fall
# back to ./artifacts so the cell also works when the CSV was found elsewhere.
ARTIFACT_DIR = "/mnt/data" if os.path.isdir("/mnt/data") else _ensure_dir("./artifacts")
pipeline_path = os.path.join(ARTIFACT_DIR, "urbantech_best_pipeline.joblib")
metadata_path = os.path.join(ARTIFACT_DIR, "urbantech_metadata.json")

joblib.dump(best_pipeline, pipeline_path)
with open(metadata_path, "w", encoding="utf-8") as f:
    # get_params() contains estimator objects and dtype classes that json
    # cannot encode; default=str stores their string form instead of raising
    # TypeError.
    json.dump({
        "best_model_type": chosen,
        "best_params": best_pipeline.get_params()
    }, f, indent=2, default=str)

print(f"Saved: {pipeline_path}")
print(f"Saved: {metadata_path}")
