# Email Classification — Notebook

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import re, string

# Read the support-email export into a DataFrame and preview the first five
# rows (five is also what a bare .head() shows).
df = pd.read_csv(filepath_or_buffer="../data/support_emails.csv")
df.head(n=5)


In [None]:
def clean_text(text: str) -> str:
    """Normalise raw email text for bag-of-words style vectorisation.

    Steps, in order: lower-case, drop anything that starts with ``http`` up
    to the next whitespace, delete digit runs, turn every ASCII punctuation
    character into a space, then collapse whitespace runs to a single space
    and trim both ends.

    Args:
        text: Raw text. Any non-``str`` value (``None``, ``float('nan')``, ...)
            is treated as missing.

    Returns:
        The cleaned text, or ``""`` when *text* is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"\d+", "", text)

    # Punctuation becomes a space rather than being deleted: deleting it
    # fuses words that are separated only by punctuation
    # ("charged,twice" -> "chargedtwice", "login/account" -> "loginaccount").
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    text = text.translate(punctuation_to_space)

    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator, so this also replaces .strip().
    return " ".join(text.split())

# Model input text: subject and body cast to str, joined by a single space,
# then passed row by row through clean_text.
subject_str = df["subject"].astype(str)
body_str = df["body"].astype(str)
df["text"] = (subject_str + " " + body_str).apply(clean_text)

# "api" / "apis" not followed by another letter. A plain substring test also
# fires on unrelated words such as "rapid", "capital" or "therapist"; the
# lookahead rejects those while still accepting "api", "apis", "api_key",
# "api-v2" and compounds that end in "api" (e.g. "openapi").
_API_KEYWORD = re.compile(r"apis?(?![a-z])")


def label_email(subject):
    """Assign a coarse support category from keywords in the subject line.

    Matching is case-insensitive. Categories are checked in a fixed order and
    the first one that matches wins: Billing, then Login/Account, then
    API/Integration; anything else is "General".

    Args:
        subject: The email subject. It is converted with ``str()`` first, so
            non-string values (``None``, NaN, ...) are accepted and end up
            in "General" unless their string form contains a keyword.

    Returns:
        One of "Billing", "Login/Account", "API/Integration", "General".
    """
    subject = str(subject).lower()

    billing_terms = ("billing", "payment", "charged")
    account_terms = ("login", "account", "access", "password")

    if any(term in subject for term in billing_terms):
        return "Billing"
    if any(term in subject for term in account_terms):
        return "Login/Account"
    if _API_KEYWORD.search(subject) or "integration" in subject:
        return "API/Integration"
    return "General"

# Derive a category for every row from its subject line via label_email.
# NOTE(review): df["text"] also contains the subject, so a model trained on
# it can pick up these keyword rules directly. Treat downstream scores as
# optimistic until the labels come from an independent source.
df["label"] = df["subject"].map(label_email)
df.head(n=5)


In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label column and uses a fixed seed so reruns give the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# TF-IDF over unigrams and bigrams, capped at 2000 features. The vocabulary
# is fitted on the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=2000).fit(X_train_vec, y_train)

# Per-class report on the held-out split.
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))


In [None]:
# Confusion matrix rendered using matplotlib only.
# A fixed, sorted label order keeps matrix rows/columns and tick labels aligned.
labels = sorted(df["label"].unique())
cm = confusion_matrix(y_test, y_pred, labels=labels)
tick_marks = np.arange(len(labels))

fig = plt.figure()
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion Matrix")
plt.colorbar()
plt.xticks(tick_marks, labels, rotation=45, ha="right")
plt.yticks(tick_marks, labels)
plt.xlabel("Predicted")
plt.ylabel("Actual")

# Write each count in the centre of its cell. Matrix rows map to the y axis
# ("Actual") and columns to the x axis ("Predicted"), hence text(col, row).
for (row, col), count in np.ndenumerate(cm):
    plt.text(col, row, f"{count:d}", ha="center", va="center")

plt.tight_layout()
plt.show()


In [None]:
# Run the fitted vectorizer and model on one hand-written email.
sample_subject = "Urgent: Payment not going through"
sample_body = "Hello, my credit card was charged twice."

# Same layout as the training text: subject, one space, body, then clean_text.
sample_text = clean_text(f"{sample_subject} {sample_body}")
sample_vec = vectorizer.transform([sample_text])
pred = model.predict(sample_vec)[0]
print(f"Predicted Category: {pred}")
