In [None]:
!git clone https://github.com/lekshmi-j/topic-identification-nlp.git


fatal: destination path 'topic-identification-nlp' already exists and is not an empty directory.


In [None]:
# Make the cloned repository the working directory so that the relative paths
# used below (requirements.txt, data/..., src/...) resolve.
%cd topic-identification-nlp



/content/topic-identification-nlp


In [None]:
!pip install -r requirements.txt




In [None]:
import pandas as pd

# Load the preprocessed 20 Newsgroups table and pull out the model input
# (cleaned text) and the target (topic label).
df = pd.read_csv("data/processed/20newsgroups_processed.csv")

X, y = df["clean_text"], df["topic"]

# Quick look at the number of documents and the number of distinct labels.
print(X.shape)
print("Number of classes:", y.nunique())


(238817,)
Number of classes: 21


In [None]:
# Documents per topic, most frequent first: a quick class-balance check.
y.value_counts()


Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
talk.politics.mideast.txt,17578
alt.atheism.txt,17564
comp.graphics.txt,16854
soc.religion.christian.txt,14556
sci.crypt.txt,14386
sci.space.txt,13102
talk.politics.guns.txt,13062
comp.windows.x.txt,12746
sci.med.txt,11688
rec.sport.hockey.txt,11650


In [None]:
min_samples = 2  # minimum documents per class required for a stratified split

# Ignore missing / whitespace-only documents *before* counting class sizes.
# Those rows cannot be vectorized and are removed before the final split, so
# counting them here could keep a class that ends up with fewer than
# `min_samples` usable documents and then breaks stratification.
has_text = df["clean_text"].fillna("").str.strip() != ""
df_nonempty = df[has_text]

# Count each class once and keep only the classes that are large enough.
class_counts = df_nonempty["topic"].value_counts()
valid_classes = class_counts[class_counts >= min_samples].index

df_filtered = df_nonempty[df_nonempty["topic"].isin(valid_classes)]

# Features and labels come from the same filtered frame so they stay aligned.
X = df_filtered["clean_text"]
y = df_filtered["topic"]


In [None]:
from sklearn.model_selection import train_test_split

# Preliminary stratified 80/20 split with a fixed seed. The same split is
# recomputed further down, after X and y are rebuilt from the cleaned frame.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Count the documents whose cleaned text is missing (NaN).
X.isna().sum()


np.int64(2195)

In [None]:
# Discard rows whose cleaned text is missing (NaN).
df_filtered = df_filtered.dropna(subset=["clean_text"])


In [None]:
# Discard rows whose cleaned text is empty or whitespace-only.
df_filtered = df_filtered[df_filtered["clean_text"].str.strip() != ""]


In [None]:
# Rebuild features and labels from the cleaned frame so both stay aligned.
X, y = df_filtered["clean_text"], df_filtered["topic"]


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the cleaned data; random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features. Terms that occur in more than half of the
# training documents (max_df) or in fewer than 10 of them (min_df) are dropped.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_df=0.5)

# Learn the vocabulary and IDF weights on the training split only, then reuse
# them for the test split so no test information leaks into the features.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

X_train_vec.shape


(189296, 45224)

In [None]:
# Re-fits the vectorizer on the training split and re-transforms the test
# split; this repeats the computation already done when tfidf was created.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)


Data Cleaning Note:
Some documents became empty after preprocessing and were removed
before TF-IDF vectorization to avoid invalid inputs.
TF-IDF cannot handle missing text: the vectorizer rejects NaN entries.

Preprocessing can destroy documents: cleaning can strip a document down to nothing.

Data validation is part of ML, not an afterthought.

In [None]:
# Sanity check: neither split should contain missing documents at this point.
print("Any NaNs left?", X_train.isna().any(), X_test.isna().any())


Any NaNs left? False False


LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features. max_iter is raised to 1000 to
# give the solver more iterations to converge; n_jobs=-1 asks for all CPU cores.
lr = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train_vec, y_train)

# Predicted topic for every held-out document.
y_pred_lr = lr.predict(X_test_vec)


NAIVE BAYES

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes baseline trained on the same TF-IDF features.
nb = MultinomialNB().fit(X_train_vec, y_train)

# Predicted topic for every held-out document.
y_pred_nb = nb.predict(X_test_vec)


EVALUATION


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Report overall accuracy followed by the per-class precision/recall/F1 table
# for each model, logistic regression first.
for heading, model_preds in (
    ("Logistic Regression Accuracy:", y_pred_lr),
    ("Naive Bayes Accuracy:", y_pred_nb),
):
    print(heading, accuracy_score(y_test, model_preds))
    print(classification_report(y_test, model_preds))


CONFUSION MATRIX

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Row/column order of the matrix follows the order in which topics first
# appear in y.
labels = y.unique()
cm = confusion_matrix(y_test, y_pred_lr, labels=labels)

# Draw the counts as a heatmap: rows are true topics, columns are predictions.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, xticklabels=labels, yticklabels=labels, cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix — Logistic Regression")
plt.show()


Difference Between Topic Modeling and Topic Classification:

Topic Modeling:
- Unsupervised
- Discovers latent topics
- No labeled data
- Used for exploration

Topic Classification:
- Supervised
- Predicts predefined labels
- Requires labeled data
- Used for automation

When to Use Each:
- Topic modeling → the topic structure is unknown and has to be discovered
- Topic classification → the categories are known in advance and labeled examples exist


In [None]:
from src.classify import (
    build_tfidf,
    train_logistic_regression,
    train_naive_bayes,
    predict_topic
)

from src.evaluate import evaluate_classifier


In [None]:
# Same experiment as above, now driven by the reusable helpers imported from src.
tfidf = build_tfidf()
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Train both classifiers on the identical feature matrix.
lr = train_logistic_regression(X_train_vec, y_train)
nb = train_naive_bayes(X_train_vec, y_train)

y_pred_lr = lr.predict(X_test_vec)
y_pred_nb = nb.predict(X_test_vec)

# evaluate_classifier's result is unpacked as (accuracy, report) per model.
acc_lr, report_lr = evaluate_classifier(y_test, y_pred_lr)
acc_nb, report_nb = evaluate_classifier(y_test, y_pred_nb)

# Print accuracy and report for each model, logistic regression first.
for heading, acc, report in (
    ("Logistic Regression Accuracy:", acc_lr, report_lr),
    ("Naive Bayes Accuracy:", acc_nb, report_nb),
):
    print(heading, acc)
    print(report)


Logistic Regression Accuracy: 0.8095087163232964
                              precision    recall  f1-score   support

             alt.atheism.txt       0.76      0.87      0.81      3469
           comp.graphics.txt       0.74      0.85      0.79      3286
 comp.os.ms-windows.misc.txt       0.82      0.72      0.77      1859
comp.sys.ibm.pc.hardware.txt       0.79      0.71      0.75      1917
   comp.sys.mac.hardware.txt       0.84      0.73      0.78      1615
          comp.windows.x.txt       0.78      0.82      0.80      2531
            misc.forsale.txt       0.78      0.79      0.78      1763
               rec.autos.txt       0.86      0.81      0.83      1876
         rec.motorcycles.txt       0.85      0.81      0.83      1935
      rec.sport.baseball.txt       0.90      0.86      0.88      2077
        rec.sport.hockey.txt       0.88      0.90      0.89      2280
               sci.crypt.txt       0.84      0.84      0.84      2858
         sci.electronics.txt       0.82 

INFERENCE

In [None]:
from src.classify import predict_topic

# A few unseen sentences to spot-check the trained pipeline.
sample_texts = [
    "NASA successfully launched a new satellite into space",
    "The car engine performance has improved significantly",
    "The government announced new gun control policies"
]

# predict_topic receives the raw texts plus the fitted vectorizer and model;
# its result is iterated in step with the inputs below.
predictions = predict_topic(sample_texts, tfidf, lr)

for sample, topic in zip(sample_texts, predictions):
    print(f"Text: {sample}")
    print(f"Predicted topic: {topic}\n")


Text: NASA successfully launched a new satellite into space
Predicted topic: sci.space.txt

Text: The car engine performance has improved significantly
Predicted topic: rec.autos.txt

Text: The government announced new gun control policies
Predicted topic: talk.politics.guns.txt



Concepts Learned:
- Separation of experimentation and production code
- Writing reusable ML pipelines
- Importance of clean interfaces and docstrings
- Maintainable ML systems scale better than notebooks
