In [1]:
!git clone https://github.com/lekshmi-j/topic-identification-nlp.git


fatal: destination path 'topic-identification-nlp' already exists and is not an empty directory.


In [2]:
# Switch the kernel's working directory into the cloned repository so that the
# relative paths used below (requirements.txt, data/raw/...) resolve against it.
%cd topic-identification-nlp



/content/topic-identification-nlp


In [3]:
# Sanity check: print the current working directory (should be the repo root).
!pwd


/content/topic-identification-nlp


In [4]:
!pip install -r requirements.txt


Collecting gensim (from -r requirements.txt (line 5))
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [5]:
import pandas as pd

# Load the raw 20 Newsgroups CSV (path is relative to the repository root)
# and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data/raw/20newsgroups.csv")
df.head(n=5)


Unnamed: 0,text,topic
0,Newsgroup: rec.autos\ndocument_id: 101551\nFro...,rec.autos.txt
1,"In article <C4vIr5.L3r@shuksan.ds.boeing.com>,...",rec.autos.txt
2,"Say, you bought your Saturn at $13k, with a de...",rec.autos.txt
3,"Moreover, if Saturn really does reduce the dea...",rec.autos.txt
4,1) Attract even more people to buy Saturns bec...,rec.autos.txt


In [6]:
# Report the frame's dimensions and its column labels, one per line.
print(df.shape, df.columns, sep="\n")


(238817, 2)
Index(['text', 'topic'], dtype='object')


In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources requested by this notebook: the punkt_tab tokenizer
# data, the stop-word lists, WordNet and the Open Multilingual WordNet.
download_ok = False
for resource in ("punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    download_ok = nltk.download(resource)

# Display the status returned by the final download, as the cell did before.
download_ok


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Shared preprocessing resources, built once and reused for every document.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership tests when filtering tokens.
stop_words = {word for word in stopwords.words("english")}

def basic_preprocess(text):
    """Turn one raw document into a list of cleaned, lemmatized tokens.

    Pipeline: lowercase the text, replace every character that is not a
    lowercase ASCII letter or whitespace with a space, tokenize, drop English
    stop words and tokens of two characters or fewer, then lemmatize.

    Args:
        text: The raw document. Non-string values (for example ``None`` or the
            float NaN that pandas uses for an empty CSV cell) are treated as
            empty documents instead of raising ``AttributeError``.

    Returns:
        list[str]: The surviving tokens in document order; an empty list when
        the input is not a string or contains no letters.
    """
    # Missing values reach this function as float NaN / None via Series.apply.
    if not isinstance(text, str):
        return []

    text = re.sub(r"[^a-z\s]", " ", text.lower())

    # Nothing but whitespace left (e.g. a document of digits/punctuation):
    # skip the tokenizer entirely, the result would be empty anyway.
    if not text.strip():
        return []

    tokens = word_tokenize(text)
    # Stop words are removed *before* lemmatization, matching the original order.
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return [lemmatizer.lemmatize(t) for t in tokens]


In [9]:
# Tokenize every document; the resulting lists are stored positionally in a
# new "tokens" column next to the raw text.
df["tokens"] = [basic_preprocess(raw_doc) for raw_doc in df["text"]]
df.loc[:, ["text", "tokens"]].head()


Unnamed: 0,text,tokens
0,Newsgroup: rec.autos\ndocument_id: 101551\nFro...,"[newsgroup, rec, auto, document, brown, edu, h..."
1,"In article <C4vIr5.L3r@shuksan.ds.boeing.com>,...","[article, vir, shuksan, boeing, com, fredd, sh..."
2,"Say, you bought your Saturn at $13k, with a de...","[say, bought, saturn, dealer, profit, dealer, ..."
3,"Moreover, if Saturn really does reduce the dea...","[moreover, saturn, really, reduce, dealer, pro..."
4,1) Attract even more people to buy Saturns bec...,"[attract, even, people, buy, saturn, would, sa..."


In [10]:
!pip install gensim




In [11]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Materialise the tokenized documents once; they are used both to train the
# phrase model and as the input that gets rewritten below.
sentences = list(df["tokens"])

# Learn bigram collocations from the corpus using gensim's Phrases model,
# with its min_count and threshold parameters set to 10 and 15.
bigram_model = Phrases(sentences, threshold=15, min_count=10)

# Wrap the trained model in gensim's Phraser for applying it to documents.
bigram_phraser = Phraser(bigram_model)

# Replace each document's tokens with the phrase-merged version
# (detected bigrams become single tokens such as "dealer_profit").
df["tokens"] = [bigram_phraser[doc] for doc in sentences]


In [12]:
# Preview the first five documents after bigram merging.
df["tokens"].head(5)


Unnamed: 0,tokens
0,"[newsgroup_rec, auto_document, brown, edu, hok..."
1,"[article, vir, shuksan, boeing_com, fredd, shu..."
2,"[say, bought, saturn, dealer_profit, dealer_pr..."
3,"[moreover, saturn, really, reduce, dealer_prof..."
4,"[attract, even, people, buy, saturn, would, sa..."


In [13]:
from collections import Counter

# Flatten every document's tokens into one list ...
all_tokens = []
for doc in df["tokens"]:
    all_tokens.extend(doc)

# ... and count how often each token occurs across the whole corpus.
freq = Counter(all_tokens)


In [14]:
# Vocabulary-pruning thresholds.
MIN_FREQ = 20        # remove very rare words: minimum corpus-wide count a token needs to be kept
MAX_DOC_RATIO = 0.5 # remove overly common words: intended cap on the share of documents a token may appear in


In [15]:
num_docs = len(df)

# Document frequency: in how many documents each token appears. Each document
# contributes at most one count per token (hence the set()).
doc_freq = Counter(token for doc in df["tokens"] for token in set(doc))

# Largest number of documents a token may appear in before it is considered
# too common to be informative.
max_doc_count = MAX_DOC_RATIO * num_docs

def filter_tokens(tokens):
    """Drop tokens that are too rare or too widespread to be useful.

    A token is kept only if its corpus-wide count in ``freq`` is at least
    ``MIN_FREQ`` *and* it appears in no more than ``MAX_DOC_RATIO`` of all
    documents. (Previously ``MAX_DOC_RATIO`` and ``num_docs`` were defined
    but never applied, so overly common tokens were not removed.)

    Args:
        tokens: One document as a list of string tokens.

    Returns:
        list[str]: The tokens that pass both thresholds, in original order.
    """
    return [
        t for t in tokens
        if freq[t] >= MIN_FREQ and doc_freq[t] <= max_doc_count
    ]

df["tokens"] = df["tokens"].apply(filter_tokens)


In [16]:
MAX_VOCAB_SIZE = 8000

# The MAX_VOCAB_SIZE tokens with the highest corpus-wide counts form the
# allowed vocabulary; a set keeps the membership test below cheap.
most_common = {term for term, _count in freq.most_common(MAX_VOCAB_SIZE)}

# Strip every out-of-vocabulary token from each document.
df["tokens"] = [
    [tok for tok in doc if tok in most_common]
    for doc in df["tokens"]
]


In [17]:
# Re-assemble each token list into a single space-separated string.
df["clean_text"] = df["tokens"].apply(" ".join)


In [18]:
# Persist the cleaned text together with its topic label (no index column).
# NOTE(review): this is an absolute path outside the repository checkout
# (/content/topic-identification-nlp) — confirm downstream steps expect the
# file here rather than under data/processed/.
df.loc[:, ["clean_text", "topic"]].to_csv("/content/20newsgroups_processed.csv", index=False)


How Preprocessing Affects Topic Quality:
- Aggressive cleaning removes noise but may remove topic signals
- N-grams improve semantic coherence
- Frequency filtering stabilizes topic-word distributions

Trade-off Between Noise and Over-cleaning:
- Under-cleaning leads to noisy topics
- Over-cleaning removes meaningful phrases
- Optimal preprocessing is task-dependent


In [20]:
git add notebooks/02_preprocessing.ipynb src/preprocess.py data/processed/

SyntaxError: invalid decimal literal (ipython-input-3952755116.py, line 1)