In [1]:
import json

In [2]:
documents = []
with open("dataset_news.json", "r", encoding="utf-8") as f:
    for line in f:
        documents.append(json.loads(line))

In [3]:
print(len(documents))

209527


In [4]:
print(documents[0])

{'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9', 'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters', 'category': 'U.S. NEWS', 'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.', 'authors': 'Carla K. Johnson, AP', 'date': '2022-09-23'}


In [6]:
print(documents[1]['headline'])

American Airlines Flyer Charged, Banned For Life After Punching Flight Attendant On Video


In [7]:
imp_documents = []

for i, doc in enumerate(documents):
    headline = doc.get("headline", "").strip()
    description = doc.get("short_description", "").strip()

    if not headline and not description:
        continue  # skip empty docs

    text = headline + "." + description

    imp_documents.append({
        "id": i,
        "category": doc.get("category", "UNKNOWN"),
        "text": text
    })

print("Usable documents:", len(imp_documents))

Usable documents: 209522


In [8]:
print(imp_documents[0])

{'id': 0, 'category': 'U.S. NEWS', 'text': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters.Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.'}


In [9]:
import pandas as pd
df = pd.DataFrame(imp_documents)

In [10]:
df = df[['id', 'category', 'text']]

In [11]:
df.head()

Unnamed: 0,id,category,text
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li..."
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...


In [12]:
print(df['text'][0])

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters.Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.


In [13]:
print(df.isnull().sum())

id          0
category    0
text        0
dtype: int64


In [80]:
import re

def join_acronyms(text):
    # Join only single capital letters separated by space
    # U S → US, E U → EU
    return re.sub(r'\b([A-Z])\s+([A-Z])\b', r'\1\2', text)


In [85]:
import re
def clean_text(text):
    if not isinstance(text, str):
        return ""
    
    text = join_acronyms(text) 
    text = text.lower()
    # text = normalize_abbreviations(text)
    # text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)  # remove punctuation & numbers
    text = re.sub(r'\s+', ' ', text).strip()
    
    
    return text

In [86]:
df['clean_text'] = df['text'].apply(clean_text)
df['clean_text'].head()

0    over million americans roll up sleeves for omi...
1    american airlines flyer charged banned for lif...
2    of the funniest tweets about cats and dogs thi...
3    the funniest tweets from parents this week sep...
4    woman who called cops on black bird watcher lo...
Name: clean_text, dtype: object

In [88]:
print(df['clean_text'][0])

over million americans roll up sleeves for omicron targeted covid boosters health experts said it is too early to predict whether demand would match up with the million doses of the new boosters the u s ordered for the fall


In [17]:
CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "it's": "it is",
    "i'm": "i am",
    "they're": "they are",
    "we're": "we are",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not"
}
def expand_contractions(text):
    new_text = []
    for w in text.split():
        if w in CONTRACTIONS:
            new_text.append(CONTRACTIONS[w])
        else:
            new_text.append(w)
    return " ".join(new_text)

In [75]:
df['expanded_text'] = df['clean_text'].apply(expand_contractions)

In [76]:
df['expanded_text'].head()

0    over million americans roll up sleeves for omi...
1    american airlines flyer charged banned for lif...
2    of the funniest tweets about cats and dogs thi...
3    the funniest tweets from parents this week sep...
4    woman who called cops on black bird watcher lo...
Name: expanded_text, dtype: object

In [78]:
df['expanded_text'] = df['expanded_text'].apply(normalize_abbreviations)

In [79]:
print(df['expanded_text'][0])

over million americans roll up sleeves for omicron targeted covid boosters health experts said it is too early to predict whether demand would match up with the million doses of the new boosters theus ordered for the fall


In [77]:
df.head()

Unnamed: 0,id,category,text,clean_text,expanded_text,final_text
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,over million americans roll up sleeves for omi...,over million americans roll up sleeves for omi...,million americans roll sleeves omicron targete...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",american airlines flyer charged banned for lif...,american airlines flyer charged banned for lif...,american airlines flyer charged banned life pu...
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,of the funniest tweets about cats and dogs thi...,of the funniest tweets about cats and dogs thi...,funniest tweets cats dogs week sept dog unders...
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...,the funniest tweets from parents this week sep...,the funniest tweets from parents this week sep...,funniest tweets parents week sept accidentally...
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,woman who called cops on black bird watcher lo...,woman who called cops on black bird watcher lo...,woman called cops black bird watcher loses law...


In [33]:
!pip install textblob


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 

In [34]:
from textblob import TextBlob

In [35]:
def correct_spelling(text):
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Input must be a non-empty string.")
    
    blob = TextBlob(text)
    corrected_text = str(blob.correct())
    return corrected_text

In [25]:
%pip install symspellpy


Collecting symspellpy
  Downloading symspellpy-6.9.0-py3-none-any.whl.metadata (3.9 kB)
Collecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.6-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Downloading symspellpy-6.9.0-py3-none-any.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------------------------------- 0


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from symspellpy.symspellpy import SymSpell, Verbosity

# Initialize
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load dictionary
sym_spell.load_dictionary(
    "frequency_dictionary_en_82_765.txt",
    term_index=0,
    count_index=1
)

True

In [29]:
df.head()

Unnamed: 0,id,category,text,clean_text,expanded_text
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,over million americans roll up sleeves for omi...,over million americans roll up sleeves for omi...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",american airlines flyer charged banned for lif...,american airlines flyer charged banned for lif...
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,of the funniest tweets about cats and dogs thi...,of the funniest tweets about cats and dogs thi...
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...,the funniest tweets from parents this week sep...,the funniest tweets from parents this week sep...
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,woman who called cops on black bird watcher lo...,woman who called cops on black bird watcher lo...


In [35]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\Harsh
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Harsh
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [36]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopwords = set(stopwords.words('english'))
def remove_stopwords(text):
    if not isinstance(text, str):
        return ""
    
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stopwords]
    return " ".join(filtered_tokens)


In [37]:
df['final_text'] = df['expanded_text'].apply(remove_stopwords)

In [38]:
df.head()

Unnamed: 0,id,category,text,clean_text,expanded_text,final_text
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,over million americans roll up sleeves for omi...,over million americans roll up sleeves for omi...,million americans roll sleeves omicron targete...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",american airlines flyer charged banned for lif...,american airlines flyer charged banned for lif...,american airlines flyer charged banned life pu...
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,of the funniest tweets about cats and dogs thi...,of the funniest tweets about cats and dogs thi...,funniest tweets cats dogs week sept dog unders...
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...,the funniest tweets from parents this week sep...,the funniest tweets from parents this week sep...,funniest tweets parents week sept accidentally...
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,woman who called cops on black bird watcher lo...,woman who called cops on black bird watcher lo...,woman called cops black bird watcher loses law...


In [39]:
print(df['final_text'][0])

million americans roll sleeves omicron targeted covid boosters health experts said early predict whether demand would match million doses new boosters u ordered fall


In [89]:
%pip install spacy

Collecting spacy
  Downloading spacy-3.8.11-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.15-cp311-cp311-win_amd64.whl.metadata (2.3 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.13-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.12-cp311-cp311-win_amd64.whl.metadata (2.6 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.10-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.2-cp311-cp311-win_am


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [94]:
%pip install -U spacy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [95]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     - ------------------------------------- 0.5/12.8 MB 929.6 kB/s eta 0:00:14
     - ------------------------------------- 0.5/12.8 MB 929.6 kB/s eta 0:00:14
     -- ------------------------------------ 0.8/12.8 MB 684.4 kB/s eta 0:00:18
     --- ----------------------------------- 1.0/12.8 MB 751.1 kB/s eta 0:00:16
     --- ----------------------------------- 1.0/12.8 MB 751.1 kB/s eta 0:00:16
     --- ----------------------------------- 1.3/12.8 MB 762.6 kB/s eta 0:00:16
     ---- ---------------------------------- 1.6/12.8 MB


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [102]:
ACRONYM_PATTERN = re.compile(r'\b(?:[A-Z]\.){2,}')

def normalize_acronyms(text):
    def replacer(match):
        return match.group(0).replace(".", "")
    return ACRONYM_PATTERN.sub(replacer, text)


In [96]:
import spacy
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])

This below step is the main text_preprocessing step

In [103]:
def advanced_clean(text):
    # Step 1: normalize acronyms (U.S. -> US)
    text = normalize_acronyms(text)

    doc = nlp(text)

    tokens = []
    for token in doc:
        # keep proper nouns (acronyms, orgs, countries)
        if token.pos_ == "PROPN":
            tokens.append(token.text.lower())
            continue

        if token.is_stop or token.is_punct or token.like_num:
            continue

        tokens.append(token.lemma_.lower())

    return " ".join(tokens)


In [104]:
df['advanced_cleaned_text'] = df['text'].apply(advanced_clean)

In [106]:
print(df['advanced_cleaned_text'][0])

americans roll sleeves omicron target covid booster health expert say early predict demand match dos new booster us order fall


In [107]:
df.head()

Unnamed: 0,id,category,text,clean_text,expanded_text,final_text,advanced_cleaned_text
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,over million americans roll up sleeves for omi...,over million americans roll up sleeves for omi...,million americans roll sleeves omicron targete...,americans roll sleeves omicron target covid bo...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",american airlines flyer charged banned for lif...,american airlines flyer charged banned for lif...,american airlines flyer charged banned life pu...,american airlines flyer charged ban life punch...
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,of the funniest tweets about cats and dogs thi...,of the funniest tweets about cats and dogs thi...,funniest tweets cats dogs week sept dog unders...,"funniest tweets cats dogs week sept. 23).""unti..."
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...,the funniest tweets from parents this week sep...,the funniest tweets from parents this week sep...,funniest tweets parents week sept accidentally...,"funniest tweets parent week sept. 23).""acciden..."
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,woman who called cops on black bird watcher lo...,woman who called cops on black bird watcher lo...,woman called cops black bird watcher loses law...,woman call cops black bird watcher lose lawsui...


In [108]:
print(df['text'][0])

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters.Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.


NOW, Let's move to Text Representation or Vectorization

In [111]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

1. BAG OF WORDS


In [113]:
bow = CountVectorizer()
x_bow = bow.fit_transform(df['advanced_cleaned_text'])


In [119]:
query = "new booster us order fall"
query_vec = bow.transform([query])

scores = cosine_similarity(x_bow, query_vec)
print(len(scores))

209522


2. TF-IDF APPROACH


In [120]:
tfidf = TfidfVectorizer(
    ngram_range=(1,1),
    max_df=0.9,
    min_df=2
)

x_tfidf = tfidf.fit_transform(df['advanced_cleaned_text'])

In [124]:
query = "covid booster demand us"
query_vec = tfidf.transform([query])

scores = cosine_similarity(x_tfidf, query_vec)[0]
print((scores))

[0.60166815]


3. WORD2VEC - SKIP-GRAMS

In [131]:
sentences = [doc.split() for doc in df["advanced_cleaned_text"]]

In [134]:
%pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Downloading gensim-4.4.0-cp311-cp311-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   -------


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip



   ------------------- -------------------- 11.8/24.4 MB 114.3 kB/s eta 0:01:51
   ------------------- -------------------- 11.8/24.4 MB 114.3 kB/s eta 0:01:51
   ------------------- -------------------- 11.8/24.4 MB 114.3 kB/s eta 0:01:51
   ------------------- -------------------- 11.8/24.4 MB 114.3 kB/s eta 0:01:51
   ------------------- -------------------- 11.8/24.4 MB 114.3 kB/s eta 0:01:51
   ------------------- -------------------- 12.1/24.4 MB 114.2 kB/s eta 0:01:49
   ------------------- -------------------- 12.1/24.4 MB 114.2 kB/s eta 0:01:49
   ------------------- -------------------- 12.1/24.4 MB 114.2 kB/s eta 0:01:49
   ------------------- -------------------- 12.1/24.4 MB 114.2 kB/s eta 0:01:49
   ------------------- -------------------- 12.1/24.4 MB 114.2 kB/s eta 0:01:49
   ------------------- -------------------- 12.1/24.4 MB 114.2 kB/s eta 0:01:49
   ------------------- -------------------- 12.1/24.4 MB 114.2 kB/s eta 0:01:49
   ------------------- ----------------

In [135]:
from gensim.models import Word2Vec

In [136]:
w2v = Word2Vec(
    sentences=sentences,
    vector_size = 100,
    window = 5,
    min_count = 5,
    workers = 4,
    sg = 1,
    epochs = 10
)

In [137]:
import numpy as np

def document_vector(doc):
    vectors = [
        w2v.wv[word]
        for word in doc
        if word in w2v.wv
    ]
    if len(vectors) == 0:
        return np.zeros(w2v.vector_size)
    return np.mean(vectors, axis=0)

In [138]:
doc_vectors = np.vstack(
    [document_vector(doc) for doc in sentences]
)

In [140]:
query = "covid booster demand us"
query_tokens = query.split()

query_vec = document_vector(query_tokens).reshape(1, -1)

scores_w2v = cosine_similarity(query_vec, doc_vectors)[0]
print((scores_w2v))

[0.85604412 0.63243466 0.42707749 ... 0.46951653 0.54719707 0.58668082]


In [141]:
top_k = 5
top_idx = scores_w2v.argsort()[-top_k:][::-1]

df.iloc[top_idx][['advanced_cleaned_text']]


Unnamed: 0,advanced_cleaned_text
0,americans roll sleeves omicron target covid bo...
1497,fauci say want optimally protect covid-19 boos...
2036,chief urges halt covid-19 booster shot rest ye...
935,fda authorizes second covid booster old corona...
2770,biden support waive patents covid-19 vaccine v...
