In [1]:
import json

In [3]:
# Load the dataset: the file is read line by line and every non-blank line is
# parsed as one JSON object (one document per line).
documents = []
with open("dataset_news.json", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # a blank line would make json.loads raise; skip it instead
        documents.append(json.loads(line))

In [7]:
# Sanity check: number of records parsed from the dataset file.
n_documents = len(documents)
print(n_documents)

209527


In [8]:
# Inspect the first raw record to see which fields a document carries.
first_document = documents[0]
print(first_document)

{'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9', 'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters', 'category': 'U.S. NEWS', 'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.', 'authors': 'Carla K. Johnson, AP', 'date': '2022-09-23'}


In [9]:
# Show only the headline field of the first record.
first_headline = documents[0]['headline']
print(first_headline)

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters


In [12]:
# Build a compact corpus: one record per usable document, carrying its position
# in `documents` as the id, its category, and headline + description as text.
imp_documents = []

for i, doc in enumerate(documents):
    # `or ""` also covers a field that is present but null, where .get's
    # default would not apply and .strip() would fail on None.
    headline = (doc.get("headline") or "").strip()
    description = (doc.get("short_description") or "").strip()

    if not headline and not description:
        continue  # skip empty docs

    # Terminate the headline only if it lacks end punctuation, then join the
    # non-empty parts with a space. This keeps the two sentences from fusing
    # ("Boosters.Health"), avoids a leading "." when the headline is missing,
    # and avoids doubled punctuation such as "?.".
    if headline and headline[-1] not in ".!?":
        headline += "."
    text = " ".join(part for part in (headline, description) if part)

    imp_documents.append({
        "id": i,
        "category": doc.get("category") or "UNKNOWN",
        "text": text
    })

print("Usable documents:", len(imp_documents))

Usable documents: 209522


In [13]:
# Confirm the layout of a processed record (id, category, combined text).
first_usable = imp_documents[0]
print(first_usable)

{'id': 0, 'category': 'U.S. NEWS', 'text': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters.Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.'}


In [14]:
import pandas as pd
# Turn the list of record dicts into a DataFrame (one row per usable document).
df = pd.DataFrame(data=imp_documents)

In [16]:
# Select the working columns in an explicit, fixed order.
column_order = ['id', 'category', 'text']
df = df[column_order]

In [17]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,category,text
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li..."
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...


In [18]:
# Print the full (untruncated) combined text of the first row.
print(df.loc[0, 'text'])

Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters.Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.


In [19]:
# Count missing values per column.
print(df.isna().sum())

id          0
category    0
text        0
dtype: int64


In [24]:
import re
import unicodedata

def clean_text(text):
    """Normalise text to lower-case ASCII letters separated by single spaces.

    Accented letters are folded to their base letter first (for example
    "café" becomes "cafe"); without that step the ``[^a-z]`` filter would
    replace them with a space and split the word ("caf").

    Args:
        text: The raw text. Any non-string value is treated as missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Decompose characters (NFKD) and drop the combining marks so that
    # diacritics do not survive as separate characters.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))

    text = text.lower()
    text = re.sub(r'[^a-z]', ' ', text)  # everything except a-z (digits, punctuation, symbols) becomes a space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [25]:
# Clean every document's text element-wise, then preview the new column.
df['clean_text'] = df['text'].map(clean_text)
df['clean_text'].head(5)

0    over million americans roll up sleeves for omi...
1    american airlines flyer charged banned for lif...
2    of the funniest tweets about cats and dogs thi...
3    the funniest tweets from parents this week sep...
4    woman who called cops on black bird watcher lo...
Name: clean_text, dtype: object

In [26]:
# Print the full cleaned text of the first row.
print(df.loc[0, 'clean_text'])

over million americans roll up sleeves for omicron targeted covid boosters health experts said it is too early to predict whether demand would match up with the million doses of the new boosters the u s ordered for the fall


In [27]:
import re

# Lower-case contraction -> expanded form.
CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "it's": "it is",
    "i'm": "i am",
    "they're": "they are",
    "we're": "we are",
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not"
}

# One alternation over every known contraction. Each apostrophe also accepts
# the typographic form (U+2019), and \b anchors the match to whole words so
# that punctuation attached to a token ("can't,") does not hide it.
_CONTRACTION_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(
        "['\u2019]".join(re.escape(part) for part in key.split("'"))
        for key in CONTRACTIONS
    )
    + r")\b"
)

def expand_contractions(text):
    """Replace every known contraction in ``text`` with its expanded form.

    Matching is case-sensitive against the lower-case keys of CONTRACTIONS,
    so the input should already be lower-cased. Contractions are recognised
    as whole words even when followed or preceded by punctuation, and with
    either a straight or a typographic apostrophe.

    Args:
        text: The string to process.

    Returns:
        The text with contractions expanded and runs of whitespace collapsed
        to single spaces (leading and trailing whitespace removed).
    """
    expanded = _CONTRACTION_PATTERN.sub(
        # Normalise the apostrophe so the matched text is a valid lookup key.
        lambda match: CONTRACTIONS[match.group(0).replace("\u2019", "'")],
        text,
    )
    # split()/join keeps the original function's whitespace normalisation.
    return " ".join(expanded.split())

In [29]:
# Expand contractions on the lower-cased *raw* text and clean afterwards.
# clean_text turns every apostrophe into a space ("can't" -> "can t"), so
# applying expand_contractions to df['clean_text'] could never match a key
# in CONTRACTIONS and left the column identical to clean_text.
df['expanded_text'] = (
    df['text']
    .str.lower()
    .apply(expand_contractions)
    .apply(clean_text)
)

In [30]:
# Preview the first five values of the contraction-expanded column.
df['expanded_text'].head(5)

0    over million americans roll up sleeves for omi...
1    american airlines flyer charged banned for lif...
2    of the funniest tweets about cats and dogs thi...
3    the funniest tweets from parents this week sep...
4    woman who called cops on black bird watcher lo...
Name: expanded_text, dtype: object

In [31]:
# Compare raw, cleaned, and expanded text side by side for the first five rows.
df.head(5)

Unnamed: 0,id,category,text,clean_text,expanded_text
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,over million americans roll up sleeves for omi...,over million americans roll up sleeves for omi...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",american airlines flyer charged banned for lif...,american airlines flyer charged banned for lif...
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,of the funniest tweets about cats and dogs thi...,of the funniest tweets about cats and dogs thi...
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...,the funniest tweets from parents this week sep...,the funniest tweets from parents this week sep...
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,woman who called cops on black bird watcher lo...,woman who called cops on black bird watcher lo...


In [33]:
!pip install textblob


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------------------------------- 0.0/624.3 

In [34]:
from textblob import TextBlob

In [35]:
def correct_spelling(text):
    """Return ``text`` with TextBlob's spelling correction applied.

    Args:
        text: A string containing at least one non-whitespace character.

    Returns:
        The corrected text as a plain ``str``.

    Raises:
        ValueError: If ``text`` is not a string or is empty/whitespace-only.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if not has_content:
        raise ValueError("Input must be a non-empty string.")

    return str(TextBlob(text).correct())

In [None]:
!pip install symspell
