In [9]:
import pandas as pd
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords
from textblob import TextBlob
from tqdm import tqdm

# Setup: resources shared by every later cell.
SPACY_MODEL_NAME = "en_core_web_sm"
RAW_DATASET_PATH = "indian_health_bias_combined_clean.xlsx"

# The stopword corpus has to be present locally before it can be read.
nltk.download('stopwords')
nlp = spacy.load(SPACY_MODEL_NAME)
stop_words = set(stopwords.words('english'))  # set => fast membership tests

# Load your scraped dataset
df = pd.read_excel(RAW_DATASET_PATH)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Drop duplicates and nulls
# Both calls mutate `df` in place (inplace=True), so re-running this cell
# operates on the already-filtered frame rather than on the freshly loaded one.
# Rows are compared on the raw 'text' column only; other columns are ignored
# when deciding what counts as a duplicate.
df.drop_duplicates(subset='text', inplace=True)
# Remove rows whose 'text' is missing. The row index is not reset afterwards.
df.dropna(subset=['text'], inplace=True)

# Function: Clean Text
def clean_text(text, stopword_set=None):
    """Normalise one raw document into a lowercase, stopword-free token string.

    Steps, in order: lowercase, strip HTML tags, strip square-bracketed spans,
    strip http(s) URLs, strip every character that is not a-z or whitespace,
    then drop stopwords and re-join the remaining tokens with single spaces.

    Every removed span is replaced by a space rather than by nothing, so that
    tokens separated only by markup or punctuation are not fused together
    ("high.However" -> "high however", not "highhowever"). It also lets
    contraction fragments such as "don t" be matched by the stopword filter
    when the stopword set contains those fragments.

    Args:
        text: Raw document; any value is accepted and coerced with ``str()``.
        stopword_set: Optional collection of lowercase words to discard.
            Defaults to the module-level ``stop_words``.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    text = re.sub(r'<[^>]+>', ' ', text)  # remove HTML
    text = re.sub(r'\[[^\]]*\]', ' ', text)  # remove brackets
    text = re.sub(r'https?://\S+', ' ', text)  # remove URLs
    text = re.sub(r'[^a-z\s]', ' ', text)  # remove non-letters
    # split() with no argument collapses the runs of spaces introduced above.
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Apply cleaning: run clean_text over every raw article and keep the result
# next to the original so both versions stay available downstream.
cleaned_series = df["text"].map(clean_text)
df["clean_text"] = cleaned_series

In [11]:
# Add basic NLP features
# Size features describe the cleaned text (stopwords and punctuation removed).
df["word_count"] = df["clean_text"].str.split().str.len()
df["char_count"] = df["clean_text"].str.len()

# Polarity is scored on the ORIGINAL text, not on clean_text: the cleaning step
# removes stopwords and punctuation, which includes negations and intensifiers
# ("not", "no", "very"), so "not good" would otherwise be scored like "good".
# This mirrors the entity extraction step, which also reads the raw column.
# str() guards against non-string cells coming out of the spreadsheet.
df["sentiment"] = df["text"].apply(lambda raw: TextBlob(str(raw)).sentiment.polarity)

# Named Entities (useful for bias tracking)
def extract_entities(text, pipeline=None):
    """Return the named entities found in ``text`` as "LABEL: surface" strings.

    Args:
        text: Document to analyse. Coerced with ``str()`` so that non-string
            cells (e.g. numbers read from a spreadsheet) do not make the
            pipeline raise; this matches how ``clean_text`` treats its input.
        pipeline: Optional callable that takes a string and returns an object
            exposing ``.ents``, each entity having ``.label_`` and ``.text``.
            Defaults to the module-level spaCy pipeline ``nlp``.

    Returns:
        A list of strings, one per entity, in the order given by ``doc.ents``.
        Empty when no entity is found.
    """
    model = nlp if pipeline is None else pipeline
    doc = model(str(text))
    return [f"{ent.label_}: {ent.text}" for ent in doc.ents]

# Register tqdm's progress_apply on pandas objects, then tag every raw article.
# Entities are extracted from the original text, not from clean_text.
tqdm.pandas()
entity_lists = df["text"].progress_apply(extract_entities)
df["entities"] = entity_lists

100%|██████████| 3610/3610 [14:30<00:00,  4.15it/s]


In [12]:
# Save the processed data
# The path is held once so the file written and the message printed cannot drift apart.
output_path = "preprocessed_health_news_bias.csv"
df.to_csv(output_path, index=False)  # index=False: the row index is not written
print(f"✅ Preprocessing complete. Saved to '{output_path}'")


✅ Preprocessing complete. Saved to 'preprocessed_health_news_bias.csv'
