In [2]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_articles(urls, timeout=10, min_length=200):
    """Download each URL and collect the text of its ``<p>`` elements.

    Parameters
    ----------
    urls : iterable of str
        Page addresses to fetch.
    timeout : float, default 10
        Timeout in seconds passed to ``requests.get``.
    min_length : int, default 200
        A page is kept only when its joined paragraph text is longer than
        this many characters.

    Returns
    -------
    pandas.DataFrame
        A single ``"article"`` column with one row per page that was kept.

    Notes
    -----
    Extraction is best-effort: any failure for a URL (network error, HTTP
    error status, parsing problem) is printed and that URL is skipped.
    """
    articles = []

    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
            # Treat 4xx/5xx responses as failures; otherwise the text of an
            # error page would be stored as if it were an article.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            # Extract each paragraph's text once and drop empty paragraphs.
            texts = (p.get_text() for p in soup.find_all("p"))
            content = ' '.join(text for text in texts if text)
            if len(content) > min_length:
                articles.append(content)
        except Exception as e:
            print(f"Failed to extract {url}: {e}")

    return pd.DataFrame({"article": articles})

In [4]:
import pandas as pd
import re

def clean_text(text):
    """Normalise raw article text into lower-case, ASCII-only prose.

    Steps, in order: drop non-ASCII characters, strip HTML tags and
    entities, lower-case, remove URLs, e-mail addresses and 10-digit phone
    numbers, drop every character other than letters, digits, whitespace
    and ``. , ! ? ' "``, collapse repeated punctuation, and collapse runs
    of whitespace into single spaces.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` yields ``""``.

    Returns
    -------
    str
        The cleaned text with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ""

    # Remove non-ASCII characters
    text = text.encode("ascii", errors="ignore").decode()
    # Remove HTML tags, then named / decimal / hex entities in any letter case
    # (e.g. &amp; &AMP; &#39; &#x27;) so no stray digits or letters remain.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);', ' ', text,
                  flags=re.IGNORECASE)

    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove unwanted email addresses and phone numbers
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', '', text)

    # Keep some punctuation but remove special characters. Whitespace (\s) is
    # kept here on purpose: deleting newlines/tabs would glue the neighbouring
    # words together before the whitespace collapse below can separate them.
    text = re.sub(r'[^a-z0-9.,!?\'"\s]+', '', text)
    text = re.sub(r'([!?.,]){2,}', r'\1', text)

    # Replace any run of whitespace with one space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def transform_articles(df, min_length=100):
    """Clean, de-duplicate and length-filter the ``"article"`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``"article"`` column. It is not modified.
    min_length : int, default 100
        Rows are kept only when the cleaned article is longer than this
        many characters.

    Returns
    -------
    pandas.DataFrame
        A new frame in which null articles are dropped, the remaining ones
        are passed through ``clean_text``, duplicate cleaned articles are
        removed (first occurrence kept) and short articles are filtered
        out. The surviving rows keep their original index labels.
    """
    # Build the result as one chain with .assign instead of setting a column
    # on the frame returned by dropna (which can raise SettingWithCopyWarning).
    cleaned = (
        df.dropna(subset=["article"])
        .assign(article=lambda frame: frame["article"].apply(clean_text))
        .drop_duplicates(subset=["article"])
    )

    return cleaned[cleaned["article"].str.len() > min_length]


In [5]:
def load_to_csv(df, output_path):
    """Write ``df`` to ``output_path`` as CSV, omitting the index column.

    A confirmation line naming ``output_path`` is printed after the write.
    """
    df.to_csv(path_or_buf=output_path, index=False)
    confirmation = f"Saved cleaned data to {output_path}"
    print(confirmation)


In [7]:
def run_pipeline(urls=None, output_path="cleaned_articles.csv"):
    """Run the extract, transform and load steps end to end.

    Parameters
    ----------
    urls : list of str, optional
        Pages to scrape. When omitted, the built-in list of NPR, BBC and
        CNN article URLs below is used.
    output_path : str, default "cleaned_articles.csv"
        Destination handed to ``load_to_csv``.

    Returns
    -------
    None
        The result is written to ``output_path``; nothing is returned.
    """
    if urls is None:
        urls = [
            "https://www.npr.org/2024/12/12/1226579839/winter-storms-flights-delays",
            "https://www.npr.org/2024/11/05/1223029935/election-day-2024-live-updates",
            "https://www.bbc.com/news/world-68920595",
            "https://www.bbc.com/news/business-68911184",
            "https://edition.cnn.com/2024/12/28/politics/biden-ukraine-congress/index.html",
            "https://edition.cnn.com/2025/01/03/health/new-weight-loss-drug-approval/index.html"
        ]

    raw_df = extract_articles(urls)
    cleaned_df = transform_articles(raw_df)
    load_to_csv(cleaned_df, output_path)

In [8]:
# Run the whole pipeline: fetch the articles, clean them, and write the CSV.
run_pipeline()

Saved cleaned data to cleaned_articles.csv
