In [1]:
import json
import re
import os


Load Raw News Data

In [2]:
with open("../../data/raw/news.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

articles = raw_data["articles"]
len(articles)


19

Define Text Cleaning Function

In [3]:
def clean_text(text):
    if text is None:
        return ""

    text = text.lower()                       # lowercase
    text = re.sub(r"http\S+", "", text)       # remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # remove symbols
    text = re.sub(r"\s+", " ", text)          # remove extra spaces

    return text.strip()


Clean All Articles

In [4]:
cleaned_articles = []

for article in articles:
    cleaned_articles.append({
        "title": clean_text(article.get("title")),
        "description": clean_text(article.get("description")),
        "content": clean_text(article.get("content")),
        "publishedAt": article.get("publishedAt"),
        "source": article.get("source", {}).get("name")
    })

len(cleaned_articles)


19

Save Cleaned Data

In [5]:
os.makedirs("../../data/processed", exist_ok=True)

with open("../../data/processed/cleaned_news.json", "w", encoding="utf-8") as f:
    json.dump(cleaned_articles, f, indent=2)

print("✅ Cleaned news data saved to data/processed/cleaned_news.json")


✅ Cleaned news data saved to data/processed/cleaned_news.json


Verify Cleaning

In [6]:
cleaned_articles[0]


{'title': 'us futures sink after trump warns of higher tariffs for 8 countries over greenland issue',
 'description': 'us stock futures skidded monday after us president donald trump threatened to slap a 10 extra tariff on imports from eight european countries due to',
 'content': 'bangkok ap us stock futures skidded monday after us president donald trump threatened to slap a 10 extra tariff on imports from eight european countries due to their opposition to his desire t 3982 chars',
 'publishedAt': '2026-01-19T06:42:19Z',
 'source': 'Yahoo Entertainment'}