In [4]:
import json
import re
import unicodedata
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [12]:
# 1. Load raw JSON lines
# The file is parsed as JSON Lines: one JSON object per line. The encoding is
# passed explicitly because open() otherwise uses the platform locale (for
# example cp1252 on Windows), which can fail on or mis-decode non-ASCII text;
# JSON text is UTF-8 by specification.
with open("../data/raw/News_Category_Dataset_v3.json", "r", encoding="utf-8") as f:
    # Skip whitespace-only lines, which json.loads would reject.
    data = [json.loads(line) for line in f if line.strip()]

# 2. Convert to DataFrame
# Keep only the three columns used below and drop rows with missing values.
# NOTE: dropna() removes NaN/None only; empty strings are not treated as missing.
df = pd.DataFrame(data)
df = df[["headline", "category", "short_description"]].dropna()

# 3. Clean the text
# Compiled once at definition time; clean_text is applied to every row.
# URLs must start at a word boundary and carry a real scheme ("http://",
# "https://") or a "www." prefix, so ordinary words such as "awwww" are not
# mistaken for URLs and truncated.
_URL_RE = re.compile(r"\b(?:https?://|www\.)\S+")
_NON_LETTER_RE = re.compile(r"[^a-z\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize text to lowercase ASCII letters separated by single spaces.

    Steps, in order:
      1. Unicode NFKD decomposition and lowercasing, so accented letters keep
         their base letter ("Beyoncé" -> "beyonce") instead of being deleted.
      2. Removal of URLs ("http://...", "https://...", "www. ...").
      3. Removal of every character that is not a-z or whitespace. This drops
         punctuation, digits and the combining marks produced by step 1.
         Characters are deleted, not replaced by a space, so "trump's"
         becomes "trumps".
      4. Collapsing of whitespace runs to one space and trimming of both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. It is "" when nothing survives cleaning or when
        ``text`` is not a string (for example None or a float NaN).
    """
    if not isinstance(text, str):
        # Missing values reach here as None/NaN; treat them as empty text
        # rather than raising AttributeError on .lower().
        return ""

    text = unicodedata.normalize("NFKD", text).lower()
    text = _URL_RE.sub("", text)
    text = _NON_LETTER_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

# Derive the cleaned text column from the raw headline.
df["clean_headline"] = df["headline"].apply(clean_text)

# clean_text returns "" when nothing is left after cleaning (for example an
# empty headline). to_csv writes "" as an empty field, and pandas.read_csv
# loads empty fields back as NaN by default, so drop those rows here instead
# of shipping rows with missing text into the train/test files.
df = df[df["clean_headline"] != ""]

In [13]:
# 4. Stratified 80/20 split: stratifying on "category" keeps the label
#    proportions the same in both partitions; the fixed seed makes the split
#    repeatable.
labels = df["category"]
train_df, test_df = train_test_split(
    df, stratify=labels, test_size=0.2, random_state=42
)

# 5. Write both partitions as CSV, creating the output folder if it is missing
output_dir = Path("../data/processed")
output_dir.mkdir(exist_ok=True, parents=True)

exports = (
    (train_df, "../data/processed/train.csv"),
    (test_df, "../data/processed/test.csv"),
)
for frame, destination in exports:
    # index=False: the row index is not written as a column.
    frame.to_csv(destination, index=False)

print("✅ Data ingestion and cleaning completed.")

✅ Data ingestion and cleaning completed.
