In [2]:
import json
import re
import subprocess
import zipfile
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# 1. Load raw JSON lines
# The raw file is fetched once and then reused on later runs.
raw_data_path = Path("../data/raw/News_Category_Dataset_v3.json")
raw_zip_path = raw_data_path.parent / "news-category-dataset.zip"

if not raw_data_path.exists():
    print("Downloading the dataset...")
    # curl's -o does not create missing directories, so make sure the target exists.
    raw_data_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        subprocess.run(
            [
                "curl",
                "--fail",  # exit non-zero on HTTP errors instead of saving the error page as the zip
                "-L",  # follow redirects
                "-o",
                str(raw_zip_path),
                "https://www.kaggle.com/api/v1/datasets/download/rmisra/news-category-dataset",
            ],
            check=True,  # Raise CalledProcessError if curl exits non-zero
            capture_output=True,  # Capture stdout and stderr
            text=True,  # Ensure output is treated as text
        )
    except subprocess.CalledProcessError as err:
        # check=True raises before any return-code test could run, so the
        # captured stderr has to be surfaced here; re-raise to stop the cell.
        print(f"Error downloading the dataset: {err.stderr}")
        raise

    # Unzip the dataset next to the archive
    with zipfile.ZipFile(raw_zip_path, "r") as zip_ref:
        zip_ref.extractall(raw_data_path.parent)
    print("Dataset downloaded and extracted.")
else:
    print("Dataset already exists.")

# The file holds one JSON object per line; blank lines are skipped so they
# cannot break json.loads. UTF-8 is requested explicitly instead of relying
# on the platform default encoding.
with open(raw_data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# 2. Convert to DataFrame
df = pd.DataFrame(data)
# Keep only the columns used downstream and drop rows with a missing value in
# any of them (dropna removes NaN/None only; empty strings are kept).
df = df[["headline", "category", "short_description"]].dropna()

# 3. Clean the text
def clean_text(text):
    """Normalise a piece of text to lowercase ASCII letters separated by single spaces.

    The text is lowercased, then three regex substitutions run in order:
    URL-like tokens (starting with ``http`` or ``www``) are removed, every
    character that is not ``a``-``z`` or whitespace is removed (this drops
    punctuation, digits and non-ASCII letters alike), and runs of whitespace
    are collapsed to one space. Leading/trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # Order matters: URLs must go before punctuation is stripped, otherwise
    # "https://x.y" would already have been turned into plain letters.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ''),
        (r'[^a-z\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Derive the normalised headline column: clean_text is applied to each headline.
df["clean_headline"] = df["headline"].map(clean_text)

In [None]:
# 4. Stratified split (preserving label distribution)
# stratify keeps the category proportions the same in both parts, and the
# fixed random_state makes the 80/20 split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["category"],
    random_state=42
)

# 5. Save to CSV
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(processed_dir / "train.csv", index=False)
test_df.to_csv(processed_dir / "test.csv", index=False)

# The check mark was previously stored as mis-decoded UTF-8 ("âœ…").
print("✅ Data ingestion and cleaning completed.")