In [32]:
import os
import kagglehub
import pandas as pd
from kagglehub import KaggleDatasetAdapter

In [33]:
RAW_DATA_PATH = "../data/raw/"
os.makedirs(RAW_DATA_PATH, exist_ok=True)

In [34]:
datasets = [
    {"handle": "programmerrdai/nasdaq-100-ndx-historical-data-daily-prices", "name": "nasdaq_history.csv", "file": "nasdaq_100_historical_data.csv"},
    {"handle": "jacksoncrow/stock-market-dataset", "name": "stock_market_full.csv", "file": "stocks/AAPL.csv"}, # Example: Loading AAPL as a benchmark
    {"handle": "miguelaenlle/parsed-sec-10q-filings-since-2006", "name": "sec_10q_parsed.csv", "file": "data.csv"},
    {"handle": "mann14/global-ai-and-data-science-job-market-20202026", "name": "global_salary_benchmarks.csv", "file": "ai_job_market_2024.csv"},
    {"handle": "amar5693/india-job-market-and-salary-trends-2026", "name": "india_salary_benchmarks.csv", "file": "india_job_market.csv"},
    {"handle": "verracodeguacas/sec-edgar-filings-index", "name": "sec_filings_index.csv", "file": "index.csv"},
    {"handle": "yanmaksi/big-startup-secsees-fail-dataset-from-crunchbase", "name": "crunchbase_startups.csv", "file": "big_startup_secsees_fail_dataset.csv"}
]

In [35]:
def master_ingest():
    print(f"--- Starting Master Ingestion to {RAW_DATA_PATH} ---\n")
    
    for ds in datasets:
        try:
            print(f"Action: Downloading {ds['handle']}...")
            
            # Use the newer dataset_load and specify the actual file name
            df_tmp = kagglehub.dataset_load(
                KaggleDatasetAdapter.PANDAS,
                ds['handle'],
                ds['file'] # Explicitly naming the file fixes the extension error
            )
            
            save_file = os.path.join(RAW_DATA_PATH, ds['name'])
            df_tmp.to_csv(save_file, index=False)
            
            print(f"Status: Saved {ds['name']} ({len(df_tmp)} rows)")
            print("-" * 40)
            
        except Exception as e:
            # If the specific filename fails, we fallback to a manual download
            print(f"Specific file load failed for {ds['name']}, trying general download...")
            try:
                path = kagglehub.dataset_download(ds['handle'])
                print(f"Dataset downloaded to cache: {path}")
                print(f"Please check the folder and update the 'file' name in the script.")
            except Exception as e_inner:
                print(f"Critical error on {ds['handle']}: {e_inner}")

In [None]:
# Execute the download
master_ingest()

--- Starting Master Ingestion to ../data/raw/ ---

Action: Downloading programmerrdai/nasdaq-100-ndx-historical-data-daily-prices...
Specific file load failed for nasdaq_history.csv, trying general download...
Downloading to C:\Users\Asus\.cache\kagglehub\datasets\programmerrdai\nasdaq-100-ndx-historical-data-daily-prices\1.archive...


100%|██████████| 44.0k/44.0k [00:00<00:00, 307kB/s]

Extracting files...
Dataset downloaded to cache: C:\Users\Asus\.cache\kagglehub\datasets\programmerrdai\nasdaq-100-ndx-historical-data-daily-prices\versions\1
Please check the folder and update the 'file' name in the script.
Action: Downloading jacksoncrow/stock-market-dataset...





Downloading to C:\Users\Asus\.cache\kagglehub\datasets\jacksoncrow\stock-market-dataset\versions\2\stocks/AAPL.csv...


100%|██████████| 250k/250k [00:00<00:00, 271kB/s]

Extracting zip of AAPL.csv...





Status: Saved stock_market_full.csv (9909 rows)
----------------------------------------
Action: Downloading miguelaenlle/parsed-sec-10q-filings-since-2006...
Specific file load failed for sec_10q_parsed.csv, trying general download...
Downloading to C:\Users\Asus\.cache\kagglehub\datasets\miguelaenlle\parsed-sec-10q-filings-since-2006\1.archive...


100%|██████████| 4.34M/4.34M [00:04<00:00, 1.09MB/s]

Extracting files...





Dataset downloaded to cache: C:\Users\Asus\.cache\kagglehub\datasets\miguelaenlle\parsed-sec-10q-filings-since-2006\versions\1
Please check the folder and update the 'file' name in the script.
Action: Downloading mann14/global-ai-and-data-science-job-market-20202026...
Specific file load failed for global_salary_benchmarks.csv, trying general download...
Downloading to C:\Users\Asus\.cache\kagglehub\datasets\mann14\global-ai-and-data-science-job-market-20202026\3.archive...


100%|██████████| 2.34M/2.34M [00:02<00:00, 1.16MB/s]

Extracting files...





Dataset downloaded to cache: C:\Users\Asus\.cache\kagglehub\datasets\mann14\global-ai-and-data-science-job-market-20202026\versions\3
Please check the folder and update the 'file' name in the script.
Action: Downloading amar5693/india-job-market-and-salary-trends-2026...
Specific file load failed for india_salary_benchmarks.csv, trying general download...
Downloading to C:\Users\Asus\.cache\kagglehub\datasets\amar5693\india-job-market-and-salary-trends-2026\1.archive...


100%|██████████| 205k/205k [00:01<00:00, 196kB/s]

Extracting files...





Dataset downloaded to cache: C:\Users\Asus\.cache\kagglehub\datasets\amar5693\india-job-market-and-salary-trends-2026\versions\1
Please check the folder and update the 'file' name in the script.
Action: Downloading verracodeguacas/sec-edgar-filings-index...
Specific file load failed for sec_filings_index.csv, trying general download...
Downloading to C:\Users\Asus\.cache\kagglehub\datasets\verracodeguacas\sec-edgar-filings-index\8.archive...


100%|██████████| 41.8M/41.8M [00:33<00:00, 1.31MB/s]

Extracting files...





Dataset downloaded to cache: C:\Users\Asus\.cache\kagglehub\datasets\verracodeguacas\sec-edgar-filings-index\versions\8
Please check the folder and update the 'file' name in the script.
Action: Downloading yanmaksi/big-startup-secsees-fail-dataset-from-crunchbase...
Specific file load failed for crunchbase_startups.csv, trying general download...
Dataset downloaded to cache: C:\Users\Asus\.cache\kagglehub\datasets\yanmaksi\big-startup-secsees-fail-dataset-from-crunchbase\versions\1
Please check the folder and update the 'file' name in the script.
