In [1]:
import requests
import json
import pandas as pd
from pathlib import Path
import time
from typing import Dict, List, Optional

In [2]:
class HuggingFaceAPIDownloader:
    def __init__(self, dataset_name="ithieund/VietNews-Abs-Sum", data_dir="./data"):
        self.dataset_name = dataset_name
        self.data_dir = Path(data_dir)
        self.raw_dir = self.data_dir / "raw"
        self.processed_dir = self.data_dir / "processed"
        
        self.raw_dir.mkdir(parents=True, exist_ok=True)
        self.processed_dir.mkdir(parents=True, exist_ok=True)
        
        # API endpoints
        self.base_url = "https://datasets-server.huggingface.co"
        self.dataset_url = dataset_name.replace("/", "%2F")
        
    def get_dataset_info(self) -> Optional[Dict]:
        try:
            url = f"{self.base_url}/info?dataset={self.dataset_url}"
            print(f"Fetching dataset info...")
            
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            
            info = response.json()
            
            info_path = self.raw_dir / "dataset_info.json"
            with open(info_path, 'w', encoding='utf-8') as f:
                json.dump(info, f, indent=2, ensure_ascii=False)
            
            print("Dataset info retrieved")
            return info
            
        except Exception as e:
            print(f"Error getting dataset info: {e}")
            return None
    
    def get_dataset_splits(self) -> List[str]:
        try:
            url = f"{self.base_url}/splits?dataset={self.dataset_url}"
            print("Fetching available splits...")
            
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            
            splits_info = response.json()
            splits = [split['split'] for split in splits_info['splits']]
            
            print(f"Available splits: {splits}")
            return splits
            
        except Exception as e:
            print(f"Error getting splits: {e}")
            return ['train', 'test']  # fallback
    
    def get_first_rows(self, split="train", length=100) -> Optional[Dict]:
        try:
            url = f"{self.base_url}/first-rows"
            params = {
                'dataset': self.dataset_name,
                'config': 'default',
                'split': split,
                'length': length
            }
            print(f"Fetching first {length} rows from {split} split...")
            
            response = requests.get(url, params=params, timeout=60)
            response.raise_for_status()
            
            data = response.json()
            print(f"Retrieved {len(data['rows'])} rows from {split}")
            return data
            
        except Exception as e:
            print(f"Error getting first rows for {split}: {e}")
            return None
    
    def download_all_data_paginated(self, split="train", batch_size=100, max_rows=None):
        try:
            print(f"Starting paginated download for {split} split...")
            all_rows = []
            offset = 0
            
            while True:
                url = f"{self.base_url}/rows"
                params = {
                    'dataset': self.dataset_name,
                    'config': 'default',
                    'split': split,
                    'offset': offset,
                    'length': batch_size
                }
                print(f"Fetching rows {offset} to {offset + batch_size}...")
                response = requests.get(url, params=params, timeout=60)
                
                if response.status_code == 404:
                    print("No more data available")
                    break
                    
                response.raise_for_status()
                data = response.json()
                
                batch_rows = data['rows']
                if not batch_rows:
                    print("No more rows to fetch")
                    break
                
                all_rows.extend(batch_rows)
                print(f"Total rows collected: {len(all_rows)}")
                
                if max_rows and len(all_rows) >= max_rows:
                    all_rows = all_rows[:max_rows]
                    print(f"Reached maximum rows limit: {max_rows}")
                    break
                
                offset += batch_size
                time.sleep(1)  # Be nice to the API
                
                # Break if we got less than batch_size (likely the end)
                if len(batch_rows) < batch_size:
                    break
            
            return all_rows
            
        except Exception as e:
            print(f"Error during paginated download: {e}")
            return []
    
    def process_and_save_data(self, rows: List[Dict], split: str):
        if not rows:
            print(f"No data to save for {split}")
            return
        processed_rows = []
        for row in rows:
            if 'row' in row:
                processed_rows.append(row['row'])
            else:
                processed_rows.append(row)
        
        # Save as JSON
        json_path = self.raw_dir / f"{split}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(processed_rows, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(processed_rows)} rows to {json_path}")
        
        # Save as CSV
        try:
            df = pd.DataFrame(processed_rows)
            csv_path = self.raw_dir / f"{split}.csv"
            df.to_csv(csv_path, index=False, encoding='utf-8')
            print(f"Saved CSV to {csv_path}")
            
            # Show basic info
            print(f"{split.upper()} Split Info:")
            print(f"Rows: {len(df)}")
            print(f"Columns: {list(df.columns)}")
            
            # Show sample
            if len(df) > 0:
                print(f"Sample data:")
                for col in df.columns:
                    sample_val = str(df[col].iloc[0])
                    preview = sample_val[:100] + "..." if len(sample_val) > 100 else sample_val
                    print(f"     {col}: {preview}")
            
        except Exception as e:
            print(f"Could not create CSV: {e}")
    
    def download_complete_dataset(self, max_rows_per_split=None):
        print("🇻🇳 VietNews Dataset Download via HuggingFace API")
        print("=" * 60)
        
        print("\n1. Getting dataset information...")
        dataset_info = self.get_dataset_info()
        
        print("\n2. Getting available splits...")
        splits = self.get_dataset_splits()
        
        print("\n3. Downloading data splits...")
        for split in splits:
            print(f"\n--- Downloading {split.upper()} split ---")
            
            try:
                rows = self.download_all_data_paginated(
                    split=split, 
                    batch_size=100, 
                    max_rows=max_rows_per_split
                )
                
                if rows:
                    self.process_and_save_data(rows, split)
                else:
                    # Fallback to first-rows if full download fails
                    print(f"Falling back to first-rows for {split}...")
                    data = self.get_first_rows(split=split, length=100)
                    if data:
                        self.process_and_save_data(data['rows'], split)
                
            except Exception as e:
                print(f"Error downloading {split}: {e}")
                continue
        
        print("\n4. Download Summary:")
        print("-" * 30)
        
        files = list(self.raw_dir.glob("*"))
        for file_path in files:
            print(f"{file_path.name} ({file_path.stat().st_size / 1024:.1f} KB)")
        
        print(f"\nDownload completed!")
        print(f"Data location: {self.data_dir}")
    
    def quick_test_api(self):
        print("Testing HuggingFace API...")
        try:
            url = f"{self.base_url}/first-rows"
            params = {
                'dataset': self.dataset_name,
                'config': 'default',
                'split': 'train',
                'length': 5
            }
            
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            
            data = response.json()
            print(f"API working! Got {len(data['rows'])} sample rows")
            
            # Show structure
            if data['rows']:
                sample_row = data['rows'][0]
                if 'row' in sample_row:
                    print(f"Sample columns: {list(sample_row['row'].keys())}")
                else:
                    print(f"Sample columns: {list(sample_row.keys())}")
            return True
            
        except Exception as e:
            print(f"API test failed: {e}")
            return False


In [3]:
def quick_download():
    downloader = HuggingFaceAPIDownloader()
    if downloader.quick_test_api():
        downloader.download_complete_dataset(max_rows_per_split=1000)  # Limit for testing
    else:
        print("API not accessible. Please check your internet connection.")

def full_download():
    downloader = HuggingFaceAPIDownloader()
    downloader.download_complete_dataset()  # No limits

In [4]:
print("Choose download option:")
print("1. Quick test (1000 rows per split)")
print("2. Full download (all data)")
    
choice = input("Enter choice (1 or 2): ").strip()
    
if choice == "1":
    quick_download()
elif choice == "2":
    full_download()
else:
    print("Invalid choice. Running quick test...")
    quick_download()

Choose download option:
1. Quick test (1000 rows per split)
2. Full download (all data)
Testing HuggingFace API...
API working! Got 62 sample rows
Sample columns: ['guid', 'title', 'abstract', 'article']
🇻🇳 VietNews Dataset Download via HuggingFace API

1. Getting dataset information...
Fetching dataset info...
Dataset info retrieved

2. Getting available splits...
Fetching available splits...
Available splits: ['train', 'validation', 'test']

3. Downloading data splits...

--- Downloading TRAIN split ---
Starting paginated download for train split...
Fetching rows 0 to 100...
Total rows collected: 100
Fetching rows 100 to 200...
Total rows collected: 200
Fetching rows 200 to 300...
Total rows collected: 300
Fetching rows 300 to 400...
Total rows collected: 400
Fetching rows 400 to 500...
Total rows collected: 500
Fetching rows 500 to 600...
Total rows collected: 600
Fetching rows 600 to 700...
Total rows collected: 700
Fetching rows 700 to 800...
Total rows collected: 800
Fetching row