In [5]:
import requests
import pandas as pd

# API endpoint and query parameters for the Hugging Face dataset viewer.
BASE_URL = "https://datasets-server.huggingface.co/rows"
DATASET = "nateraw/us-accidents"
CONFIG = "default"
SPLIT = "train"
BATCH_SIZE = 100  # rows per request; the /rows endpoint accepts at most 100
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the loop

offset = 0  # index of the next row to request (== rows received so far)
all_data = []  # one DataFrame per successfully fetched batch
total_rows = None  # size of the split as reported by the server, once known

print("Fetching data in batches...")

# Page through the split until the server runs out of rows, reports that we
# have reached the total, or a request fails. On failure we stop but keep the
# batches collected so far in `all_data`.
while True:
    params = {
        "dataset": DATASET,
        "config": CONFIG,
        "split": SPLIT,
        "offset": offset,
        "length": BATCH_SIZE,
    }
    try:
        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Transport-level problems (DNS, reset, timeout) are reported the same
        # way as HTTP errors instead of aborting with a traceback.
        print(f"Failed to fetch data: {exc}")
        break

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}, {response.text}")
        break

    data = response.json()
    rows = data.get("rows", [])

    # The /rows response carries the split size in the top-level
    # "num_rows_total" field; remember the last value the server gave us.
    reported_total = data.get("num_rows_total")
    if reported_total is not None:
        total_rows = reported_total

    if not rows:
        print("No more rows to fetch.")
        break

    # Each element of "rows" wraps the actual record under the "row" key.
    df = pd.DataFrame([row["row"] for row in rows])
    all_data.append(df)

    # Advance by what was actually received so that a short page neither
    # inflates the progress count nor causes rows to be skipped.
    offset += len(rows)

    print(f"Fetched {len(rows)} rows. Total fetched so far: {offset}")

    if total_rows is not None and offset >= total_rows:
        break

# Combine the per-batch frames into one DataFrame, write it to CSV, and print
# a short summary. Everything that needs `full_dataset` lives inside the
# non-empty branch: when no batch was fetched there is nothing to concatenate,
# so `full_dataset` is never bound and must not be referenced.
csv_file = "us_accidents_full_dataset.csv"

if all_data:
    full_dataset = pd.concat(all_data, ignore_index=True)
    print(f"Total rows fetched: {len(full_dataset)}")

    full_dataset.to_csv(csv_file, index=False)
    print(f"Dataset saved to {csv_file}")

    print("Dataset Info:")
    full_dataset.info()

    print("\nFirst Few Rows:")
    print(full_dataset.head())
else:
    print("No data fetched.")

Fetching data in batches...
Fetched 100 rows. Total fetched so far: 100
Fetched 100 rows. Total fetched so far: 200
Fetched 100 rows. Total fetched so far: 300
Fetched 100 rows. Total fetched so far: 400
Fetched 100 rows. Total fetched so far: 500
Fetched 100 rows. Total fetched so far: 600
Fetched 100 rows. Total fetched so far: 700
Fetched 100 rows. Total fetched so far: 800
Fetched 100 rows. Total fetched so far: 900
Fetched 100 rows. Total fetched so far: 1000
Fetched 100 rows. Total fetched so far: 1100
Fetched 100 rows. Total fetched so far: 1200
Fetched 100 rows. Total fetched so far: 1300
Fetched 100 rows. Total fetched so far: 1400
Fetched 100 rows. Total fetched so far: 1500
Fetched 100 rows. Total fetched so far: 1600
Fetched 100 rows. Total fetched so far: 1700
Fetched 100 rows. Total fetched so far: 1800
Fetched 100 rows. Total fetched so far: 1900
Fetched 100 rows. Total fetched so far: 2000
Fetched 100 rows. Total fetched so far: 2100
Fetched 100 rows. Total fetched so f

  full_dataset = pd.concat(all_data, ignore_index=True)


Total rows fetched: 294600
Dataset saved to us_accidents_full_dataset.csv
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294600 entries, 0 to 294599
Data columns (total 47 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   ID                     294600 non-null  object 
 1   Severity               294600 non-null  int64  
 2   Start_Time             294600 non-null  object 
 3   End_Time               294600 non-null  object 
 4   Start_Lat              294600 non-null  float64
 5   Start_Lng              294600 non-null  float64
 6   End_Lat                294600 non-null  float64
 7   End_Lng                294600 non-null  float64
 8   Distance(mi)           294600 non-null  float64
 9   Description            294600 non-null  object 
 10  Number                 77463 non-null   float64
 11  Street                 294600 non-null  object 
 12  Side                   294600 non-null  object 
 13  C