# Backfill Hugging Face Dataset Repo
Utility notebook that backfills the Hugging Face dataset repo with daily Parquet files built from local CSV exports.

In [47]:
import pandas as pd
import os
from huggingface_hub import HfApi
import dotenv
import pyarrow
dotenv.load_dotenv()

True

### Import Files

In [64]:
# Daily CSV exports to backfill, one file name per day.
files = [
    "2025-04-14.csv",
    "2025-04-15.csv",
    "2025-04-16.csv",
    "2025-04-17.csv",
]

In [65]:
# Read every CSV listed in `files` from ./data/ and concatenate them in a
# single call. Calling pd.concat inside the loop re-copies all previously
# loaded rows on each iteration.
# The per-file row index is kept (no ignore_index), so index values can repeat
# across files. An empty `files` list still yields an empty DataFrame.
df = (
    pd.concat([pd.read_csv(f"./data/{file}") for file in files])
    if files
    else pd.DataFrame()
)

In [66]:
# Spot-check: show every row loaded for one specific post_id.
df.loc[df["post_id"] == "1jzdubo"]

Unnamed: 0,subreddit,created_at,retrieved_at,type,text,score,post_id,parent_id
105,GooglePixel,2025-04-14 18:57:19-05:00,2025-04-14 23:44:33.195154-05:00,post,Pixel 9a - USB C DAC issues\n\nI got a 9a afte...,5,1jzdubo,
79,GooglePixel,2025-04-14 18:57:19-05:00,2025-04-15 16:28:20.955459-05:00,post,Pixel 9a - USB C DAC issues\n\nI got a 9a afte...,10,1jzdubo,


In [67]:
# Annotate each row with the number of rows that share its post_id.
df["post_id_count"] = (
    df.groupby("post_id")["post_id"]
    .transform("count")
)

In [68]:
# Parse the retrieval timestamps so the .dt accessor is available.
df["retrieved_at"] = pd.to_datetime(df["retrieved_at"])

# Number of rows whose retrieval date is 2025-04-15.
count_0415 = (
    df["retrieved_at"].dt.date
    .eq(pd.Timestamp("2025-04-15").date())
    .sum()
)

print(count_0415)

299


### Script

In [90]:
# --- CONFIGURE THESE ---
# Hub repository that receives the uploads, and its repo type.
REPO_ID = "hblim/top_reddit_posts_daily"
REPO_TYPE = "dataset"
# ------------------------

def dedupe_keep_earliest(
    df: pd.DataFrame,
    id_col: str = "post_id",
    time_col: str = "retrieved_at",
) -> pd.DataFrame:
    """Return one row per ``id_col`` value, keeping the earliest ``time_col``.

    The work is done on a copy, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; must contain ``id_col`` and ``time_col``.
    id_col : str
        Column whose values identify duplicates.
    time_col : str
        Column parsed with ``pd.to_datetime`` and used to order the rows.

    Returns
    -------
    pandas.DataFrame
        Deduplicated rows sorted by ``time_col`` ascending, with a fresh
        0..n-1 index.
    """
    parsed = df.copy()
    parsed[time_col] = pd.to_datetime(parsed[time_col])
    return (
        parsed
        # Stable sort: rows with identical timestamps keep their input order,
        # so the surviving duplicate does not depend on the sort algorithm.
        .sort_values(time_col, kind="mergesort")
        .drop_duplicates(subset=id_col, keep="first")
        .reset_index(drop=True)
    )

def upload_daily_slices_parquet(df: pd.DataFrame, repo_id: str, repo_type: str = "dataset") -> None:
    """Split ``df`` by retrieval date and upload one Parquet file per date.

    Each date's rows are written to a temporary ``posts_<date>.parquet`` file
    in the working directory, uploaded to ``data_raw/<date>.parquet`` in the
    target repo, and the temporary file is then removed (also when the write
    or upload fails).

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to upload. ``df["retrieved_at"]`` must already be datetime-typed
        because the ``.dt`` accessor is used on it. The frame is not modified.
    repo_id : str
        Repository id passed to ``HfApi.upload_file``.
    repo_type : str
        Repository type passed to ``HfApi.upload_file``.
    """
    api = HfApi()  # make sure you've authenticated (e.g. `huggingface-cli login`)

    # Group on a derived key rather than writing a helper "date" column into
    # the caller's DataFrame.
    day_key = df["retrieved_at"].dt.date

    for date, group in df.groupby(day_key):
        # Helper columns are kept out of the uploaded files; errors="ignore"
        # lets the function work whether or not they are present.
        shard = group.drop(columns=["date", "post_id_count"], errors="ignore")
        filename = f"posts_{date}.parquet"
        path_in_repo = f"data_raw/{date}.parquet"

        try:
            # write Parquet
            shard.to_parquet(filename, index=False)

            # upload to HF
            api.upload_file(
                path_or_fileobj=filename,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
        finally:
            # cleanup, even if the write or upload raised
            if os.path.exists(filename):
                os.remove(filename)

        print(f"Uploaded {len(shard)} rows for {date} → {path_in_repo}")


# Keep only the earliest-retrieved row for each post_id.
df_clean = dedupe_keep_earliest(df, id_col="post_id", time_col="retrieved_at")

# Write one Parquet file per retrieval date and push each to the Hub repo.
upload_daily_slices_parquet(df_clean, repo_id=REPO_ID, repo_type=REPO_TYPE)

No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded 312 rows for 2025-04-14 → data_raw/2025-04-14.parquet


No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded 258 rows for 2025-04-15 → data_raw/2025-04-15.parquet
Uploaded 330 rows for 2025-04-16 → data_raw/2025-04-16.parquet


No files have been modified since last commit. Skipping to prevent empty commit.


Uploaded 324 rows for 2025-04-17 → data_raw/2025-04-17.parquet


### Load Data

In [78]:
# Fetch the shard for a single day from the dataset repo and preview it.
date_str = "2025-04-18"
repo_id = "hblim/top_reddit_posts_daily"
api = HfApi()

# The download call returns a local path that pandas reads directly.
today_path = api.hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename=f"data_raw/{date_str}.parquet",
)
df_today = pd.read_parquet(today_path)

print(f"Records for {date_str}:")
df_today.head()

Records for 2025-04-18:


Unnamed: 0,subreddit,created_at,retrieved_at,type,text,score,post_id,parent_id
0,apple,2025-04-17 19:59:44-05:00,2025-04-18 12:46:10.631577-05:00,post,Apple wanted people to vibe code Vision Pro ap...,427,1k1sn9w,
1,apple,2025-04-17 20:17:24-05:00,2025-04-18 12:46:10.631577-05:00,comment,"Using Siri? You want me to build, test and rel...",793,mnor2mf,t3_1k1sn9w
2,apple,2025-04-17 20:02:06-05:00,2025-04-18 12:46:10.631577-05:00,comment,Wtf is vibe coding?? \n \nWe're reaching incre...,216,mnoom31,t3_1k1sn9w
3,apple,2025-04-17 20:05:37-05:00,2025-04-18 12:46:10.631577-05:00,comment,What.,154,mnop6rz,t3_1k1sn9w
4,apple,2025-04-17 20:37:08-05:00,2025-04-18 12:46:10.631577-05:00,comment,Here's something I found on the web about vibe...,132,mnou85s,t3_1k1sn9w


In [79]:

repo_id = "hblim/top_reddit_posts_daily"
api = HfApi()

# Find every Parquet shard stored under data_raw/ in the dataset repo.
all_files = api.list_repo_files(repo_id, repo_type="dataset")
parquet_files = sorted(
    name
    for name in all_files
    if name.startswith("data_raw/") and name.endswith(".parquet")
)

# Download each shard and read it into its own DataFrame.
dfs = [
    pd.read_parquet(
        api.hf_hub_download(repo_id=repo_id, filename=shard, repo_type="dataset")
    )
    for shard in parquet_files
]

# Stack the shards into one frame with a fresh 0..n-1 index.
df_all = pd.concat(dfs, ignore_index=True)
print(f"Total records across {len(dfs)} days: {len(df_all)}")

Total records across 5 days: 1443
