# Backfill Hugging Face Dataset with De-duplicated CSV Data

In [47]:
import pandas as pd
import os
from huggingface_hub import HfApi
import dotenv
import pyarrow
dotenv.load_dotenv()

True

### Import Files

In [30]:
# Daily CSV exports to backfill: 2025-04-14 through 2025-04-17 (inclusive)
files = [f"2025-04-{day}.csv" for day in range(14, 18)]

In [31]:
# Read each daily CSV once, then concatenate in a single call. Growing a frame
# with pd.concat inside a loop re-copies all previously loaded rows on every
# iteration and starts from an empty frame, which newer pandas warns about.
# Row labels from the individual files are kept as-is (no ignore_index).
frames = [pd.read_csv(f"./data/{file}") for file in files]
df = pd.concat(frames) if frames else pd.DataFrame()

In [32]:
# Spot-check one post_id to see how the same post appears across daily files
df[df["post_id"] == "1jzdubo"]

Unnamed: 0,subreddit,created_at,retrieved_at,type,text,score,post_id,parent_id
105,GooglePixel,2025-04-14 18:57:19-05:00,2025-04-14 23:44:33.195154-05:00,post,Pixel 9a - USB C DAC issues\n\nI got a 9a afte...,5,1jzdubo,
79,GooglePixel,2025-04-14 18:57:19-05:00,2025-04-15 16:28:20.955459-05:00,post,Pixel 9a - USB C DAC issues\n\nI got a 9a afte...,10,1jzdubo,


In [22]:
# For every row, the number of rows in df that share its post_id.
# NOTE(review): this adds a column to df in place. The upload cell passes df
# through dedupe_keep_earliest and writes all columns, so on a top-to-bottom
# run this column would be part of the uploaded Parquet - confirm that is intended.
post_id_counts = df.groupby("post_id")["post_id"].transform("count")
df["post_id_count"] = post_id_counts

In [26]:
# Parse the retrieval timestamps so the .dt accessor can be used below
df["retrieved_at"] = pd.to_datetime(df["retrieved_at"])

# Sanity check: number of rows whose retrieval date is 2025-04-15
target_day = pd.to_datetime("2025-04-15").date()
retrieved_on_target = df["retrieved_at"].dt.date == target_day
count_0415 = retrieved_on_target.sum()

print(count_0415)

299


### Script: De-duplicate and Upload Daily Parquet Files

In [48]:
# --- Upload target: edit these before running ---
REPO_ID = "hblim/top_reddit_posts_daily"  # forwarded as `repo_id` to the upload helper
REPO_TYPE = "dataset"  # forwarded as `repo_type` to the upload helper
# ------------------------------------------------

def dedupe_keep_earliest(df, id_col="post_id", time_col="retrieved_at"):
    """Return one row per ``id_col`` value, keeping the earliest by ``time_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. The frame is copied first, so the caller's data is left
        untouched.
    id_col : str, default "post_id"
        Column whose duplicate values are collapsed to a single row.
    time_col : str, default "retrieved_at"
        Column parsed with ``pd.to_datetime`` and used to order the rows.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``time_col`` ascending, de-duplicated on ``id_col``,
        with a fresh 0..n-1 index. ``time_col`` is datetime-typed in the result.
    """
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    return (
        df
        # A stable sort keeps the input order among rows with identical
        # timestamps, so which duplicate survives keep="first" is deterministic.
        .sort_values(time_col, kind="stable")
        .drop_duplicates(subset=id_col, keep="first")
        .reset_index(drop=True)
    )

def upload_daily_slices_parquet(df, repo_id, repo_type="dataset"):
    """Split ``df`` by calendar day of ``retrieved_at`` and upload one Parquet file per day.

    For each distinct date the rows are written to a local file named
    ``posts_<date>.parquet``, uploaded to ``data_raw/<date>.parquet`` in the
    target repo, and the local file is then deleted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-typed ``retrieved_at`` column (the ``.dt``
        accessor is used on it). The caller's frame is not modified.
    repo_id : str
        Forwarded to ``HfApi.upload_file`` as ``repo_id``.
    repo_type : str, default "dataset"
        Forwarded to ``HfApi.upload_file`` as ``repo_type``.

    Notes
    -----
    Every uploaded file contains all columns of ``df`` plus the derived
    ``date`` column used for grouping.
    """
    api = HfApi()  # make sure you've authenticated (e.g. `huggingface-cli login`)

    # Add the grouping column on a copy so it does not leak into the caller's frame.
    daily = df.copy()
    daily["date"] = daily["retrieved_at"].dt.date

    for date, group in daily.groupby("date"):
        filename     = f"posts_{date}.parquet"
        path_in_repo = f"data_raw/{date}.parquet"

        try:
            # write Parquet
            group.to_parquet(filename, index=False)

            # upload to HF
            api.upload_file(
                path_or_fileobj=filename,
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type=repo_type,
            )
            print(f"Uploaded {len(group)} rows for {date} → {path_in_repo}")
        finally:
            # cleanup: remove the local scratch file even if the write or the
            # upload raised, so failed runs do not leave stray files behind
            if os.path.exists(filename):
                os.remove(filename)


# Collapse repeated post_ids down to their earliest retrieval
df_clean = dedupe_keep_earliest(df)

# Write one Parquet file per retrieval date and push each to the Hub repo
upload_daily_slices_parquet(df_clean, repo_id=REPO_ID, repo_type=REPO_TYPE)

posts_2025-04-14.parquet: 100%|████████████████████████████████████████████████████| 74.1k/74.1k [00:00<00:00, 423kB/s]


Uploaded 312 rows for 2025-04-14 → data_raw/2025-04-14.parquet


posts_2025-04-15.parquet: 100%|████████████████████████████████████████████████████| 58.9k/58.9k [00:00<00:00, 596kB/s]


Uploaded 258 rows for 2025-04-15 → data_raw/2025-04-15.parquet


posts_2025-04-16.parquet: 100%|████████████████████████████████████████████████████| 81.9k/81.9k [00:00<00:00, 381kB/s]


Uploaded 330 rows for 2025-04-16 → data_raw/2025-04-16.parquet


posts_2025-04-17.parquet: 100%|███████████████████████████████████████████████████| 71.0k/71.0k [00:00<00:00, 1.16MB/s]


Uploaded 324 rows for 2025-04-17 → data_raw/2025-04-17.parquet
