In [1]:
#!/usr/bin/env python
"""
Simple script to read and display TikTok data from parquet files.
"""

import pandas as pd
from pathlib import Path
import os
import sys

In [2]:
# Locate the parquet exports, pick one (newest by default), and load it into `df`.
OUT_DIR = Path("../tiktok_data")
print(OUT_DIR)
parquet_files = list(OUT_DIR.glob("*.parquet"))

if not parquet_files:
    print("No parquet files found in", OUT_DIR)
    # Use sys.exit rather than the site-provided exit(): exit() is meant for
    # interactive shells (and typically shuts down a Jupyter kernel), and it
    # reported success (status 0) even though nothing could be loaded.
    sys.exit(1)

# Sort by modification time (newest first)
parquet_files = sorted(parquet_files, key=os.path.getmtime, reverse=True)
print(parquet_files)

# Use the most recent file by default
selected_file = parquet_files[0]

# Allow selecting a specific file from the command line.
# NOTE: under a Jupyter kernel sys.argv typically holds the kernel launcher's
# arguments, so this branch is normally only taken when run as a script.
if len(sys.argv) > 1 and sys.argv[1].endswith('.parquet'):
    target_file = Path(sys.argv[1])
    # Compare resolved paths so that absolute paths, "./"-style spellings and
    # bare filenames inside OUT_DIR all match the globbed entries; a plain
    # `target_file in parquet_files` only matched one exact spelling.
    by_resolved = {p.resolve(): p for p in parquet_files}
    matched_file = None
    for candidate in (target_file, OUT_DIR / target_file):
        matched_file = by_resolved.get(candidate.resolve())
        if matched_file is not None:
            break

    if matched_file is not None:
        selected_file = matched_file
    else:
        print(f"File {sys.argv[1]} not found. Using most recent file instead.")

print(f"Reading {selected_file}")

# Load the data
df = pd.read_parquet(selected_file)

../tiktok_data
[PosixPath('../tiktok_data/tiktok_beauty_dataset.parquet')]
Reading ../tiktok_data/tiktok_beauty_dataset.parquet


In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,video_id,posted_ts,description,author_id,author_name,follower_count,view_count,like_count,share_count,comment_count,repost_count,thumbnail_path,top_comments
0,7485152103380389127,1742773000.0,Bubble took every impurities🫧 #skincare #korea...,7389943393130562577,heylina2484,99800,11383654,816847,42736,2857,0,../tiktok_data/thumbnails/7485152103380389127.jpg,"[i thought it was mold..., ITU PASTI GA READY ..."
1,7478688849296772359,1741268000.0,Bobacream… plus PDRN… and the result? 🧋➕🐟 It’s...,7288964931759457281,mintfactory1995,9474,1743649,76443,1821,388,0,../tiktok_data/thumbnails/7478688849296772359.jpg,"[POV: cewek natural yang cowok maksud..🗿, If I..."
2,7483154654604397842,1742308000.0,#skincare #skintok #kbeauty #koreanskincare #o...,7451563120218276880,malangcutiee,2592,2635129,153172,8892,972,0,../tiktok_data/thumbnails/7483154654604397842.jpg,"[Для подростковой кожи подойдет?, skin1004??, ..."
3,7485337053383625991,1742816000.0,My breakout routine pt.2 #acneproneskin #skinc...,7411074968203953169,frekkie59,8022,319306,47786,5522,48,0,../tiktok_data/thumbnails/7485337053383625991.jpg,"[is it really good... I'm scared, So close! Ur..."
4,7497508042016115976,1745650000.0,cleansing @skin1004 cleanser @celimax serum @l...,7380385709847036929,cami_kana,107200,2920533,260270,13394,1895,0,../tiktok_data/thumbnails/7497508042016115976.jpg,"[kulit mukanyaa impiann bangett, pengaruh skin..."


In [4]:
# Repair thumbnail paths that do not resolve from the current working directory.
# The "../" prefix is applied only when the prefixed path actually exists, so
# paths that cannot be fixed are left untouched and re-running this cell does
# not keep stacking "../" segments onto the column.
def _thumbnail_missing(value):
    """Return True when `value` is not a string path that exists on disk."""
    # Non-string values (e.g. missing entries) count as missing instead of
    # raising TypeError inside Path().
    return not (isinstance(value, str) and Path(value).exists())


def _with_parent_prefix(value):
    """Return "../" + value when that path exists; otherwise return `value` unchanged."""
    if not isinstance(value, str):
        return value
    candidate = "../" + value
    return candidate if Path(candidate).exists() else value


# Rows whose thumbnail path did not exist before the repair.
mask = df['thumbnail_path'].apply(_thumbnail_missing)
df.loc[mask, 'thumbnail_path'] = df.loc[mask, 'thumbnail_path'].apply(_with_parent_prefix)
# Display the rows that were flagged as missing (after the repair attempt).
df.loc[mask]

Unnamed: 0,video_id,posted_ts,description,author_id,author_name,follower_count,view_count,like_count,share_count,comment_count,repost_count,thumbnail_path,top_comments
0,7485152103380389127,1.742773e+09,Bubble took every impurities🫧 #skincare #korea...,7389943393130562577,heylina2484,99800,11383654,816847,42736,2857,0,../../tiktok_data/thumbnails/74851521033803891...,"[i thought it was mold..., ITU PASTI GA READY ..."
1,7478688849296772359,1.741268e+09,Bobacream… plus PDRN… and the result? 🧋➕🐟 It’s...,7288964931759457281,mintfactory1995,9474,1743649,76443,1821,388,0,../../tiktok_data/thumbnails/74786888492967723...,"[POV: cewek natural yang cowok maksud..🗿, If I..."
2,7483154654604397842,1.742308e+09,#skincare #skintok #kbeauty #koreanskincare #o...,7451563120218276880,malangcutiee,2592,2635129,153172,8892,972,0,../../tiktok_data/thumbnails/74831546546043978...,"[Для подростковой кожи подойдет?, skin1004??, ..."
3,7485337053383625991,1.742816e+09,My breakout routine pt.2 #acneproneskin #skinc...,7411074968203953169,frekkie59,8022,319306,47786,5522,48,0,../../tiktok_data/thumbnails/74853370533836259...,"[is it really good... I'm scared, So close! Ur..."
4,7497508042016115976,1.745650e+09,cleansing @skin1004 cleanser @celimax serum @l...,7380385709847036929,cami_kana,107200,2920533,260270,13394,1895,0,../../tiktok_data/thumbnails/74975080420161159...,"[kulit mukanyaa impiann bangett, pengaruh skin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10723,7473045587399560456,1.739954e+09,最強ベースコスメレシピ✨️ 乾燥肌よりの混合肌 この組み合わせが過去一ツヤ、夜まで粉浮きゼロ...,6810028410833486850,ri5luv,444,906798,40698,110,157,0,../tiktok_data/thumbnails/7473045587399560456.jpg,[]
10724,7420799245579537671,1.727789e+09,fweeのグロスを本音レビュー💋,7291011104704037890,mynameismegx,188100,2953675,63504,355,0,0,../tiktok_data/thumbnails/7420799245579537671.jpg,[]
10725,7305933680466283781,1.701045e+09,แป้งความลับนางฟ้าเสกผิวไบรท์ผ่องทั้งวัน คุมมัน...,6580661912760549378,mmiinn_0,4772,1495690,33685,785,67,0,../tiktok_data/thumbnails/7305933680466283781.jpg,[]
10726,7415517523459755280,1.726560e+09,無限涙袋♾🚞 #涙袋 #涙袋メイク #アイメイク #セザンヌ #コスメ #メイク #検証 #...,6616877356026036225,saraparin,290200,1550572,31638,153,119,0,../tiktok_data/thumbnails/7415517523459755280.jpg,[]


In [15]:
# Show the rows whose thumbnail path is the empty string.
df[df['thumbnail_path'].eq("")]

Unnamed: 0,video_id,posted_ts,description,author_id,author_name,follower_count,view_count,like_count,share_count,comment_count,repost_count,thumbnail_path,top_comments


In [6]:
# Number of rows whose top_comments entry has length zero.
int(df['top_comments'].str.len().eq(0).sum())

10609

In [29]:
# Remove every literal "../" occurrence from the thumbnail paths.
# The vectorized .str accessor passes missing values through, whereas the
# element-wise x.replace(...) raised AttributeError on them; regex=False keeps
# the dots literal.
# NOTE(review): this strips all "../" segments, including any the stored paths
# had originally - confirm that is the intended final form.
df['thumbnail_path'] = df['thumbnail_path'].str.replace('../', '', regex=False)

In [7]:
# Total number of rows in the frame.
df.shape[0]

10728