In [10]:
#!/usr/bin/env python
"""
Simple script to read and display TikTok data from parquet files.
"""

import pandas as pd
from pathlib import Path
import os
import sys

In [11]:
# Directory that holds the exported TikTok parquet datasets; the relative path
# is resolved against the current working directory.
OUT_DIR = Path("../tiktok_data")
print(OUT_DIR)
parquet_files = list(OUT_DIR.glob("*.parquet"))

if not parquet_files:
    # Raise instead of calling exit(): exit() is an interactive-shell helper,
    # not meant for program code, and an exception stops this cell (and a
    # "Run All") with a clear message while leaving the session usable.
    raise FileNotFoundError(f"No parquet files found in {OUT_DIR}")

# Sort by modification time (newest first)
parquet_files = sorted(parquet_files, key=os.path.getmtime, reverse=True)
print(parquet_files)

# Allow selecting a specific file from the command line.
# NOTE(review): inside a Jupyter kernel sys.argv presumably holds the kernel
# launcher's own arguments, so this branch is expected to matter only when the
# code runs as a plain script -- confirm.
if len(sys.argv) > 1 and sys.argv[1].endswith('.parquet'):
    # Compare fully resolved paths so the requested file is recognised however
    # it was spelled (absolute, relative, with or without "./").
    requested_path = Path(sys.argv[1]).resolve()
    matching_files = [p for p in parquet_files if p.resolve() == requested_path]
    if matching_files:
        selected_file = matching_files[0]
    else:
        print(f"File {sys.argv[1]} not found. Using most recent file instead.")
        selected_file = parquet_files[0]
else:
    # Use the most recent file by default
    selected_file = parquet_files[0]

print(f"Reading {selected_file}")

# Load the data
df = pd.read_parquet(selected_file)

../tiktok_data
[PosixPath('../tiktok_data/tiktok_beauty_dataset.parquet')]
Reading ../tiktok_data/tiktok_beauty_dataset.parquet


In [39]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,video_id,posted_ts,description,author_id,author_name,follower_count,view_count,like_count,share_count,comment_count,repost_count,thumbnail_path,top_comments
0,7485152103380389127,1742773000.0,Bubble took every impurities🫧 #skincare #korea...,7389943393130562577,heylina2484,99800,11383654,816847,42736,2857,0,../tiktok_data/thumbnails/7485152103380389127.jpg,"[i thought it was mold..., ITU PASTI GA READY ..."
1,7478688849296772359,1741268000.0,Bobacream… plus PDRN… and the result? 🧋➕🐟 It’s...,7288964931759457281,mintfactory1995,9474,1743649,76443,1821,388,0,../tiktok_data/thumbnails/7478688849296772359.jpg,"[POV: cewek natural yang cowok maksud..🗿, If I..."
2,7483154654604397842,1742308000.0,#skincare #skintok #kbeauty #koreanskincare #o...,7451563120218276880,malangcutiee,2592,2635129,153172,8892,972,0,../tiktok_data/thumbnails/7483154654604397842.jpg,"[Для подростковой кожи подойдет?, skin1004??, ..."
3,7485337053383625991,1742816000.0,My breakout routine pt.2 #acneproneskin #skinc...,7411074968203953169,frekkie59,8022,319306,47786,5522,48,0,../tiktok_data/thumbnails/7485337053383625991.jpg,"[is it really good... I'm scared, So close! Ur..."
4,7497508042016115976,1745650000.0,cleansing @skin1004 cleanser @celimax serum @l...,7380385709847036929,cami_kana,107200,2920533,260270,13394,1895,0,../tiktok_data/thumbnails/7497508042016115976.jpg,"[kulit mukanyaa impiann bangett, pengaruh skin..."


In [6]:
# Repair thumbnail paths that do not resolve from the current working
# directory by adding a "../" prefix.
#
# A path is rewritten only when it is missing as stored AND the prefixed
# candidate exists on disk. This keeps the cell idempotent: re-running it never
# stacks additional "../" prefixes onto thumbnails that are genuinely absent.
def _needs_parent_prefix(path_str):
    """Return True when `path_str` is missing but "../" + `path_str` exists."""
    return not Path(path_str).exists() and Path("../" + path_str).exists()


# astype(bool) keeps the mask usable as a boolean indexer even for an empty frame.
mask = df['thumbnail_path'].apply(_needs_parent_prefix).astype(bool)
df.loc[mask, 'thumbnail_path'] = "../" + df.loc[mask, 'thumbnail_path']

# Show the rows whose path was rewritten.
df.loc[mask]

Unnamed: 0,video_id,posted_ts,description,author_id,author_name,follower_count,view_count,like_count,share_count,comment_count,repost_count,thumbnail_path,top_comments
219,7493807194861800759,1.744788e+09,Yang berisi jg gamau ama lu bang ☺️🙏🏻,6794202507020403713,anakmamamitaa,333600,20810,464,7,8,0,../tiktok_data/thumbnails/7493807194861800759.jpg,[]
220,7488752348186103048,1.743611e+09,#thr 😂😂😂@ayyagoreng @moena harharah,6729865179024098305,jihanharharaaa,391900,12826,183,11,5,0,../tiktok_data/thumbnails/7488752348186103048.jpg,[]
221,7472300275156421896,1.739780e+09,아침 스킨케어: 새학기 전에 트러블 빨리 없애는 루틴❤️☀️ 석시닉 패드 관련해서 ...,7451052229820449793,m1nsxeo,1945,1285235,6339,215,257,0,../tiktok_data/thumbnails/7472300275156421896.jpg,[]
222,7481378810148015382,1.741894e+09,🧊💦💧#fyp #pimples #water #skincare,7451751204190061590,juliaamtrp,514,2186080,234003,13322,1345,0,../tiktok_data/thumbnails/7481378810148015382.jpg,[]
223,7484674587767360775,1.742662e+09,double cleansing time⏰️ #skincare #kbeauty #ks...,6997275258290652162,dongzik_skin,56200,3395869,120349,9992,429,0,../tiktok_data/thumbnails/7484674587767360775.jpg,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8336,7461220121030184208,1.737201e+09,Korean everyday no makeup makeup must haves fr...,6672666555246265346,l_yuhann,653200,1137447,116576,2260,179,0,../tiktok_data/thumbnails/7461220121030184208.jpg,[]
8337,7480827551217224967,1.741766e+09,色んなメイクするのだいすき#06 #fyp,7094212901486150657,12vul9,39000,172189,17926,67,54,0,../tiktok_data/thumbnails/7480827551217224967.jpg,[]
8338,7419253076047023377,1.727662e+09,ウォニョンのメイク👧💄#韓国アイドル #メイク #すっぴん #垢抜け,7412569931830805510,yuuui_akanuke,180,209864,7013,33,14,0,../tiktok_data/thumbnails/7419253076047023377.jpg,[]
8339,7502806144163826951,1.746883e+09,ナチュラル裸眼メイク🎀,7347675480616895495,_sw718,55200,20486,688,2,3,0,../tiktok_data/thumbnails/7502806144163826951.jpg,[]


In [15]:
# Show every row whose thumbnail_path is the empty string.
has_blank_thumbnail = df['thumbnail_path'].eq("")
df[has_blank_thumbnail]

Unnamed: 0,video_id,posted_ts,description,author_id,author_name,follower_count,view_count,like_count,share_count,comment_count,repost_count,thumbnail_path,top_comments


In [11]:
# Number of rows whose top_comments entry has length zero.
comment_counts = df['top_comments'].str.len()
int(comment_counts.eq(0).sum())

906

In [29]:
def _strip_parent_refs(path_str):
    """Return `path_str` with every '../' occurrence removed."""
    return path_str.replace('../', '')


# Remove all '../' segments from the stored thumbnail paths.
df['thumbnail_path'] = df['thumbnail_path'].map(_strip_parent_refs)

In [12]:
# Total number of rows currently in the frame.
df.shape[0]

10341

In [14]:
# Flag rows whose video_id already appeared in an earlier row and report the count.
repeated_video = df.duplicated(subset=['video_id'])
print("Number of duplicate video_ids:", repeated_video.sum())

# Keep the first row for each video_id, then preview the result.
df = df.drop_duplicates(subset=['video_id'], keep='first')
df.head(n=5)

Number of duplicate video_ids: 0


Unnamed: 0,video_id,posted_ts,description,author_id,author_name,follower_count,view_count,like_count,share_count,comment_count,repost_count,thumbnail_path,top_comments
0,7485152103380389127,1742773000.0,Bubble took every impurities🫧 #skincare #korea...,7389943393130562577,heylina2484,99800,11383654,816847,42736,2857,0,../tiktok_data/thumbnails/7485152103380389127.jpg,"[i thought it was mold..., ITU PASTI GA READY ..."
1,7478688849296772359,1741268000.0,Bobacream… plus PDRN… and the result? 🧋➕🐟 It’s...,7288964931759457281,mintfactory1995,9474,1743649,76443,1821,388,0,../tiktok_data/thumbnails/7478688849296772359.jpg,"[POV: cewek natural yang cowok maksud..🗿, If I..."
2,7483154654604397842,1742308000.0,#skincare #skintok #kbeauty #koreanskincare #o...,7451563120218276880,malangcutiee,2592,2635129,153172,8892,972,0,../tiktok_data/thumbnails/7483154654604397842.jpg,"[Для подростковой кожи подойдет?, skin1004??, ..."
3,7485337053383625991,1742816000.0,My breakout routine pt.2 #acneproneskin #skinc...,7411074968203953169,frekkie59,8022,319306,47786,5522,48,0,../tiktok_data/thumbnails/7485337053383625991.jpg,"[is it really good... I'm scared, So close! Ur..."
4,7497508042016115976,1745650000.0,cleansing @skin1004 cleanser @celimax serum @l...,7380385709847036929,cami_kana,107200,2920533,260270,13394,1895,0,../tiktok_data/thumbnails/7497508042016115976.jpg,"[kulit mukanyaa impiann bangett, pengaruh skin..."


In [5]:
# Row count of the frame at this point in the notebook.
len(df.index)

8325