# TikTok Data Consolidation Test

In [34]:
import json
import pandas as pd
from pprint import PrettyPrinter
import glob
from tqdm.notebook import tqdm
import os

In [2]:
data_path = '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw'
with open(f'{data_path}/metoo180_2023-12-13.json', 'r') as f:
    x = json.load(f)

In [4]:
pp = PrettyPrinter(indent=4)

In [27]:
files = glob.glob('/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/*.json')
pp.pprint(files)

[   '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metoospanish180_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metookorean90_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metoohebrew180_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metooitalian90_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metooitalian180_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metooarabic180_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metoohebrew90_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metoospanish90_2023-12-13.json',
    '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/metoochinese180_2023-12-13.json',
    '/data/inet-la

In [30]:
consolidate_dict = {}
for file in tqdm(files):
    with open(file, 'r') as f:
        x = json.load(f)
    
    for item in x['data']:
        video_id = item['aweme_info']['aweme_id']
        author_user_id = item['aweme_info']["author_user_id"]
        desc = item['aweme_info']["desc"]
        desc_lang = item['aweme_info']["desc_language"]
        
        consolidate_dict[video_id]={
            'video_id':video_id,
            'author_id':author_user_id,
            'description':desc,
            'description_lang':desc_lang
        }

  0%|          | 0/24 [00:00<?, ?it/s]

In [31]:
print(len(consolidate_dict))

5776


In [46]:
metadata_df = pd.DataFrame.from_dict(consolidate_dict, orient='index').reset_index(drop=True)

In [36]:
# Step 1: Read .txt files and store in a dictionary
folder_path = '/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/videos_transcribed/'  # Update with the path to your .txt files
text_data = {}
for filename in tqdm(os.listdir(folder_path)):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data[filename[:-4]] = file.read()  # Remove '.txt' from filename
print('data read in')

  0%|          | 0/5685 [00:00<?, ?it/s]

data read in


In [38]:
assert set(text_data.keys()).issubset(set(consolidate_dict.keys()))

In [51]:
text_df = pd.DataFrame.from_dict(text_data, orient='index', columns=['text'])
text_df.reset_index(inplace=True)
text_df.rename(columns={'index': 'video_id'}, inplace=True)

# Merge with metadata
final_dataset = pd.merge(metadata_df, text_df, on='video_id', how='inner')

# Step 4: Your dataset is now ready for BERTopic or other NLP processing
#print(final_dataset.head())

final_dataset.to_parquet('/data/inet-large-scale-twitter-diffusion/ball4321/data_c/TikTok/01_raw/videos_transcribed.parquet')
print('Save complete. Ending.')

Save complete. Ending.


In [50]:
final_dataset

Unnamed: 0,video_id,author_id,description,description_lang,text
0,7312089052117912878,7308770060517098542,▶▶▶▶▶Conectar▶▶▶▶▶Conectar▶▶▶▶▶@hacerclic11223...,es,you
1,7312088080997944619,7303820804470817835,𝕙𝕖𝕝𝕝𝕠ℍ𝕒𝕔𝕖𝕣 𝕔𝕝𝕚𝕔➡➡➡➡@peterwilliamsmd4 @dous_wei...,un,you
2,7311876706988772654,7308904296814249006,▶▶▶▶▶Conectar▶▶▶▶▶@hacerclic666888999 @alvaro....,es,you
3,7311874448993553710,7309409722390283310,▶▶▶▶▶Conectar▶▶▶▶▶Conectar▶▶▶▶▶@hacerclic11223...,es,you
4,7311874097258974507,7309447113259713582,▶▶▶▶▶Conectar▶▶▶▶▶Conectar▶▶▶▶▶@hacerclic11223...,es,you
...,...,...,...,...,...
5680,7279349256312638728,7128629080081662978,今回のオフ会は、ザ・リッツ・カールトン大阪❗️沢山のご来場ありがとうございました😎ここで沢山...,ja,こんなホテルより絶対こういうことの方がいいよ. だってそうとっても. うははは. もうなんか...
5681,7279346783124212993,7131561219089499138,【母まじか...笑】やる気は無いけど食欲ある時のやり方らしいw #母親あるある #餃子食べた...,ja,母の餃子洗い物すくのすぎ. 最後は本当びっくり. まずプリ袋に好きなみじん切り野菜入れて. ...
5682,7279268956513242369,7063454749559817217,リルおじしゃんのモデレーターをしてます天使と申します。 毎日21時以降にLIVE配信もしてま...,ja,えー、はじめまして、えー、わたくしが. いい気にくいだよ. いい気にくいです. やっさーな、...
5683,7279108760558226693,7266674368997000198,#ad NO ES LA TRADUCCIÓN REAL!! /soy esa ᐠ( ᐛ )...,es,持てまくるよ. 鏡よ鏡 正直言われたいんだ. いらないお前 いらない. 鏡よ鏡 正直すぎるだ...
