# TikTok Data Consolidation

Taking the multiple search jsons and consolidating into one, collecting relevant information along the way.

In [1]:
import json
import pandas as pd
from pprint import PrettyPrinter
import glob
from tqdm.notebook import tqdm
import os
import sys
from pathlib import Path

pp = PrettyPrinter(indent=4)

if sys.platform.startswith('linux'):
    SYSTEM='linux'
elif sys.platform.startswith('darwin'):
    SYSTEM='mac'
else:
    print("You are on a different system: ", sys.platform)

In [2]:
data_path_arc = Path('/data/inet-large-scale-twitter-diffusion/ball4321/data_c/')
data_path_local= '../local_nogit/Data/'

DATA_PATH = Path(f'{data_path_arc if SYSTEM == "linux" else data_path_local}')
print(f'DATA PATH SET TO {DATA_PATH}')

with open(DATA_PATH / f'TikTok/01_raw/metoo180_2023-12-13.json', 'r') as f:
    x = json.load(f)

DATA PATH SET TO ../local_nogit/Data


In [3]:
files = glob.glob(str(DATA_PATH / 'TikTok/01_raw/*.json'))
pp.pprint(files)

[   '../local_nogit/Data/TikTok/01_raw/metoofrench90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoojapanese90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metooitalian180_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoohebrew180_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoochinese90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metooarabic90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoofrench180_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoo90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoojapanese180_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metookorean180_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metooitalian90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoorussian90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metoovarious90_2023-12-13.json',
    '../local_nogit/Data/TikTok/01_raw/metookorean90_2023-12-13.json'

In [5]:
consolidate_dict = {}
for file in tqdm(files):
    with open(file, 'r') as f:
        x = json.load(f)

    for item in x['data']:
        video_id = item['aweme_info']['aweme_id']
        author_user_id = item['aweme_info']["author_user_id"]
        author_name = item['aweme_info']['author']
        desc = item['aweme_info']["desc"]
        desc_lang = item['aweme_info']["desc_language"]

        # 31-01-24: add music so that we can compare later and use as filter?
        music_album = item['aweme_info'].get('music', {}).get('album', None)
        music_idstr = item['aweme_info'].get('music', {}).get('id_str', None)
        music_title = item['aweme_info'].get('music', {}).get('title', None)
        music_matchedsong_title = item['aweme_info'].get('music', {}).get('matched_song', {}).get('title')
        music_matchedsong_idstr = item['aweme_info'].get('music', {}).get('matched_song', {}).get('id')


        consolidate_dict[video_id]={
            'video_id': video_id,
            'author_id': author_user_id,
            'author_name': author_name,
            'description': desc,
            'description_lang': desc_lang,
            'music_album': music_album,
            'music_idstr': music_idstr,
            'music_title': music_title,
            'music_matchedsong_title': music_matchedsong_title,
            'music_matchedsong_idstr': music_matchedsong_idstr,
        }

print(f'Length of consolidated dict: {len(consolidate_dict)}')

  0%|          | 0/24 [00:00<?, ?it/s]

Length of consolidated dict: 5776


In [6]:
metadata_df = pd.DataFrame.from_dict(consolidate_dict, orient='index').reset_index(drop=True)

In [11]:
metadata_df[metadata_df['author_id']==7185717406458921989]['author_name'].iloc[0]

{'uid': '7185717406458921989',
 'short_id': '0',
 'nickname': 'Nicolas.Malone Spotify',
 'gender': 0,
 'signature': '👉Stream la music Nicolas.Malone sur Spotify, Apple,Deezer 👇',
 'avatar_larger': {'uri': 'tos-maliva-avt-0068/a630bb84828043d9f4a6f209f585d6bb',
  'url_list': ['https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/a630bb84828043d9f4a6f209f585d6bb~c5_1080x1080.webp?x-expires=1702566000&x-signature=2vDgZnHccyLZ770TB6r6nyzscms%3D',
   'https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/a630bb84828043d9f4a6f209f585d6bb~c5_1080x1080.jpeg?x-expires=1702566000&x-signature=V83mPy5QVmz8CJQXHc0bxwRe8Uk%3D'],
  'width': 720,
  'height': 720,
  'url_prefix': None},
 'avatar_thumb': {'uri': 'tos-maliva-avt-0068/a630bb84828043d9f4a6f209f585d6bb',
  'url_list': ['https://p16-sign-va.tiktokcdn.com/tos-maliva-avt-0068/a630bb84828043d9f4a6f209f585d6bb~c5_100x100.webp?biz_tag=musically_video.video_user_cover&x-expires=1702566000&x-signature=Fnu382PXsspIb5rknOeQoIRcSqg%3D',
   'https://p

In [17]:
# Step 1: Read .txt files and store in a dictionary
folder_path = DATA_PATH / 'TikTok/01_raw/videos_transcribed/'  # Update with the path to your .txt files
text_data = {}
for filename in tqdm(os.listdir(folder_path)):
    if filename.endswith('.txt'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text_data[filename[:-4]] = file.read()  # Remove '.txt' from filename
print('data read in')
assert set(text_data.keys()).issubset(set(consolidate_dict.keys()))

  0%|          | 0/5685 [00:00<?, ?it/s]

data read in


In [19]:
text_df = pd.DataFrame.from_dict(text_data, orient='index', columns=['text'])
text_df.reset_index(inplace=True)
text_df.rename(columns={'index': 'video_id'}, inplace=True)

# Merge with metadata
final_dataset = pd.merge(metadata_df, text_df, on='video_id', how='inner')

# Step 4: Your dataset is now ready for BERTopic or other NLP processing
#print(final_dataset.head())

final_dataset.to_parquet(DATA_PATH / 'TikTok/01_raw/videos_transcribed.parquet')
print('Save complete. Ending.')

Save complete. Ending.


In [21]:
final_dataset

Unnamed: 0,video_id,author_id,description,description_lang,music_album,music_idstr,music_title,music_matchedsong_title,music_matchedsong_idstr,text
0,7312066507876420896,7024542821667718149,,fr,,7312066555717028641,original sound - yes_captain,,,"Depardieu Open & Club, c'est parti, a priori ..."
1,7312030909451062561,6882694756464772101,As tu vu le reportage ? Que penses tu du compo...,fr,Next Up - S4-E2,7062118091128113154,"Next Up - S4-E2, Pt. 1","Next Up - S4-E2, Pt. 1",7062118088925284354,"Isn't it lovely, all alone, hiding me to gras..."
2,7312001731611872544,7230161945558762522,Mon féminisme passera TOUJOURS avant Kev Adams...,fr,racine carrée - speed up,7265354827323279361,Ta fête - speed up,Ta fête - speed up,7265369213435398146,"C'est fini l'heure, fini l'heure de danser. ..."
3,7311967803840433441,7064618369538458630,🚨Droit de réponse🚨 💩🐖🐽🐷🐗 ...,fr,,7079479690389392133,Therapie Taxi Salop e,Salop(e),6761358166102919169,"Alors va t'faire enculer, va bien t'faire enc..."
4,7311831912203570464,7175263675905131526,#gerarddepardieu #balancetonporc #fypシ #pourtoi,un,,7311831925805681440,original sound - lu._seguy04,,,"Quoi ? Qu'on y croit ou pas, il y aura bien u..."
...,...,...,...,...,...,...,...,...,...,...
5680,7246217520884059398,7205563713583170586,Trả lời @phungthichi208 Bán mình chữa bệnh cho...,vi,,7246217589964196614,original sound - tramnam86,,,Rồi trong đầu của cô lại có một câu hỏi khác....
5681,7245952342082866439,7219122142062085126,#吃 #中国美食 #深夜放毒 #美食视频 #妈呀太香了 #Foodie #fyp #tikt...,zh-Hans,,7245952374106327810,original sound - xiaolan501,,,小兔崽子肝挑战五两加辣. 第一次给你打不下去. 没事 还可以再提一遍. 吹粉吹粉. 哇 五两...
5682,7245642107019480328,6995375894593045505,#日本のゆるキャラ #BookTok #youtube #ゆるキャラ #懐かしい子供番組 #...,un,,7245642256831531777,original sound - hiddleston_18,,,ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ ឡ...
5683,7245490436595354906,7227353006106526747,Phần 2 #movie180kva #xuhuong,vi,,7245490589041527578,original sound - afrikan.movie,飞鸟和蝉,6844787196688336897,"Không liên quan gì đến anh, tôi có thể nuôi đ..."
