Code adapted from https://github.com/dfreelon/pyktok

In [1]:
import browser_cookie3
from bs4 import BeautifulSoup
from datetime import datetime
import json
import numpy as np
import os
import pandas as pd
import random
import re
import requests
import time
import pyktok as pyk
import csv
from time import sleep
from threading import Thread


from selenium import webdriver
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeiumService #sic
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.core.utils import ChromeType
from webdriver_manager.firefox import GeckoDriverManager

# Browser-like HTTP headers passed to requests.get() by get_tiktok_json().
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}

# Cookie jar obtained from browser_cookie3.load(); get_tiktok_json() sends it
# with each request and may replace it when a specific browser is requested.
cookies = browser_cookie3.load()

In [2]:
def get_tiktok_json(video_url, browser_name=None):
    """Fetch a TikTok page and return the JSON embedded in its SIGI_STATE script.

    The page is requested up to 10 times; an attempt counts as failed when the
    ``<script id="SIGI_STATE">`` tag is missing or has no text content.

    Args:
        video_url: URL of the TikTok page to request.
        browser_name: Optional name of a ``browser_cookie3`` loader function
            (resolved with ``getattr``). When given, the module-level
            ``cookies`` are replaced by that browser's ``tiktok.com`` cookies
            before any request is made.

    Returns:
        The decoded JSON object, or ``None`` if every attempt failed.

    Raises:
        Whatever ``requests.get`` raises (e.g. on timeout) is propagated, as is
        a JSON decoding error for a script tag whose content is not valid JSON.
    """
    global cookies
    # The cookie source does not change between retries, so load it once
    # instead of re-reading the browser's cookie store on every attempt.
    if browser_name is not None:
        cookies = getattr(browser_cookie3, browser_name)(domain_name='tiktok.com')

    for _ in range(10):
        tt = requests.get(video_url,
                          headers=headers,
                          cookies=cookies,
                          timeout=60)
        soup = BeautifulSoup(tt.text, "html.parser")
        tt_script = soup.find('script', attrs={'id': "SIGI_STATE"})
        try:
            # AttributeError: the tag was not found (tt_script is None).
            # TypeError: the tag exists but .string is None, which
            # json.loads() rejects.
            return json.loads(tt_script.string)
        except (AttributeError, TypeError):
            print("The function encountered a downstream error and did not deliver any data, which happens periodically (not sure why). Please try again later.")

    # All attempts failed; callers must be prepared for a missing result.
    return None


In [3]:
def generate_data_row(video_obj, date, user_id):
    """Flatten one TikTok video record into a single-row DataFrame.

    Args:
        video_obj: Mapping describing the video. Only ``'id'`` is required;
            every other field is looked up on a best-effort basis.
        date: Value stored unchanged in the ``video_date`` column.
        user_id: Value stored unchanged in the ``vodeo_user_id`` column
            (the column name's spelling is kept as-is).

    Returns:
        A ``pandas.DataFrame`` with one row (index ``0``) and 15 columns.
        Missing numeric fields become ``np.nan``; missing text fields become
        the empty string.

    Raises:
        Any error raised by ``video_obj['id']`` is propagated.
    """
    absent = object()  # marks "lookup failed" so fallbacks can be chained

    def lookup(*path, default=absent):
        # Walk video_obj along `path`; any failure at all yields `default`.
        node = video_obj
        try:
            for key in path:
                node = node[key]
        except Exception:
            return default
        return node

    # Creation time is rendered as an ISO-8601 string via fromtimestamp().
    try:
        created_at = datetime.fromtimestamp(int(video_obj['createTime'])).isoformat()
    except Exception:
        created_at = ''

    # 'author' may be a nested mapping or a plain value; fall back accordingly.
    username = lookup('author', 'uniqueId')
    if username is absent:
        username = lookup('author', default='')

    display_name = lookup('author', 'nickname')
    if display_name is absent:
        display_name = lookup('nickname', default='')

    record = {
        'video_id': video_obj['id'],
        'video_timestamp': created_at,
        'video_duration': lookup('video', 'duration', default=np.nan),
        'video_locationcreated': lookup('locationCreated', default=''),
        'video_diggcount': lookup('stats', 'diggCount', default=np.nan),
        'video_sharecount': lookup('stats', 'shareCount', default=np.nan),
        'video_commentcount': lookup('stats', 'commentCount', default=np.nan),
        'video_playcount': lookup('stats', 'playCount', default=np.nan),
        'video_description': lookup('desc', default=''),
        'video_date': date,
        'vodeo_user_id': user_id,
        'music_id': lookup('music', 'id', default=''),
        'music_title': lookup('music', 'title', default=''),
        'author_username': username,
        'author_name': display_name,
    }
    return pd.DataFrame(record, index=[0])


In [4]:
def make_csv_file(url, metadata_fn, date, user_id):
    """Scrape one TikTok URL and add its metadata row to a CSV file.

    Args:
        url: TikTok page URL handed to ``get_tiktok_json``.
        metadata_fn: Path of the CSV file. If it exists, it is read, the new
            row is concatenated to it and the file is rewritten; otherwise it
            is created containing just the new row.
        date: Passed through to ``generate_data_row``.
        user_id: Passed through to ``generate_data_row``.

    Returns:
        None. Failures are reported with ``print`` rather than raised, so one
        bad link does not abort a long-running batch.
    """
    try:
        tt_json = get_tiktok_json(url)
        items = tt_json['ItemModule']
        # The first entry of ItemModule is taken as the video record.
        data_slot = items[list(items.keys())[0]]
        data_row = generate_data_row(data_slot, date, user_id)
    except Exception:
        print(f"{url} link failed")
        # Without a row there is nothing to save; stop here instead of
        # continuing with an undefined data_row.
        return

    try:
        users = tt_json['UserModule']['users']
        data_row.loc[0, "author_verified"] = users[list(users.keys())[0]]['verified']
    except Exception:
        # The verified flag is optional extra data; keep the row without it.
        pass

    try:
        if os.path.exists(metadata_fn):
            # keep_default_na=False keeps empty strings from being read as NaN.
            metadata = pd.read_csv(metadata_fn, keep_default_na=False)
            combined_data = pd.concat([metadata, data_row])
        else:
            combined_data = data_row
        combined_data.to_csv(metadata_fn, index=False)
        print("Saved metadata for video", url,"to",os.getcwd())
    except Exception as err:
        # Report the failure instead of silently dropping the row.
        print(f"Could not save metadata for {url} to {metadata_fn}: {err}")

In [7]:
# Input table of videos to scrape; the worker code further down reads the
# "Link", "Date" and "ID" columns of each row.
df = pd.read_csv("file.csv")

In [None]:
# Note: requests from my IP got blocked after roughly 60,000 to 80,000 requests.

In [None]:
# Thread objects are collected here once they have been started.
threads = []
number_of_threads = 10

# One independent work list per thread, so workers never share a list.
input_data_per_thread = [[] for _ in range(number_of_threads)]

# Deal the rows out round-robin. enumerate() supplies the position, which
# avoids a hand-maintained counter named `iter` that would shadow the builtin.
for position, (_, row) in enumerate(df.iterrows()):
    input_data_per_thread[position % number_of_threads].append(row)

def threaded_function(thread_id):
    """Process every row assigned to one worker thread.

    Rows are removed from the front of ``input_data_per_thread[thread_id]``
    one at a time and passed to ``make_csv_file``, which writes to an output
    file named after the thread id. The number of rows still waiting is
    printed after each one.

    Args:
        thread_id: Index of this worker's list in ``input_data_per_thread``.
    """
    pending = input_data_per_thread[thread_id]
    output_fn = "tiktok_browse_thread_output_" + str(thread_id) + ".csv"
    while pending:
        entry = pending.pop(0)
        link = entry["Link"]
        video_date = entry["Date"]
        owner_id = entry["ID"]
        make_csv_file(link, output_fn, video_date, owner_id)
        print (f"Thread {thread_id} has {len(pending)} left")

if __name__ == "__main__":
    # Start one worker per work list, keeping a handle to each thread.
    for worker_id in range(number_of_threads):
        worker = Thread(target=threaded_function, args=(worker_id,))
        worker.start()
        threads.append(worker)

    # Block until every worker has finished its list.
    for worker in threads:
        worker.join()

    