In [11]:
# %pip install yt-dlp
# %pip install moviepy
# %pip install transformers
# %pip install optimum 
# %pip install accelerate
# %pip install whisperplus
%pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import glob

def get_user_video_urls(username):
    video_urls = []
    for file in glob.glob(f"data/{username}/videos/*.json"):
        with open(file) as f:
            data = json.load(f)
            url = f'https://www.tiktok.com/@{data["author"]["uniqueId"]}/video/{data["id"]}'
            video_urls.append(url)
    return video_urls

In [2]:
import os
from yt_dlp import YoutubeDL
from moviepy.editor import VideoFileClip


def download_video(url, output_folder='data/spdbt/videos'):
    """
    Downloads a video from TikTok and saves it in the specified folder.

    :param url: TikTok video URL.
    :param output_folder: Folder where the video will be saved.
    :return: Path to the downloaded video.
    """

    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    video_id = url.split("/")[-1]

    ydl_opts = {
        'format': 'best',  # Download the best available quality
        'outtmpl': os.path.join(output_folder, f'{video_id}.%(ext)s'),  # Save video with its ID
        # 'ffmpeg_location': '/opt/homebrew/bin/ffmpeg',
    }

    try:
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        print(f"Video downloaded and saved in the '{output_folder}' folder.")
        return os.path.join(output_folder, f'{video_id}.mp4')
    except Exception as e:
        print(f"An error occurred: {e}")



def extract_audio(video_file_path, bitrate="64k"):
    """
    Extracts audio from a video file and saves it as an MP3 file in the same directory.
    
    :param video_file_path: Path to the video file.
    :param bitrate: Bitrate for the audio file, lower for smaller size.
    """
    try:
        # Load the video file
        video = VideoFileClip(video_file_path)

        # Build the output file path
        output_file_path = os.path.splitext(video_file_path)[0] + ".mp3"

        # Extract and write the audio
        video.audio.write_audiofile(output_file_path, codec='libmp3lame', bitrate=bitrate)

        print(f"Audio extracted successfully and saved to {output_file_path}")
        return output_file_path
    except Exception as e:
        print(f"An error occurred: {e}")



# Example usage
# tiktok_url = 'https://www.tiktok.com/@spdbt/video/7047893921757809926'
# path = download_video(tiktok_url)
# audio_path = extract_audio(path)



In [None]:
import glob
import os
import threading
from queue import Queue

def worker(video_queue, username):
    while not video_queue.empty():
        url = video_queue.get()
        download_and_process_video(url, username)
        video_queue.task_done()

def download_and_process_video(url, username):
    path = download_video(url, output_folder=f"data/{username}/videos")
    audio_path = extract_audio(path)
    # transcript = extract_transcript(audio_path)
    # store_transcript(transcript, path.replace(".mp4", ".txt"))

chunk_size = 10  # Number of videos to process in each chunk

for username in os.listdir("data"):
    if username == ".DS_Store":
        continue

    video_urls = get_user_video_urls(username)
    print(f"Found {len(video_urls)} videos for user {username}")

    video_ids = [url.split("/")[-1] for url in video_urls]
    downloaded_videos = glob.glob(f"data/{username}/videos/*.mp3")
    downloaded_video_ids = [os.path.basename(path).replace(".mp3", "") for path in downloaded_videos]
    video_urls = [url for url in video_urls if url.split("/")[-1] not in downloaded_video_ids]
    print(f"Downloading {len(video_urls)} videos for user {username}")

    for i in range(0, len(video_urls), chunk_size):
        video_queue = Queue()
        threads = []

        for url in video_urls[i:i + chunk_size]:
            video_queue.put(url)

        for _ in range(min(chunk_size, video_queue.qsize())):
            thread = threading.Thread(target=worker, args=(video_queue, username))
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()
        video_queue.join()


In [3]:
# count number of mp3 files in folder

import glob
import os


path = 'data/*/*/'

json_files = glob.glob(os.path.join(path, '*.json'))
mp4_files = glob.glob(os.path.join(path, '*.mp4'))
mp3_files = glob.glob(os.path.join(path, '*.mp3'))
txt_files = glob.glob(os.path.join(path, '*.txt'))
print(len(json_files))
print(len(mp4_files))
print(len(mp3_files))
print(len(txt_files))

1634
1618
1624
1623


In [17]:
# plot the diggcount of the videos for each username over time
# show diggcount in 1000s
# plot all usernames in one plot

import json
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

usernames = os.listdir("data")

usernames.remove(".DS_Store")

def get_diggCounts(username):
    diggcounts = []
    dates = []
    for file in glob.glob(f"data/{username}/videos/*.json"):
        with open(file) as f:
            data = json.load(f)
            diggcount = data["stats"]["diggCount"]
            diggcounts.append(diggcount)
            date = data["createTime"]
            dates.append(date)
    return diggcounts, dates

map_users = {}

for username in usernames:
    diggcounts, dates = get_diggCounts(username)
    map_users[username] = {}
    map_users[username]["diggcounts"] = diggcounts
    map_users[username]["dates"] = dates

# safe to json
import json

with open('map_users.json', 'w') as fp:
    json.dump(map_users, fp)




In [4]:
# store all infos in one json
import json
import glob
import os


usernames = os.listdir("data")

usernames.remove(".DS_Store")

video_list = []

for username in usernames:
    for file in glob.glob(f"data/{username}/videos/*.json"):
        with open(file) as f:
            data = json.load(f)
            new_data = {}
            new_data["id"] = data["id"]
            new_data["createTime"] = data["createTime"]
            new_data["diggCount"] = data["stats"]["diggCount"]
            new_data["shareCount"] = data["stats"]["shareCount"]
            new_data["playCount"] = data["stats"]["playCount"]
            new_data["commentCount"] = data["stats"]["commentCount"]
            new_data["author"] = username
            new_data["duration"] = data["video"]["duration"]

            video_list.append(new_data)

# safe to json
import json

with open('list_videos.json', 'w') as fp:
    json.dump(video_list, fp)
    


In [6]:
# analyze the correlation between the diggcount and the other features
# plot the correlation matrix
# plot the correlation matrix for each username

import json
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime

# load the data
with open('list_videos.json') as f:
    data = json.load(f)


# Calculate the correlation matrix
corr = df.corr()

# Plot the correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap=plt.cm.Reds)
plt.show()
plt.show()

KeyError: 'diggcount'