# Transcribing TikTok videos using OpenAI Whisper

**Author**: Catherine Foster \
**Date**: 4/22/24

In [None]:
import pyktok as pyk
import logging
import time
import openai
from openai import OpenAI
from openai import APIStatusError
import whisper
import torch
import os
import numpy
import re
import pandas as pd

In [None]:
import csv

#### Read in CSV file with distributed links and corresponding recommended links

In [None]:
# Load the table of distributed links and their recommended links.
recs_df = pd.read_csv('filtered_merged_recommended.csv')
# Preview the first rows (rendered as the notebook cell's output).
recs_df.head()

#### Extract all unique distributed and recommended URLs

In [None]:
# Columns holding the distributed link and its eight recommended links.
link_columns = ['distributed_link'] + [f'rec_{i}' for i in range(1, 9)]

# Collect every URL exactly once, in first-seen order (row by row, then
# column by column). A dict serves as an insertion-ordered set so each
# membership check is O(1) instead of rescanning a growing list.
unique_urls = {}
for row_urls in recs_df[link_columns].itertuples(index=False, name=None):
    for url in row_urls:
        # Empty CSV cells are read by pandas as NaN; skip them so that a
        # non-URL placeholder is never handed to the downloader.
        if pd.notna(url):
            unique_urls.setdefault(url, None)

urls = list(unique_urls)

In [None]:
# Number of URLs collected in `urls`.
len(urls)

#### Download videos as mp4 files using Pyktok

In [None]:
# Download video files from TikTok links.
# Failures are written to download_failures.log and do not stop the loop.
logger = logging.getLogger()

# Reuse the file handler if this cell has already been run; attaching a new
# one on every run would write each failure to the log multiple times.
failure_log_path = os.path.abspath('download_failures.log')
file_handler = next(
    (
        handler for handler in logger.handlers
        if isinstance(handler, logging.FileHandler)
        and getattr(handler, 'baseFilename', None) == failure_log_path
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler('download_failures.log')
    logger.addHandler(file_handler)

pyk.specify_browser('chrome')
for url in urls:
    try:
        pyk.save_tiktok(url)
    except Exception as ex:
        # Best-effort download: record which link failed and why, then
        # move on to the next one.
        logger.warning("error while processing item %s: %s", url, ex)
    else:
        # Pause between successful downloads.
        time.sleep(3)

#### Transcribe videos using OpenAI Whisper

In [None]:
# Insert your OpenAI API key below
client = OpenAI(api_key='')

#transcript_list = []

# Pulls the numeric video id out of file names shaped like "video_<digits>.mp4".
video_id_pattern = re.compile(r'video_(\d+)\.mp4')

# Write one row per file in the 'videos' directory: [video id, transcription].
# UTF-8 is set explicitly so transcripts containing non-ASCII characters do
# not depend on (or fail under) the platform's default encoding.
with open('transcriptions.csv', 'w', newline='', encoding='utf-8') as out:
    csvwriter = csv.writer(out)
    for filename in os.listdir('videos'):
        video_path = f"videos/{filename}"
        # The context manager closes each video file once its request is
        # done, instead of leaving one open handle per video.
        with open(video_path, "rb") as mp4file:
            try:
                transcription = client.audio.transcriptions.create(
                    model="whisper-1",
                    file=mp4file,
                    response_format="text"
                )
            except APIStatusError:  # if video is too long, skip
                # NOTE(review): this catches every API status error, not
                # only the too-long case; all of them yield an empty
                # transcription.
                transcription = ''
        # Fall back to the raw file name when it does not match the pattern.
        id_match = video_id_pattern.search(video_path)
        vid_id = id_match.group(1) if id_match else filename
        csvwriter.writerow([vid_id, transcription])
        #transcript_list.append([vid_id, transcription])
    
#transcriptions = pd.DataFrame(transcript_list, columns = ['video_id', 'transcription'])
#transcriptions.to_csv('transcriptions.csv')