### **Using OpenAI Whisper**
#### Transcribing TikTok videos

##### *Author: @Jyontika Kapoor*

In [2]:
from openai import OpenAI
import requests
from docx import Document
import os
import whisper
import pandas as pd
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import json 
import librosa
# Directory the kernel was started in; the relative paths used later
# ("videos/", "txt-transcripts/", "videos-downloaded.csv") resolve against it.
# NOTE(review): `cwd` itself is not referenced in the cells visible here.
cwd = os.getcwd()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


### Pre-Processing CSVs

In [3]:
### Merging Audrey and Tayae's videos

# Root folder of the project's analysis outputs. It defaults to the original
# author's checkout so existing runs behave exactly as before; set the
# CS315_ANALYSIS_DIR environment variable to run the notebook elsewhere.
analysis_dir = os.environ.get(
    "CS315_ANALYSIS_DIR",
    "/Users/jyontika/Documents/GitHub/CS315-Project-2/analysis",
)

# The three source tables; each one is expected to have a 'video_id' column.
poster = pd.read_csv(os.path.join(analysis_dir, "news_accs", "news_by_poster.csv"))
hashtag = pd.read_csv(os.path.join(analysis_dir, "hashtag_initial", "news_by_hashtag.csv"))
nyt = pd.read_csv(os.path.join(analysis_dir, "top_cosine_similarities.csv"))

# Stack the 'video_id' columns of all three tables. The result is a pandas
# Series (not a DataFrame) despite the `merged_df` name, which is kept because
# later cells refer to it.
merged_df = pd.concat(
    [poster['video_id'], hashtag['video_id'], nyt['video_id']],
    ignore_index=True,
)

# remove duplicates: a video can appear in more than one source table
merged_df = merged_df.drop_duplicates()

# shuffle the ids; the fixed random_state makes the order reproducible,
# and reset_index gives a clean 0..n-1 index afterwards
merged_df = merged_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(merged_df)



0      7288055765938670894
1      7302611720214957358
2      7301777464840277294
3      7289551496524467502
4      7289467129089461550
              ...         
470    7305077915966852358
471    7305541674766241054
472    7290970440023969057
473    7295843462497226015
474    7303352147972721962
Name: video_id, Length: 475, dtype: int64


In [4]:
# Display the per-poster table loaded from news_by_poster.csv to inspect its columns.
poster

Unnamed: 0,video_id,author_username,video_description,hashtags,suggested_words,file_name
0,6988185257426128133,gma,Morgan Wallen is speaking out to address using...,"['news', 'morganwallen']","morgan wallen racist video, morgan wallen, mor...",Sec2Gr3_77777.csv
1,7283179469001461035,nbcnews,"After wrapping ""#Scandal,"" #KerryWashington's ...","['scandal,""', ""kerrywashington's"", 'dna']","shera kerry washington, kerry washington somal...",Sec2Gr3_77217.csv
2,6948100100992322821,cbsnews,A baby kangaroo is rescued from its mother’s p...,"['news', 'australia']","kangaroo pouch, Kangaroo, baby kangaroo, Anima...",Sec2Gr3_77217.csv
3,7307673039922105643,wired,"Elmo is here to set the record straight, once ...","['elmo.', 'sesamestreet', 'newyorker', 'newyor...","elmo, elmo funniest moments, elmo balsamicvine...",Sec2Gr3_77217.csv
4,7306609553305652510,nbcnews,Former President #JimmyCarter and former first...,"['jimmycarter', 'rosalynncarter']",,Sec2Gr3_77217.csv
...,...,...,...,...,...,...
81,7288868567003614510,todayshow,#taylorswift has arrived at the #tserastourmov...,"['taylorswift', 'tserastourmovie']",,Sec2Gr3_74721.csv
82,7267091322559843616,bbcnews,One Hawaii resident says some tourists are car...,"['lahaina', 'maui', 'hawaii', 'hawaiifire', 'w...","hawaii tourists, maui tourists, hawaii, Lahain...",Sec2Gr3_74721.csv
83,7287837532161625377,middleeasteye,Palestinian ambassador to the UK blasts the BB...,"['bbc', 'israel', 'palestinian', 'ambassador',...",,Sec2Gr3_74721.csv
84,7285777286316510470,ctvnews,If you’re stuck in traffic on Hwy. 400 Tuesday...,"['breakingnews', 'truck', 'highway', 'ontario'...",,Sec2Gr3_74721.csv


In [16]:
## Turn every video id into a shareable TikTok URL

# Each link has the form <prefix><video id>/ ; the ids come from the shuffled
# `merged_df` Series, so the URLs keep that order.
share_prefix = "https://www.tiktokv.com/share/video/"
urls = (share_prefix + merged_df.astype(str) + "/").tolist()

# single-column table of links
urls_df = pd.DataFrame({'Link': urls})

print(urls_df)

                                                  Link
0    https://www.tiktokv.com/share/video/7288055765...
1    https://www.tiktokv.com/share/video/7302611720...
2    https://www.tiktokv.com/share/video/7301777464...
3    https://www.tiktokv.com/share/video/7289551496...
4    https://www.tiktokv.com/share/video/7289467129...
..                                                 ...
470  https://www.tiktokv.com/share/video/7305077915...
471  https://www.tiktokv.com/share/video/7305541674...
472  https://www.tiktokv.com/share/video/7290970440...
473  https://www.tiktokv.com/share/video/7295843462...
474  https://www.tiktokv.com/share/video/7303352147...

[475 rows x 1 columns]


In [17]:
## Draw a reproducible random sample of 300 links
sampled_urls_df = urls_df.sample(n=300, random_state=42)

# One {"Link": ...} dict per sampled row, i.e. a JSON array of objects
json_data = sampled_urls_df.to_dict(orient='records')

# Serialise first, then write the text to sampled_tiktok_urls.json
json_text = json.dumps(json_data, indent=4)
with open("sampled_tiktok_urls.json", "w") as json_file:
    json_file.write(json_text)




In [21]:
# Load videos-downloaded.csv from the working directory
# (presumably metadata for the sampled videos that were successfully downloaded — TODO confirm).
downloaded_vids = pd.read_csv("videos-downloaded.csv")
# Show (rows, columns) as a quick sanity check on the load.
downloaded_vids.shape

(296, 20)

In [29]:
# The 'video_id' column of the downloaded-videos table; the transcription loop iterates over it.
video_ids = downloaded_vids['video_id']
# Display the Series.
video_ids

0      7290493831408045330
1      7295925652992363822
2      7175869072263990533
3      7304461225448967467
4      7298442585964662046
              ...         
291    7300191041280412933
292    7293283723934453038
293    7297044924862647595
294    7287549870171245854
295    7291460094032645418
Name: video_id, Length: 296, dtype: int64

### Transcription

In [18]:
#client = OpenAI(api_key=XYZ)

!pip install git+https://github.com/openai/whisper.git

In [33]:

# Hugging Face checkpoint used for both the processor and the model
whisper_checkpoint = "openai/whisper-large"

# The processor prepares audio features and decodes token ids back to text;
# the model generates the token ids.
processor = WhisperProcessor.from_pretrained(whisper_checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(whisper_checkpoint)

# Clear the forced decoder ids so generation is not pinned to preset
# context tokens
model.config.forced_decoder_ids = None



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [37]:
output_path = "txt-transcripts/"

# define sampling rate (the audio is resampled to this rate on load)
sampling_rate = 16000

# The Whisper feature extractor pads/truncates each input to a 30-second
# window, so a longer clip passed in one call loses everything after the
# first 30 seconds. Longer clips are therefore transcribed window by window.
chunk_seconds = 30
chunk_samples = chunk_seconds * sampling_rate

# make sure the transcript folder exists before the first write
os.makedirs(output_path, exist_ok=True)


def transcribe_waveform(audio):
    """Transcribe a 1-D waveform sampled at `sampling_rate`.

    The waveform is split into consecutive windows of `chunk_samples` samples;
    each window is run through the global `processor` and `model`, and the
    decoded texts are concatenated in order. A waveform that fits in a single
    window goes through exactly one processor/generate/decode pass, so short
    clips produce the same text as a single un-chunked call.

    Parameters
    ----------
    audio : sequence of samples (as returned by `librosa.load`)

    Returns
    -------
    str
        The concatenated transcription; an empty string for empty audio.
    """
    pieces = []
    for start in range(0, len(audio), chunk_samples):
        window = audio[start:start + chunk_samples]

        # turn the raw samples into model input features
        input_features = processor(
            window, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features

        # generate token ids for this window and decode them to text
        predicted_ids = model.generate(input_features)
        pieces.append(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

    # no separator is inserted so that single-window output is unchanged
    return "".join(pieces)


for video_id in video_ids:
    audio_path = f"videos/share_video_{video_id}_.mp4"

    # skip ids whose video file is missing from the videos/ folder
    if not os.path.exists(audio_path):
        print(f"Audio for video ID {video_id} not found.")
        continue

    # load the audio track of the video using librosa, resampled to sampling_rate
    audio, sr = librosa.load(audio_path, sr=sampling_rate)

    transcription = transcribe_waveform(audio)

    # write transcription to a text file named after the video ID
    with open(os.path.join(output_path, f"{video_id}.txt"), "w", encoding="utf-8") as txt:
        txt.write(transcription)


  audio, sr = librosa.load(audio_path, sr=sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
  audio, sr = librosa.load(audio_path, sr=sampling_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Audio for video ID 7298062755058240801 not found.
Audio for video ID 7289462001636576545 not found.
Audio for video ID 7305077915966852358 not found.
Audio for video ID 7287429952792841504 not found.
Audio for video ID 7289223591458131231 not found.
Audio for video ID 7300367621764091182 not found.
Audio for video ID 7292143951392492846 not found.
Audio for video ID 7304509572847521070 not found.
Audio for video ID 7289551496524467502 not found.
Audio for video ID 7267165450637790482 not found.
Audio for video ID 7290970440023969057 not found.
Audio for video ID 7294334382150651168 not found.
Audio for video ID 7291766284549786926 not found.
Audio for video ID 7288487297316572421 not found.
Audio for video ID 7293537949860367658 not found.
Audio for video ID 7290224386152484101 not found.
Audio for video ID 7287680374539423018 not found.
Audio for video ID 7288531043681062149 not found.
Audio for video ID 7290565797792222510 not found.
Audio for video ID 7290322507800890670 not found.
