In [1]:
import os

import openai
import whisper

import numpy as np
import pandas as pd
import nltk
import tiktoken

import pickle

from pytube import YouTube

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Load the Whisper 'base' checkpoint once; `model` is reused by the transcription cell.
model = whisper.load_model('base')
# Kept as the last expression, so this call's return value is what the cell displays.
nltk.download('punkt')  # download the NLTK tokenizer

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Download the audio track of a YouTube video so it can be transcribed

v_url_short = "https://www.youtube.com/watch?v=DuaTGng9tRU"
v_url_long = "https://www.youtube.com/watch?v=sY8aFSY2zv4"

# Choose which video to process. `v_url` must be assigned before use; the long
# video is selected because the following cells write/read 'video_text_long.csv'.
# Switch to v_url_short for a quicker run.
v_url = v_url_long
youtube_video = YouTube(v_url) # Select YT video

print(youtube_video.title)

# Keep only audio streams and take the first one from the filtered set.
audio_stream_set = youtube_video.streams.filter(only_audio = True)
audio_stream = audio_stream_set.first()

try:
    audio_stream.download(filename = 'test_video.mp4') # Save the audio locally
except Exception as e:
    # Best-effort download: report the actual exception rather than the letter "e".
    print(f"An error occurred: {e}")

In [None]:
# Transcribe the downloaded audio with Whisper (the CSV is written in the next cell)

path = 'test_video.mp4'
# Pass `path` instead of repeating the filename so the two cannot drift apart.
# The result is a dict; its 'text' entry is read by the next cell.
t_model = whisper.transcribe(model= model, audio= path, fp16 = False) # Get transcript

In [None]:
# Split the transcript into sentences and persist them as a CSV file
transcript = t_model['text']

# One row per sentence, in a single chain:
#   - 'title' repeats the video title on every row
#   - 'token' holds len(sentence), i.e. a character count rather than a tokenizer count
#   - reset_index() exposes the row number as an 'index' column
trans_df = (
    pd.DataFrame({'content': nltk.sent_tokenize(transcript)})
    .assign(
        title=youtube_video.title,
        token=lambda frame: frame['content'].apply(len),
    )
    .reset_index()
    .loc[:, ['title', 'index', 'content', 'token']]
)

# Write the table without the pandas row index
trans_df.to_csv('video_text_long.csv', index=False)

# Display the resulting table
trans_df

In [3]:
# Reload the saved sentences. `names` relabels the four columns of the file, so
# the column written as 'index' is exposed here as 'heading'.
df = pd.read_csv(
    'video_text_long.csv',
    header=0,
    names=["title", "heading", "content", "token"],
)
print(f"{len(df)} rows in the data.")
df.head(10)

2789 rows in the data.


Unnamed: 0,title,heading,content,token
0,"Jordan Peterson: Life, Death, Power, Fame, and...",0,"battle not with monsters, lest ye become a mo...",52
1,"Jordan Peterson: Life, Death, Power, Fame, and...",1,"And if you gaze into the abyss, the abyss gaze...",62
2,"Jordan Peterson: Life, Death, Power, Fame, and...",2,Right.,6
3,"Jordan Peterson: Life, Death, Power, Fame, and...",3,"But I would say, bring it on.",29
4,"Jordan Peterson: Life, Death, Power, Fame, and...",4,"If you gaze into the abyss long enough, you se...",76
5,"Jordan Peterson: Life, Death, Power, Fame, and...",5,Are you sure about that?,24
6,"Jordan Peterson: Life, Death, Power, Fame, and...",6,I'm betting my life on it.,26
7,"Jordan Peterson: Life, Death, Power, Fame, and...",7,Following is a conversation with Jordan Peters...,170
8,"Jordan Peterson: Life, Death, Power, Fame, and...",8,This is the Lex Friedman podcast to support it.,47
9,"Jordan Peterson: Life, Death, Power, Fame, and...",9,Please check out our sponsors in the description.,49


In [None]:
# Model identifiers and the example query used for the transcript/query embeddings
COMPLETIONS_MODEL = "text-davinci-003"  # not referenced by any code visible in this section
EMBEDDING_MODEL = "text-embedding-ada-002"  # default `model` argument of get_embedding

# Example question; a later cell embeds it with get_embedding(prompt, EMBEDDING_MODEL)
prompt = "Why should you start being selfish?"

def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    `model` names the embedding model and defaults to EMBEDDING_MODEL.
    Returns the "embedding" entry of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the `content` value of every row of `df` via get_embedding (one call per row).

    Returns a dictionary keyed by each row's index label, as yielded by
    df.iterrows(), whose values are the embedding vectors for that row.

    NOTE(review): the return annotation promises (str, str) keys, which only holds
    when df is indexed by such pairs -- confirm against the caller.
    """
    embeddings_by_row = {}
    for row_label, row in df.iterrows():
        embeddings_by_row[row_label] = get_embedding(row.content)
    return embeddings_by_row

def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Load previously saved document embeddings from the CSV file at `fname`.

    The file must have a header row with the columns "title", "heading" and then
    "0", "1", ... one column per embedding dimension. Every other column name is
    parsed with int(), so any extra non-numeric column raises ValueError.

    Returns a dictionary mapping (title, heading) to the list of values read from
    columns "0" through the highest-numbered column, in order.
    """
    table = pd.read_csv(fname, header=0)

    # The highest numeric column name determines how many dimensions to read.
    key_columns = ("title", "heading")
    highest_dim = max(int(col) for col in table.columns if col not in key_columns)
    dim_labels = [str(i) for i in range(highest_dim + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        embeddings[(row.title, row.heading)] = [row[label] for label in dim_labels]
    return embeddings

In [None]:
# Read the OpenAI key from the environment instead of embedding it in the notebook.
# A key that has been saved in a shared notebook must be treated as leaked and revoked.
# Raises KeyError if OPENAI_API_KEY is not set, which is clearer than a later auth failure.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Smoke test: embed the example query with the configured embedding model.
test_emb = get_embedding(prompt, EMBEDDING_MODEL)

*** RESULT ***

{
  "data": [
    {
      "embedding": [
        -0.006929283495992422,
        -0.005336422007530928,
        ...
        -4.547132266452536e-05,
        -0.024047505110502243
      ],
      "index": 0,
      "object": "embedding"
    }
  ],
  "model": "text-embedding-ada-002",
  "object": "list",
  "usage": {
    "prompt_tokens": 5,
    "total_tokens": 5
  }
}