In [None]:
import os
import sys

import sys

import openai
import whisper

import numpy as np
import pandas as pd
import nltk
import tiktoken

import pickle
import time
from tqdm import tqdm

from pytube import YouTube

# Replace this with the location of your lib path
sys.path.append('/home/ubuntu/CS5224-Project/lib/')
from gpt3summarizer import GPT3Summarizer

In [None]:
# Load the "base" Whisper checkpoint used for transcription below.
model = whisper.load_model("base")

# Fetch the NLTK "punkt" data needed by the sentence tokenizer.
nltk.download("punkt")

## Data Crawling and Preprocessing

INFO: I have also added a sample document embedding for the long-form video (`doc_embeddings_long.pickle`), so you don't have to re-download and re-transcribe the videos from scratch.

### Download and Transcribe Video

In [None]:
# Download Video

v_url_short = "https://www.youtube.com/watch?v=DuaTGng9tRU"
v_url_long = "https://www.youtube.com/watch?v=sY8aFSY2zv4"

# Choose which video to process. The rest of the notebook reads and writes the
# "*_long" artifacts (video_text_long.csv, doc_embeddings_long.pickle), so the
# long-form video is the default; switch to v_url_short for a quicker run.
v_url = v_url_long
youtube_video = YouTube(v_url) # Select YT video

print(youtube_video.title)

# Keep only the audio-only streams and take the first one offered.
audio_stream_set = youtube_video.streams.filter(only_audio = True)
audio_stream = audio_stream_set.first()

try:
    # Save the audio under the fixed filename that the transcription cell reads.
    audio_stream.download(filename = 'test_video.mp4')
except Exception as e:
    # Best effort: report the actual failure instead of aborting the cell.
    print(f"An error occurred: {e}")

In [None]:
# Run Whisper over the downloaded audio file. The transcript is split into
# sentences and written to CSV in the next cell.
path = 'test_video.mp4'
t_model = whisper.transcribe(model, path, fp16=False)  # fp16 disabled for the decode

In [None]:
# Split the transcript into sentences and persist them as a CSV file
transcript = t_model['text']
sentences = nltk.sent_tokenize(transcript)

# One row per sentence, tagged with the video title. Note that `token` holds
# the character length of the sentence (len of the string), not a tokenizer
# token count. `reset_index` exposes the row number as the `index` column.
trans_df = (
    pd.DataFrame({'content': sentences})
    .assign(
        title=youtube_video.title,
        token=lambda frame: frame['content'].apply(len),
    )
    .reset_index()
    .loc[:, ['title', 'index', 'content', 'token']]
)

# Write the table to disk without the pandas row index
trans_df.to_csv('video_text_long.csv', index=False)

# Display the table
trans_df

### Read Transcribed CSV

In [None]:
# Reload the sentence table from disk. `header=0` consumes the file's own
# header row and `names` replaces it with the column names used below.
df = pd.read_csv(
    'video_text_long.csv',
    header=0,
    names=["title", "heading", "content", "token"],
)
print(f"{df.shape[0]} rows in the data.")
df.head()

## Fine Tuning GPT3

### Setup OpenAI API

In [None]:
# Read the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be typed into (and saved with) the notebook. When the variable
# is unset this falls back to the empty string, the previous placeholder value.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

openai.api_key = OPENAI_API_KEY

# Embedding model used as the default by get_embedding.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [None]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    Returns:
        - the embedding of the first result returned by the API,
        - None when `text` is None (no request is made),
        - the sentinel string "Rate Limit Reached" when the API reports a rate
          limit; callers detect it by checking for a `str` result.

    Any other failure (invalid API key, malformed request, a keyboard interrupt,
    ...) is left to propagate instead of being mislabelled as a rate limit.
    """
    if text is None:
        return None

    try:
        result = openai.Embedding.create(
          model=model,
          input=text
        )
    except openai.error.RateLimitError:
        # Keep the sentinel contract that existing callers rely on.
        return "Rate Limit Reached"

    return result["data"][0]["embedding"]

### Calculate Embedding

In [None]:
def compute_doc_embeddings(df: pd.DataFrame, label:str, max_retries: int = 3, retry_delay: float = 60):
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    Parameters:
        df: dataframe holding one document section per row.
        label: name of the column whose value is embedded.
        max_retries: how many times a row is retried after get_embedding returns
            its string sentinel.
        retry_delay: seconds to wait before each retry.

    Return a dictionary that maps each row index to the embedding returned for that
    row. A row that still yields the sentinel after `max_retries` retries is stored
    as None rather than as the sentinel string, so the string can never be handed
    to the similarity code as if it were a vector.
    """
    embeddings_dict = {}

    # `total` gives tqdm a length, which the iterrows() generator does not expose.
    for idx, r in tqdm(df.iterrows(), total=len(df)):
        embedding = get_embedding(r[label])

        # get_embedding signals a failed request with a string; wait and retry.
        attempts = 0
        while isinstance(embedding, str) and attempts < max_retries:
            time.sleep(retry_delay)
            embedding = get_embedding(r[label])
            attempts += 1

        embeddings_dict[idx] = None if isinstance(embedding, str) else embedding

    return embeddings_dict

In [None]:
# Embed the `content` column of every transcript row (one API request per row).
document_embeddings = compute_doc_embeddings(df=df, label="content")

In [None]:
# Persist the embeddings dictionary to disk so it can be reused later
with open('doc_embeddings_long.pickle', mode='wb') as pickle_file:
    pickle.dump(
        document_embeddings,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### Finding Similar Documents Using Embedding

In [None]:
def vector_similarity(x, y):
    """
    Score how similar two vectors are using their dot product.

    For unit-length vectors (which the original note states OpenAI embeddings are)
    the dot product equals the cosine similarity.

    Returns -1 when either vector is None.
    """
    if x is None or y is None:
        return -1

    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Find the query embedding for the supplied query, and compare it against all of the pre-calculated document embeddings
    to find the most relevant sections.

    Parameters:
        query: text to embed and compare against the documents.
        contexts: dictionary mapping a document index to its embedding.

    Return a list of (similarity, document index) tuples, sorted by similarity in descending order.

    Raises RuntimeError when get_embedding returns its string sentinel for the
    query instead of a vector, since a string cannot be compared to embeddings.
    """
    query_embedding = get_embedding(query)

    # get_embedding signals a failed request with a string; fail with a clear
    # message here rather than deep inside the dot product.
    if isinstance(query_embedding, str):
        raise RuntimeError(f"Could not embed the query: {query_embedding}")

    document_similarities = sorted([
        (vector_similarity(query_embedding, doc_embedding), doc_index) for doc_index, doc_embedding in contexts.items()
    ], reverse=True)

    return document_similarities

In [None]:
# Example: rank every embedded transcript row against a sample question.
order_document_sections_by_query_similarity(
    query="How did Jordan Peterson feel about Trudeau?",
    contexts=document_embeddings,
)

### Construct Prompt

In [None]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the tiktoken encoding `encoding_name`."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, label:str, 
                     MAX_SECTION_LEN = 1000, SEPARATOR = "\n* ", ENCODING = "gpt2", debug=False):
    """
    Select the document sections most relevant to `question` that fit a token budget.

    Parameters:
        question: the question the sections are ranked against.
        context_embeddings: dictionary mapping a row index of `df` to its embedding.
        df: dataframe holding the section text.
        label: name of the column in `df` that contains the section text.
        MAX_SECTION_LEN: maximum total number of tokens (sections plus separators).
        SEPARATOR: string placed in front of every chosen section.
        ENCODING: name of the tiktoken encoding used to count tokens.
        debug: when True, print how many sections were chosen and their indexes.

    Returns a list of strings, one per chosen section in descending relevance,
    each prefixed with SEPARATOR and with newlines replaced by spaces. Selection
    stops at the first section that would exceed MAX_SECTION_LEN.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    # Fetch the encoding once and reuse it for the separator and every section.
    encoding = tiktoken.get_encoding(ENCODING)
    separator_len = len(encoding.encode(SEPARATOR))

    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index, label]

        # Empty CSV cells are loaded by pandas as NaN (a float), not None, so
        # skip anything that is not text instead of crashing in the encoder.
        if not isinstance(document_section, str):
            continue

        chosen_sections_len += len(encoding.encode(document_section)) + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(SEPARATOR + document_section.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    if debug:
        print(f"Selected {len(chosen_sections)} document sections:")
        print("\n".join(chosen_sections_indexes))

    return chosen_sections

In [None]:
# Pick the transcript sections most relevant to the question, then assemble
# the instruction / context / question prompt sent to the chat model.
question_prompt = "What does the person think of religion?"
max_sentences = 10

context = construct_prompt(
    question_prompt,
    document_embeddings,
    df,
    label="content",
    MAX_SECTION_LEN=1000,
)

prompt = (
    "Instructions:\nAnswer the question as truthfully as possible using the conversation snippets of a podcast. "
    "If the answer is not contained within the text below, say 'I dont know.'. "
    f"Limit your answer to a paragraph of {max_sentences} sentences. "
    f"\n\nContext: {''.join(context)}"
    f"\n\nQuestion: {question_prompt}"
    "\n\nAnswer: "
)

print(prompt)

### Answer Prompt

In [None]:
# Send the assembled prompt to the chat model as the user message.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are an AI assistant that summarizes podcasts"},
        {"role": "user", "content": prompt},
    ]
)
choices = response.choices
if choices:
    # Keep the text of the first choice and the token usage reported by the API.
    content = choices[0].message.content
    total_tokens = response.usage.total_tokens
else:
    # Show the raw response for diagnosis, then actually raise: without `raise`
    # the exception object was created and silently discarded.
    print(response)
    raise Exception("No choices returned from GPT-3 API using model 'gpt-3.5-turbo'")

In [None]:
# Show the model's answer; `content` is only assigned by the previous cell
# when the API response contained at least one choice.
print(content)