In [62]:
import os
import sys

import sys

import openai
import whisper

import numpy as np
import pandas as pd
import nltk
import tiktoken

import pickle
import time
from tqdm import tqdm

from pytube import YouTube

# Replace this with the location of your lib path
sys.path.append('/home/ubuntu/CS5224-Project/lib/')
from gpt3summarizer import GPT3Summarizer

In [2]:
# Load the Whisper 'base' checkpoint once; the transcription cell reuses `model`.
model = whisper.load_model('base')
# Sentence splitting later in the notebook goes through nltk.sent_tokenize,
# which presumably relies on this 'punkt' data being present locally.
nltk.download('punkt')  # download the NLTK tokenizer

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Data Crawling and Preprocessing

INFO: I have also added the sample document embeddings for the long-form video (doc_embeddings_long.pickle), so you don't have to re-download and re-transcribe the videos from scratch.

### Download and Transcribe Video

In [None]:
# Download Video

# Two candidate sources, named for their length.
v_url_short = "https://www.youtube.com/watch?v=DuaTGng9tRU"
v_url_long = "https://www.youtube.com/watch?v=sY8aFSY2zv4"

# Choose which video to process. The later cells write 'video_text_long.csv'
# and 'doc_embeddings_long.pickle', so the long video is the default here.
v_url = v_url_long
youtube_video = YouTube(v_url) # Select YT video

print(youtube_video.title)

# Keep only audio streams and take the first one the filter returns.
audio_stream_set = youtube_video.streams.filter(only_audio = True)
audio_stream = audio_stream_set.first() # Select quality audio stream

try:
    audio_stream.download(filename = 'test_video.mp4') # Download video
except Exception as e:
    # Best-effort download: report the actual failure instead of aborting the cell.
    print(f"An error occurred: {e}")

In [None]:
# Run Whisper on the downloaded audio file; the result's 'text' entry is read in the next cell
path = 'test_video.mp4'
t_model = whisper.transcribe(model, path, fp16 = False) # Get transcript

In [None]:
# Split the transcript into sentences and persist them as a CSV file
transcript = t_model['text']
sentences = nltk.sent_tokenize(transcript)

# One row per sentence, built in a single chain:
#   title - the video title, repeated on every row
#   token - len() of the sentence string, i.e. a character count
#   index - the sentence's position, produced by reset_index()
trans_df = (
    pd.DataFrame({'content': sentences})
    .assign(
        title=youtube_video.title,
        token=lambda frame: frame['content'].apply(len),
    )
    .reset_index()
    .loc[:, ['title', 'index', 'content', 'token']]
)

# Write the table to disk without the DataFrame's own index
trans_df.to_csv('video_text_long.csv', index=False)

# Display the resulting table
trans_df

### Read Transcribed CSV

In [7]:
# Reload the saved transcript. header=0 combined with names= replaces the header
# row stored in the file with the names given here, so (for a file written by the
# cell above) the CSV's 'index' column is exposed as 'heading'.
df = pd.read_csv('video_text_long.csv', header=0, names=["title", "heading", "content", "token"])
print(f"{len(df)} rows in the data.")
df.head(5)

2789 rows in the data.


Unnamed: 0,title,heading,content,token
0,"Jordan Peterson: Life, Death, Power, Fame, and...",0,"battle not with monsters, lest ye become a mo...",52
1,"Jordan Peterson: Life, Death, Power, Fame, and...",1,"And if you gaze into the abyss, the abyss gaze...",62
2,"Jordan Peterson: Life, Death, Power, Fame, and...",2,Right.,6
3,"Jordan Peterson: Life, Death, Power, Fame, and...",3,"But I would say, bring it on.",29
4,"Jordan Peterson: Life, Death, Power, Fame, and...",4,"If you gaze into the abyss long enough, you se...",76


## Fine Tuning GPT3

### Setup OpenAI API

In [27]:
# Read the API key from the environment so the secret never has to be pasted
# into (and accidentally committed with) the notebook. Falls back to an empty
# string, the previous placeholder value, when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")

openai.api_key = OPENAI_API_KEY
# Embedding model used by get_embedding() unless a caller overrides it.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [10]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    Parameters:
        text: the string to embed; None short-circuits without an API call.
        model: name of the embedding model to use.

    Returns:
        The embedding taken from the first entry of the response's "data",
        None when `text` is None, or the string "Rate Limit Reached" when the
        request fails. Callers in this notebook detect failure by checking
        whether the result is a string.
    """
    if text is None:
        return None

    try:
        result = openai.Embedding.create(
          model=model,
          input=text
        )
        return result["data"][0]["embedding"]
    except Exception:
        # Any request failure is reported through the same sentinel, not only
        # rate limiting. Catching Exception rather than using a bare except lets
        # KeyboardInterrupt / SystemExit propagate, so the kernel can still be
        # interrupted during a long embedding run.
        return "Rate Limit Reached"

### Calculate Embedding

In [23]:
def compute_doc_embeddings(df: pd.DataFrame, label:str, max_retries: int = 1, retry_delay: float = 60):
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    Parameters:
        df: dataframe holding the text to embed.
        label: name of the column whose text is embedded.
        max_retries: how many times a failed request is retried per row.
        retry_delay: seconds to wait before each retry.

    Returns:
        A dictionary that maps each row index to that row's embedding. Rows whose
        text is missing, or whose request still fails after the retries, map to
        None so that vector_similarity() scores them as -1 instead of crashing.
    """
    embeddings_dict = {}

    # total= lets tqdm show a real progress bar; iterrows() alone has no length.
    for idx, r in tqdm(df.iterrows(), total=len(df)):
        text = r[label]

        # Missing cells (None, or NaN as produced by read_csv) cannot be embedded;
        # skip them without spending an API call or a retry delay.
        if text is None or (isinstance(text, float) and pd.isna(text)):
            embeddings_dict[idx] = None
            continue

        embedding = get_embedding(text)

        # get_embedding signals failure with a string sentinel; wait and retry.
        attempts = 0
        while isinstance(embedding, str) and attempts < max_retries:
            time.sleep(retry_delay)
            embedding = get_embedding(text)
            attempts += 1

        # Never store the error string as if it were a vector.
        if isinstance(embedding, str):
            embedding = None

        embeddings_dict[idx] = embedding

    return embeddings_dict

In [24]:
# Embed every row of df's "content" column; compute_doc_embeddings calls
# get_embedding once per row (plus a retry on failure), so this cell is slow.
document_embeddings = compute_doc_embeddings(df, label="content")

2789it [08:20,  5.57it/s]


In [25]:
# Save embeddings as pickle
with open('doc_embeddings_long.pickle', 'wb') as f:
    pickle.dump(document_embeddings, f, protocol= pickle.HIGHEST_PROTOCOL)

### Finding Similar Documents Using Embedding

In [32]:
def vector_similarity(x, y):
    """
    Score how similar two embedding vectors are.

    The score is the dot product of the two vectors. OpenAI embeddings are
    normalized to length 1, so for them the dot product equals the cosine
    similarity. If either vector is None the score is -1.
    """
    # Guard clause: a missing vector on either side cannot be compared.
    if x is None or y is None:
        return -1

    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed the supplied query and compare it against all of the pre-calculated
    document embeddings to find the most relevant sections.

    Parameters:
        query: the question text to embed.
        contexts: dictionary mapping a document index to its embedding.

    Returns:
        A list of (similarity, document index) tuples sorted by similarity in
        descending order. Documents without a usable embedding score -1.

    Raises:
        RuntimeError: if the query itself could not be embedded.
    """
    query_embedding = get_embedding(query)

    # get_embedding reports failure with a string sentinel. Passing that string
    # on to np.dot fails with an obscure error, so stop here with a clear one.
    if isinstance(query_embedding, str):
        raise RuntimeError(f"Could not embed the query: {query_embedding}")

    document_similarities = sorted(
        (
            (
                vector_similarity(
                    query_embedding,
                    # A stored error sentinel is treated like a missing embedding.
                    None if isinstance(doc_embedding, str) else doc_embedding,
                ),
                doc_index,
            )
            for doc_index, doc_embedding in contexts.items()
        ),
        reverse=True,
    )

    return document_similarities

In [33]:
# Sanity check: rank every embedded sentence against a sample question.
# The result is a list of (similarity, row index) tuples, highest similarity first.
order_document_sections_by_query_similarity("How did Jordan Peterson feel about Trudeau?", document_embeddings)

[(0.8672889335434995, 1407),
 (0.8570718092777034, 2468),
 (0.8564087270322823, 10),
 (0.8546067956623602, 2784),
 (0.8422289730884449, 2722),
 (0.8390220212961363, 430),
 (0.8386862270058134, 7),
 (0.8288249801052165, 2693),
 (0.8261779929300056, 1071),
 (0.8247258249099796, 1108),
 (0.8228813576605243, 971),
 (0.8189923755620091, 1762),
 (0.8159812624847416, 1002),
 (0.8143890026828391, 995),
 (0.8139186286673459, 1000),
 (0.8137583291342703, 1423),
 (0.8090831177981771, 1081),
 (0.8087218969099059, 2367),
 (0.8072353733985596, 998),
 (0.8004164941764644, 2240),
 (0.8000474706470659, 2778),
 (0.797280488430997, 431),
 (0.796654845964748, 1089),
 (0.7964491718909308, 1277),
 (0.791293151473405, 1082),
 (0.7911229642062002, 1005),
 (0.7878200176399877, 1200),
 (0.7862886309823908, 1083),
 (0.7861274501585881, 2237),
 (0.7860184463575308, 2663),
 (0.7833060629027542, 1239),
 (0.7829754404910987, 1605),
 (0.782630120507937, 721),
 (0.7819075073229544, 1216),
 (0.7817153216807223, 1075),


### Construct Prompt

In [46]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that `string` encodes to under the named tiktoken encoding."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame, label:str, 
                     MAX_SECTION_LEN = 1000, SEPARATOR = "\n* ", ENCODING = "gpt2", debug=False):
    """
    Select the document sections most relevant to `question`, within a token budget.

    Parameters:
        question: the question used to rank the sections.
        context_embeddings: dictionary mapping a row index of `df` to its embedding.
        df: dataframe holding the section text.
        label: name of the column in `df` that holds the section text.
        MAX_SECTION_LEN: token budget for the selected sections, separators included.
        SEPARATOR: string prepended to every selected section.
        ENCODING: name of the tiktoken encoding used to count tokens.
        debug: when True, print how many sections were chosen and their indexes.

    Returns:
        A list of strings, each being SEPARATOR followed by one section's text
        with newlines replaced by spaces, ordered from most to least relevant.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    # Load the encoding once and reuse it for the separator and every section.
    encoding = tiktoken.get_encoding(ENCODING)
    separator_len = len(encoding.encode(SEPARATOR))

    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.loc[section_index, label]

        # Missing cells come back from read_csv as NaN (a float), not None, so an
        # `is None` check lets them through to the tokenizer. Skip anything that
        # is not text.
        if not isinstance(document_section, str):
            continue

        chosen_sections_len += len(encoding.encode(document_section)) + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            # Stop at the first section that would exceed the budget.
            break

        chosen_sections.append(SEPARATOR + document_section.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    if debug:
        print(f"Selected {len(chosen_sections)} document sections:")
        print("\n".join(chosen_sections_indexes))

    return chosen_sections

In [59]:
# Compute the transcript and query embeddings
question_prompt = "What does the person think of religion?"
max_sentences = 10

context = construct_prompt(question_prompt, document_embeddings, df, label="content", MAX_SECTION_LEN = 1000)

prompt = f"Instructions:\nAnswer the question as truthfully as possible using the conversation snippets of a podcast. "
prompt += f"If the answer is not contained within the text below, say 'I dont know.'. "
prompt += f"Limit your answer to a paragraph of {max_sentences} sentences. "
prompt += f"\n\nContext: {''.join(context)}"
prompt += f"\n\nQuestion: {question_prompt}"
prompt += f"\n\nAnswer: "

print(prompt)

Instructions:
Answer the question as truthfully as possible using the conversation snippets of a podcast. If the answer is not contained within the text below, say 'I dont know.'. Limit your answer to a paragraph of 10 sentences. 

Context: 
* What's your religion?
* What's the faith?
* Well, what makes you think you're secular then?
* And you know, a secular person will say, well, I don't believe in God.
* I'm secular.
* Do you believe in a transcendent reality?
* What do you think is the meaning of it all?
* Well, it posits a transcendent reality.
* This is the belief.
* I do believe that he thinks of himself as a ballwork for christened them against the degeneration of the West.
* So the reason I was saying that while we were talking about belief in God, it's like, this is a very complicated topic, right?
* Even secular people go to museums?
* Well, I guess it's the divine that saves the world.
* He's favored of God.
* Who is God then?
* You want to know what I think?
* Do you belie

### Answer Prompt

In [60]:
# Send the assembled prompt to the chat model as a single user message.
response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                        {"role": "system", "content": "You are an AI assistant that summarizes podcasts"},
                        {"role": "user", "content": prompt},
                    ]
            )
choices = response.choices
if not choices:
    # Show the raw response for debugging, then fail here. Without the raise,
    # the failure would only surface later as a NameError on `content`.
    print(response)
    raise RuntimeError("No choices returned from GPT-3 API using model 'gpt-3.5-turbo'")

# The answer text is the first choice's message; also keep the token usage.
content = choices[0].message.content
total_tokens = response.usage.total_tokens

In [61]:
# Show the answer text taken from the first choice of the chat completion.
print(content)

It's a complicated topic and the person thinks deeply about it. They have discussed belief in God and the transcendent reality, as well as the meaning of it all. While they consider themselves secular, they acknowledge the possibility of a transcendent reality and think that some religious images and ideas have a metaphorical truth. They have a sense that religious systems have become propositionally dubious, but they also see the importance of making a fundamental statement of faith. They think that an ethic is embedded in religion and that it can provide a sense of adventure, which they see as lacking in a cynical or escapist approach to life. Overall, the person seems to have a nuanced and open-minded view of religion as a multifaceted and complex phenomenon that is difficult to fully grasp or simplify.
