## Load Transcripts into individual LangChain Docs

In [7]:
from langchain.document_loaders import JSONLoader, DirectoryLoader
import jq
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy selected fields of one transcript record into the document metadata.

    Args:
        record: One JSON record. Fields are read with ``dict.get``, so a field
            that is absent from the record is stored as ``None``.
        metadata: Metadata dict handed in by the loader; it is updated in place.

    Returns:
        The same ``metadata`` object, with the five transcript keys set.
    """
    # metadata key -> name of the field in the source record
    field_for_key = {
        "video_id": 'video_id',
        "episode_title": "Episode Title",
        "guest": "Guest",
        "video_url": "URL",
        "date_posted": "Date",
    }
    for meta_key, record_field in field_for_key.items():
        metadata[meta_key] = record.get(record_field)

    return metadata

# Read every *.json file under Transcripts/ with a JSONLoader. The jq expression
# '.[]' selects each element of a file's top-level collection, the "text" field
# of an element supplies the page content, and metadata_func fills the metadata.
loader = DirectoryLoader(
    'Transcripts/',
    glob='*.json',
    loader_cls=JSONLoader,
    loader_kwargs=dict(
        jq_schema='.[]',
        content_key="text",
        metadata_func=metadata_func,
    ),
    show_progress=True,
)

# One LangChain Document per selected record.
docs = loader.load()

100%|██████████| 1/1 [00:00<00:00, 18.26it/s]


Create the naive text splitter, which splits each transcript into chunks of at most 1000 tokens.

In [10]:
import tiktoken 
# Load the cl100k_base encoding once at module level; tiktoken_len below reuses it.
tokenizer = tiktoken.get_encoding('cl100k_base')

#Create function to check token length
def tiktoken_len(text):
    """Return the number of tokens ``text`` encodes to with the module-level ``tokenizer``.

    An empty ``disallowed_special`` is passed to ``encode`` so that no
    special-token string appearing in the text is treated as disallowed.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunk by token count: at most 1000 tokens per chunk, no overlap between chunks.
# encoding_name is given explicitly because from_tiktoken_encoder otherwise
# measures length with its default "gpt2" encoding, which disagrees with the
# cl100k_base tokenizer loaded earlier in this notebook.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name='cl100k_base', chunk_size=1000, chunk_overlap=0
)


Create the System Message for GPT-4 to perform the smart chunking.

In [65]:
def create_system_message(text_type, text_title):
    system_message = f'''Given some text, which is part of a {text_type} from {text_title}, your goal is to split the text in half so that no thought or topic is cutoff and the split is performed at the end of a complete topic.

        You will be given steps to follow until the final desired result is achieved. 

        Some important things to note:
        - '!MIDPOINT!' denotes the midpoint of the text.
        - To complete this task effectively you must adhere to strictly to the instructions at each step
        - You must be exact with your output, whenever providing words from the text, copy exactly what is written, even if there are missing words, repeated words or misspellings, it does not matter.

        Step 1 - Determine what the main topics are before and after the !MIDPOINT! label. Main topics are overall topics to which the text is about, not brief things that are mentioned in passing. Create a list of topics as such:
        Before Midpoint = Topic 1, Topic 2, Topic 3
        After Midpoint = Topic 4, Topic 5, Topic 6

        Step 2 - Based on the the lists of Topic Labels you have created identify if either of these conditions are true. 
        - The last topic of the first section is semantically related to the first topic of the second section
        - The last topic of the first section continues on past the !MIDPOINT!. 

        Step 3 - Depending on the condition that you have identified from Step 2, decide which of the following course of actions must be taken:
        - If there was semantic overlap between the two topics then the words you split on should be located at the conclusion of overlapping topics. When the transition to the next semantically unrelated topic begins.
        - If there is a continuation of the last topic of the first section past the !MIDPOINT! then the text needs to be split where this continuation ends.
        - If neither of these conditions where meet then you must check if the current location of the !MIDPOINT! interrupts the completion of a topic, if this is the case then the split point should be where the interrupted topic concludes. 
        
        
        ​​Step 4- Based on the course of action you have identified in Step 3, perform this course of action and locate a small set of exact words on which to split the text. The words must be exact and should not be long. If there appears to be puncutation present in the text, the location to split should always be after the completion of a sentence.

        Step 5 - Given the exact words on where to split reorganize the topics so that they match the new sections which are determined by the split location.
        For example if in the example from Step 1, Topic 3 and Topic 4 have overlap then the lists would now be as follows:
        Before Split = Topic 1, Topic 2, Topic 3, Topic 4
        After Split = Topic 5, Topic 6

        Final Step - Now that we know where the text should be split and the new organization of topics, provide the final output which is a python dictionary with 3 key-value pairs:
        - "before_split_topics" will be the 'Before Split' list that you identified in Step 3.
        -  "after_split_topics" will be the 'After Split' list that you identified in Step 3.
        - "split_key" will be the exact words that identify where the text should be split.
        An example of what the final output should look like structurally:

        {{"before_split_topics : ["Topic 1", "Topic 2", "Topic 3", "Topic 4",], "after_split_topics" : ["Topic 5", "Topic 6"],"split_key" : "split the text here"}}

        Begin!'''
    
    return system_message

Create the User Message for GPT-4 to perform the smart chunking.

In [36]:
from langchain.prompts import PromptTemplate
def create_user_message(bigtext):
    """Build the user prompt that wraps the text to split.

    Args:
        bigtext: The two neighbouring chunks joined around the '!MIDPOINT!' marker.

    Returns:
        The prompt string: ``bigtext`` after a "TEXT:" label, followed by a
        reminder of the instructions.
    """
    template_text = "TEXT: \n  {bigtext} \n Remember follow the outlined 6 step plan. The location you decide to split on should be near the !MIDPOINT!. If there appears to be puncutation present in the text, the location to split should always be after the completion of a sentence. Write the out the result of each step and then the final output: "
    prompt = PromptTemplate.from_template(template_text)
    return prompt.format(bigtext = bigtext)


The main function, which orchestrates the smart chunking and enriches each chunk with its list of topics.

In [60]:
from langchain.schema import SystemMessage, HumanMessage
from langchain.chat_models import ChatOpenAI
import time
import ast

#Takes in the transcript as a singular LangChain Document.
def smart_chunking(doc):
    """Split a transcript into chunks whose boundaries fall at topic changes.

    The document is first split naively with the module-level ``splitter``.
    Each pair of neighbouring chunks is then sent to GPT-4, which proposes a
    ``split_key`` (exact words marking where the boundary should be) and a
    topic list for each side. The pair is re-cut immediately before the
    occurrence of ``split_key`` closest to the naive boundary, and the topic
    lists are stored under the ``"Topics"`` metadata key of the two chunks.

    If a reply cannot be parsed, or its ``split_key`` cannot be located in the
    pair, that pair keeps its naive boundary and its metadata is left
    untouched, instead of being cut at a bogus position.

    Args:
        doc: A single LangChain Document holding the whole transcript; its
            metadata must contain ``"episode_title"``.

    Returns:
        The list of chunk Documents produced by ``splitter``, modified in
        place. The list is empty when the splitter produced no chunks.
    """
    #First we perform a naive split on the LangChain Doc using a token length of 1000.
    documents = splitter.split_documents([doc])
    if not documents:
        # Nothing to refine (e.g. empty transcript); avoids indexing documents[0].
        return documents

    #Get the Title of the Text so the LLM has some context
    text_title = documents[0].metadata["episode_title"]

    #Give the Type of Text for context
    text_type = "Transcript" #User should edit this
    chat = ChatOpenAI(model="gpt-4", temperature=0)

    # The system message only depends on the title and type, so build it once.
    sys_message = create_system_message(text_title=text_title, text_type=text_type)

    for i in range(len(documents) - 1):
        print(f"Index = {i}")
        first = documents[i].page_content
        second = documents[i+1].page_content
        full_text = first + ' !MIDPOINT! ' + second

        messages = [
            SystemMessage(content = sys_message),
            HumanMessage(content = create_user_message(full_text))
        ]
        response = chat(messages)
        output = response.content
        # Pause after every request, whether or not the reply turns out to be usable.
        time.sleep(1)

        print(output)

        boundary_dict = _extract_boundary_dict(output)
        if boundary_dict is None:
            print(f"Could not parse a result dictionary for index {i}; keeping the naive split.")
            continue

        total_text = first + ' ' + second

        # len(first) is where the naive boundary sits inside total_text.
        end_index_first_chunk = _locate_split(total_text, boundary_dict['split_key'], len(first))
        print(end_index_first_chunk)
        if end_index_first_chunk <= 0:
            # -1: the model did not quote the text exactly; 0: the first chunk would be empty.
            print(f"split_key not usable for index {i}; keeping the naive split.")
            continue

        new_first_chunk = total_text[0:end_index_first_chunk]
        new_second_chunk = total_text[end_index_first_chunk:]

        print(f"Old first: \n {first} \n New first: \n {new_first_chunk} \n Old Second: \n {second} \n New Second: \n {new_second_chunk}")

        documents[i].page_content = new_first_chunk
        documents[i].metadata["Topics"] = boundary_dict['before_split_topics']
        documents[i+1].page_content = new_second_chunk
        documents[i+1].metadata["Topics"] = boundary_dict['after_split_topics']
    return documents


def _extract_boundary_dict(output):
    """Parse the last ``{...}`` span of the model reply; return None if it is not a usable result dict."""
    closing = output.rfind('}')
    if closing == -1:
        return None
    # Search backwards from the final '}' so braces in the step-by-step
    # reasoning that precedes the final answer are ignored.
    opening = output.rfind('{', 0, closing)
    if opening == -1:
        return None
    try:
        parsed = ast.literal_eval(output[opening:closing + 1])
    except (ValueError, SyntaxError):
        return None
    required_keys = ('before_split_topics', 'after_split_topics', 'split_key')
    if not isinstance(parsed, dict) or any(key not in parsed for key in required_keys):
        return None
    return parsed


def _locate_split(text, split_key, midpoint):
    """Return the start index of the occurrence of ``split_key`` nearest to ``midpoint``, or -1 if there is none."""
    if not isinstance(split_key, str) or not split_key:
        return -1
    best = -1
    position = text.find(split_key)
    while position != -1:
        if best == -1 or abs(position - midpoint) < abs(best - midpoint):
            best = position
        position = text.find(split_key, position + 1)
    return best

In [None]:
neil_chunks = smart_chunking(docs[0])

# Flatten every chunk into one JSON-serialisable record: its text plus its
# metadata, minus the 'source' entry (which is removed from the chunk itself).
final_doc_list = []
for piece in neil_chunks:
    piece.metadata.pop('source')
    final_doc_list.append({'text' : piece.page_content, **piece.metadata})

import json
with open('LangChain Documents/episode_1904.json', 'w') as out_file:
    json.dump(final_doc_list, out_file)

In [None]:
mark_chunks = smart_chunking(docs[2])

# Flatten every chunk into one JSON-serialisable record: its text plus its
# metadata, minus the 'source' entry (which is removed from the chunk itself).
final_doc_list = []
for piece in mark_chunks:
    piece.metadata.pop('source')
    final_doc_list.append({'text' : piece.page_content, **piece.metadata})

import json
with open('LangChain Documents/episode_1863.json', 'w') as out_file:
    json.dump(final_doc_list, out_file)

In [None]:
elon_chunks = smart_chunking(docs[1])

# Flatten every chunk into one JSON-serialisable record: its text plus its
# metadata, minus the 'source' entry (which is removed from the chunk itself).
final_doc_list = []
for piece in elon_chunks:
    piece.metadata.pop('source')
    final_doc_list.append({'text' : piece.page_content, **piece.metadata})

import json
with open('LangChain Documents/episode_1470.json', 'w') as out_file:
    json.dump(final_doc_list, out_file)