In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# This is the YouTube video we're going to use.
YOUTUBE_VIDEO = "https://www.youtube.com/watch?v=cdiD-9MMpb0"

## Setting up the model


In [2]:
from langchain_openai.chat_models import ChatOpenAI

model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

Test the model by asking a simple question.

In [3]:
model.invoke("What MLB team won the World Series during the COVID-19 pandemic?")

AIMessage(content='The Los Angeles Dodgers won the World Series during the COVID-19 pandemic in 2020. They defeated the Tampa Bay Rays in six games to capture their first championship since 1988.', response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 21, 'total_tokens': 59}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-4dc62e12-630e-467b-bdf4-1aa33e86c20f-0')

In [4]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser
chain.invoke("What MLB team won the World Series during the COVID-19 pandemic?")

'The Los Angeles Dodgers won the World Series during the COVID-19 pandemic. They defeated the Tampa Bay Rays in the 2020 World Series to win their first championship since 1988.'

In [5]:
from langchain.prompts import ChatPromptTemplate


template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt.format(context="Mary's sister is Susana", question="Who is Mary's sister?")

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Mary\'s sister is Susana\n\nQuestion: Who is Mary\'s sister?\n'

In [6]:
chain = prompt | model | parser
chain.invoke({
    "context": "Mary's sister is Susana",
    "question": "Who is Mary's sister?"
})

'Susana'

## Combining chains

We can combine different chains to create more complex workflows. For example, let's create a second chain that translates the answer from the first chain into a different language.

Let's start by creating a new prompt template for the translation chain:

In [7]:
translation_prompt = ChatPromptTemplate.from_template(
    "Translate {answer} to {language}"
)

In [8]:
from operator import itemgetter

translation_chain = (
    {"answer": chain, "language": itemgetter("language")} | translation_prompt | model | parser
)

translation_chain.invoke(
    {
        "context": "Mary's sister is Susana. She doesn't have any more siblings.",
        "question": "How many sisters does Mary have?",
        "language": "Spanish",
    }
)

'María tiene una hermana.'

In [13]:

import PyPDF2

def pdf_to_text(pdf_path, txt_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as pdf_file:
        # Create a PDF reader object
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Open the text file in write mode
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            # Iterate over each page
            for page in range(len(pdf_reader.pages)):
                # Extract the text from the page
                text = pdf_reader.pages[page].extract_text()
                
                # Write the text to the file
                txt_file.write(text)

    print("PDF to text conversion completed.")

# Specify the paths of the PDF file and the output text file
pdf_file_path = '1.pdf'
text_file_path = '1.txt'

# Call the function to convert PDF to text
pdf_to_text(pdf_file_path, text_file_path)


PDF to text conversion completed.


In [14]:
with open("1.txt") as file:
    transcription = file.read()

transcription[:100]

'CANCER IN VICTORIA 2022  \nVICTORIAN CANCER REGISTRY2 3© Cancer Council Victoria 2023\nCover photo: Dr'

## Splitting the transcription



In [15]:
from langchain_community.document_loaders import TextLoader

loader = TextLoader("1.txt")
text_documents = loader.load()
text_documents

[Document(page_content='CANCER IN VICTORIA 2022  \nVICTORIAN CANCER REGISTRY2 3© Cancer Council Victoria 2023\nCover photo: Dr Omer Gilan, Laboratory Head at Monash University and researcher at The Australian \nCentre for Blood Diseases. \nSuggested citation:  \nVictorian Cancer Registry. Cancer in Victoria, 2022. Cancer Council Victoria, 2023.\nPublished by Cancer Council Victoria, Level 8, 200 Victoria Parade, East Melbourne, 3002.\nFor enquiries please contact vcr@cancervic.org.au \nFor more detailed data, including access to the VCR Data Explorer  \nand downloadable data, please visit our website:  \nwww.cancervic.org.au/research/vcr\nAcknowledgment of Country  \nWe acknowledge the Traditional Custodians of the land and water ways on which we work and live,  \nand pay our respects to the Elders past and present and those emerging.\nThe Victorian Cancer Registry expresses appreciation to the Victorian Aboriginal Community Controlled Health \nOrganisation Inc (VACCHO) for their valua

There are many different ways to split a document. For this example, we'll use a simple splitter that splits the document into chunks of a fixed size. Check [Text Splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/) for more information about different approaches to splitting documents.

For illustration purposes, let's split the transcription into chunks of 100 characters with an overlap of 20 characters and display the first few chunks:

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
text_splitter.split_documents(text_documents)[:5]

[Document(page_content='CANCER IN VICTORIA 2022  \nVICTORIAN CANCER REGISTRY2 3© Cancer Council Victoria 2023', metadata={'source': '1.txt'}),
 Document(page_content='Cover photo: Dr Omer Gilan, Laboratory Head at Monash University and researcher at The Australian', metadata={'source': '1.txt'}),
 Document(page_content='Centre for Blood Diseases. \nSuggested citation:', metadata={'source': '1.txt'}),
 Document(page_content='Victorian Cancer Registry. Cancer in Victoria, 2022. Cancer Council Victoria, 2023.', metadata={'source': '1.txt'}),
 Document(page_content='Published by Cancer Council Victoria, Level 8, 200 Victoria Parade, East Melbourne, 3002.', metadata={'source': '1.txt'})]

For our specific application, let's use 1000 characters instead:

In [19]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
documents = text_splitter.split_documents(text_documents)

## Finding the relevant chunks



In [20]:
from langchain_openai.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
embedded_query = embeddings.embed_query("Who is Mary's sister?")

print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:10])

Embedding length: 1536
[-0.001371190081765891, -0.03434698236453119, -0.011476094990116788, 0.0012773800454156574, -0.026166747008526288, 0.009230907949392044, -0.015660022937300136, 0.0017948988196774898, -0.011851335135517721, -0.03324627818637449]


To illustrate how embeddings work, let's first generate the embeddings for two different sentences:

In [21]:
sentence1 = embeddings.embed_query("Mary's sister is Susana")
sentence2 = embeddings.embed_query("Pedro's mother is a teacher")

compute the similarity between the query and each of the two sentences. 

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

query_sentence1_similarity = cosine_similarity([embedded_query], [sentence1])[0][0]
query_sentence2_similarity = cosine_similarity([embedded_query], [sentence2])[0][0]

query_sentence1_similarity, query_sentence2_similarity

(0.9174548954382715, 0.7680495517171395)

## Loading transcription into the vector store

We initialized the vector store with a few random strings. Let's create a new vector store using the chunks from the video transcription.

In [23]:
from langchain_community.vectorstores import DocArrayInMemorySearch
vectorstore2 = DocArrayInMemorySearch.from_documents(documents, embeddings)

Let's set up a new chain using the correct vector store. This time we are using a different equivalent syntax to specify the [`RunnableParallel`](https://python.langchain.com/docs/expression_language/how_to/map) portion of the chain:

In [24]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
chain = (
    {"context": vectorstore2.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)
chain.invoke("What is synthetic intelligence?")

"I don't know."

## Setting up Pinecone



In [25]:
from langchain_pinecone import PineconeVectorStore

index_name = "youtube-index"

pinecone = PineconeVectorStore.from_documents(
    documents, embeddings, index_name=index_name
)

Let's now run a similarity search on pinecone to make sure everything works:

In [26]:
pinecone.similarity_search(" What is the impact of COVID-19 pandemic to cancner dignosis")[:3]

[Document(page_content='examines trends in cancer diagnoses, deaths, survival, \nand prevalence since 1982.  \nThe impact of the COVID-19 pandemic remains \nan important issue, as again this year we see fewer \ndiagnoses than we would have expected to see.  \nThis expectation is based on historical trends in \ncancer diagnoses and deaths and considers changes \nin our Victorian population. It is difficult to determine \nthe true impact, but based on historical trends \nbetween 1982-2019, we estimate that there were \nabout 6,660 fewer diagnoses than expected for the \nperiod 2020-2022. This excludes prostate cancer,  \nfor which historical trends make it difficult to \naccurately assess missed diagnoses. These statistics \nserve as a salient reminder for Victorians to attend \ntheir general practitioner if they have health \nconcerns and for routine health checks. Our statistics \ndemonstrate that early diagnosis of cancer has  \na survival benefit.   \nThe extended impact of COVID-19 

Let's setup the new chain using Pinecone as the vector store:

In [28]:
chain = (
    {"context": pinecone.as_retriever(), "question": RunnablePassthrough()}
    | prompt
    | model
    | parser
)

chain.invoke(" What is the impact of COVID-19 pandemic to cancner dignosis")

'The impact of the COVID-19 pandemic on cancer diagnoses includes a significant decline in new diagnoses compared to historical trends. It is estimated that there were about 6,660 fewer cancer diagnoses than expected for the period 2020-2022, excluding prostate cancer. Additionally, in 2022, there were 614 fewer than anticipated new diagnoses of bowel cancer. The decline in new diagnoses is attributed to the impact of COVID-19 on healthcare systems and individuals seeking medical attention.'