## Install and import packages 

In [1]:
# ! pip install langchain==0.0.343
# ! pip install openai==1.3.6

In [2]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, AIMessage, SystemMessage

In [3]:
import os
import openai
openai.api_key = os.environ['OPENAI_API_KEY']

## Generate responses

In [4]:
chat = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
response = chat([HumanMessage(content="Hello Langchain!")])
print(response)


content='Hello! How can I assist you today?'


In [5]:
response_human = chat.predict_messages([HumanMessage(content="what is 1+1")])
print(response_human)

content='1+1 equals 2.'


In [6]:
response_system = chat.predict_messages([
    SystemMessage(content="You are an AI chatbot that does not now machine learning. You should answer 'I don't know' when you are asked about machine learnnig "),
    HumanMessage(content="what is machine learning")
    ])
print(response_system)

content="I don't know"


## Data Connection

### Load Documents

In [7]:
from langchain.document_loaders import TextLoader

loader = TextLoader('docs/Advantages_of_Langchain.md')
docs = loader.load()
print(docs)

[Document(page_content='# Advantages of LangChain\n\n## 1. Connect LLMs to our own data\nRecently, there are many use cases in LLMs. However, LLMs may generate answers that do not meet our expectations in some use cases. As a result, we can solve this problem by using LangChain to connect LLMs to our own data. Therefore, LLMs could do referencing from our data.\n\n## 2. Combine LLMs on doing different tasks in one use case\nFor example, we can use GPT-4 to interpret our queries in a specific case and use Claud-3 to response to those queries. By combining two LLMs in different needs, we could have a better model in some use cases.\n\n## 3. Split the text to suitable length\nAs we known, LLMs usually have limited imput token length. LangChain allows us to split a long document into different sections and input to LLMs. Moreover, LLMs can recognize patterns in the texts. Therefore, text-splitting is very efficient.\n\n\n## 4. Control the format of LLM responses\nWe can provide an example 

### Split Documents

In [8]:
from langchain.text_splitter import CharacterTextSplitter

# character splitting
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

split_docs = text_splitter.split_documents(docs)
print(split_docs)

[Document(page_content='# Advantages of LangChain\n\n## 1. Connect LLMs to our own data\nRecently, there are many use cases in LLMs. However, LLMs may generate answers that do not meet our expectations in some use cases. As a result, we can solve this problem by using LangChain to connect LLMs to our own data. Therefore, LLMs could do referencing from our data.\n\n## 2. Combine LLMs on doing different tasks in one use case\nFor example, we can use GPT-4 to interpret our queries in a specific case and use Claud-3 to response to those queries. By combining two LLMs in different needs, we could have a better model in some use cases.\n\n## 3. Split the text to suitable length\nAs we known, LLMs usually have limited imput token length. LangChain allows us to split a long document into different sections and input to LLMs. Moreover, LLMs can recognize patterns in the texts. Therefore, text-splitting is very efficient.', metadata={'source': 'docs/Advantages_of_Langchain.md'}), Document(page_c

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# code spilitting
PYTHON_CODE = """
def hello_lanchain():
    print("Hello, Langchain!")

# Call the funciton
hello_langchain()
"""

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, 
    chunk_size=50, 
    chunk_overlap=0
)

python_docs = python_splitter.create_documents([PYTHON_CODE])
print(python_docs)

[Document(page_content='def hello_lanchain():'), Document(page_content='print("Hello, Langchain!")'), Document(page_content='# Call the funciton\nhello_langchain()')]


In [10]:
# ! pip install unstructured
# ! pip install markdown

In [11]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

markdown_example = "# Chapter 1\n\n  ## Section 1\n\n This is section 1 ## Section 2\n\n This is section 2"

# Markdown file splitting
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3")
]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
splits = splitter.split_text(markdown_example)
print(splits)

[Document(page_content='This is section 1 ## Section 2  \nThis is section 2', metadata={'Header 1': 'Chapter 1', 'Header 2': 'Section 1'})]


In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive character splitting
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    length_function=len,
)

texts = text_splitter.split_documents(docs)
print(texts)

[Document(page_content='# Advantages of LangChain', metadata={'source': 'docs/Advantages_of_Langchain.md'}), Document(page_content='## 1. Connect LLMs to our own data', metadata={'source': 'docs/Advantages_of_Langchain.md'}), Document(page_content='Recently, there are many use cases in LLMs. However, LLMs may generate answers that do not meet our expectations in some use cases. As a result, we can solve this problem by using LangChain to', metadata={'source': 'docs/Advantages_of_Langchain.md'}), Document(page_content='we can solve this problem by using LangChain to connect LLMs to our own data. Therefore, LLMs could do referencing from our data.', metadata={'source': 'docs/Advantages_of_Langchain.md'}), Document(page_content='## 2. Combine LLMs on doing different tasks in one use case', metadata={'source': 'docs/Advantages_of_Langchain.md'}), Document(page_content='For example, we can use GPT-4 to interpret our queries in a specific case and use Claud-3 to response to those queries. By

In [13]:
from langchain.text_splitter import CharacterTextSplitter

# Token splitting
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100,
    chunk_overlap=0
)

split_docs

[Document(page_content='# Advantages of LangChain\n\n## 1. Connect LLMs to our own data\nRecently, there are many use cases in LLMs. However, LLMs may generate answers that do not meet our expectations in some use cases. As a result, we can solve this problem by using LangChain to connect LLMs to our own data. Therefore, LLMs could do referencing from our data.\n\n## 2. Combine LLMs on doing different tasks in one use case\nFor example, we can use GPT-4 to interpret our queries in a specific case and use Claud-3 to response to those queries. By combining two LLMs in different needs, we could have a better model in some use cases.\n\n## 3. Split the text to suitable length\nAs we known, LLMs usually have limited imput token length. LangChain allows us to split a long document into different sections and input to LLMs. Moreover, LLMs can recognize patterns in the texts. Therefore, text-splitting is very efficient.', metadata={'source': 'docs/Advantages_of_Langchain.md'}),
 Document(page_

## Vectorize Document Chunks
Embedding models create vector representations of text fragments. This means we can process text in vector space and perform operations such as semantic search to find the most similar text fragments in the vector space.

In [14]:
# ! pip install tiktoken

In [15]:
from langchain.embeddings import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings()
embedding = embedding_model.embed_documents(
    [
        "Hi!",
        "I am Johnson.",
        "I am great!"
    ]
)
print(embedding)

[[-0.02063614848015033, -0.007340591035842503, -0.014539397511987316, -0.026475109339772086, -0.043411958586483654, 0.021641531766106514, -0.007108579472494051, -0.0031144344322258637, 0.011619917082176438, -0.017516880716219978, 0.028382759868267955, -0.008101073718684498, -0.006364208671435886, -0.024438561894360295, 0.0013550125176492313, -0.018844503500498024, 0.023304283554133484, -0.008171965998533343, 0.017942234395313048, -0.01761999675963648, -0.02724848152804114, 0.0034576182127133938, 0.012019492681738029, -0.00678634137115616, -0.021332183635857007, -0.001889928389856304, 0.003953865568639278, -0.01822580537735372, 0.02411632425868373, -0.03944198346436716, 0.014204269439561039, 0.002136440646810202, -0.01074342885049088, -0.01515809563513162, -0.0007145637033427519, -0.03418305034896323, -0.005439384328737519, -0.004182654755630965, 0.01963076519419414, -0.011897042380180934, 0.02662978340489684, 0.008635990056552892, 0.003322278172898573, -0.00941580699753548, -0.00410209

## Store vector data
Vector data storage, also known as vector databases, is responsible for storing vector representations of text embeddings and providing vector retrieval capabilities. Langchain offers several open-source or commercial vector data storage options, including Chroma, FAISS, Pinecone, and others.

In [16]:
# ! pip install -q chromadb

### Storage

In [17]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
document = text_splitter.split_documents(docs)
db = Chroma.from_documents(document, OpenAIEmbeddings())

### Retrieval

In [18]:
query = "what is Langchain?"
doc = db.similarity_search(query)
print(doc[0].page_content)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


# Advantages of LangChain

## 1. Connect LLMs to our own data
Recently, there are many use cases in LLMs. However, LLMs may generate answers that do not meet our expectations in some use cases. As a result, we can solve this problem by using LangChain to connect LLMs to our own data. Therefore, LLMs could do referencing from our data.

## 2. Combine LLMs on doing different tasks in one use case
For example, we can use GPT-4 to interpret our queries in a specific case and use Claud-3 to response to those queries. By combining two LLMs in different needs, we could have a better model in some use cases.

## 3. Split the text to suitable length
As we known, LLMs usually have limited imput token length. LangChain allows us to split a long document into different sections and input to LLMs. Moreover, LLMs can recognize patterns in the texts. Therefore, text-splitting is very efficient.
