In [2]:
import os
from getpass import getpass

# Never hardcode API keys in the notebook. Keep any key that is already set in
# the environment and prompt (without echo) only for the ones that are missing,
# instead of overwriting them with empty strings.
for key_name in ('OPENAI_API_KEY', 'PINECONE_API_KEY'):
    if not os.environ.get(key_name):
        os.environ[key_name] = getpass(f'Enter {key_name}: ')

In [2]:
# %pip install faiss-cpu
# %pip install pinecone-client
# %pip install langchain-pinecone==0.0.3

In [17]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

import numpy as np
from numpy.linalg import norm

from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

import pinecone 
from langchain.vectorstores import Pinecone
from pinecone import Pinecone, ServerlessSpec    
from langchain_pinecone import PineconeVectorStore

In [4]:
# Source PDF, given relative to the notebook's working directory.
file_path = 'mixed_data/element_of_SL.pdf'

# Splitter configured for chunk_size=500 with no overlap between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
loader = PyPDFLoader(file_path=file_path)

# `data` is the list of chunked documents used by every later cell.
data = loader.load_and_split(text_splitter=text_splitter)
# data

In [5]:
# Show the text of the first chunk to sanity-check PDF extraction and splitting.
data[0].page_content

'Springer Series in Statistics\nTrevor Hastie\nRobert TibshiraniJerome FriedmanSpringer Series in Statistics\nThe Elements of\nStatistical Learning\nData Mining, Inference, and Prediction\nThe Elements of Statistical LearningDuring the past decade there has been an explosion in computation and information tech-'

In [6]:
# Embedding client reused by the later cells (cosine demo, FAISS, Pinecone).
# show_progress_bar=True is presumably what produces the tqdm bars seen in
# the recorded outputs.
embeddings = OpenAIEmbeddings(show_progress_bar=True)

# Embed a throwaway sentence just to inspect what an embedding looks like.
vector1 = embeddings.embed_query('How are you?')

# Length of the returned vector (the recorded run shows 1536).
len(vector1)

  warn_deprecated(
  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.22it/s]


1536

In [1]:
# embeddings.__dict__

In [8]:
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (any sequence or array accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm (all zeros or empty) the similarity is undefined; 0.0 is
        returned instead of dividing by zero and producing ``nan``.

    Raises:
        ValueError: propagated from ``np.dot`` when the shapes do not align.
    """
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        # Guard against 0/0, which would yield nan plus a RuntimeWarning.
        return 0.0
    return np.dot(vec1, vec2) / denominator

In [9]:
# Embed two closely related phrases (in this order) and compare them.
vector1, vector2 = (
    embeddings.embed_query(phrase)
    for phrase in ('machine learning', 'artificial intelligence')
)

cosine = get_cosine(vector1, vector2)
cosine

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.92it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.87it/s]


0.8817139772688366

In [10]:
# Compare 'artificial intelligence' (vector2) with an unrelated phrase to see
# how much lower the similarity gets.
# NOTE(review): 'peperoni' looks like a misspelling of 'pepperoni'; left as is
# because the recorded similarity was computed with this exact string.
vector3 = embeddings.embed_query('peperoni pizza')
# Rebinds `cosine`, replacing the value computed in the previous cell.
cosine = get_cosine(vector2, vector3)
cosine

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.09it/s]


0.7398555101973149

In [11]:
# Build a FAISS vector store from the chunked documents in `data`, using
# `embeddings` to embed them (about 18 s in the recorded run).
index = FAISS.from_documents(data, embeddings)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:18<00:00,  3.66s/it]


In [12]:
# Query the FAISS store directly; the recorded output is a list of
# (Document, relevance score) tuples.
index.similarity_search_with_relevance_scores(
    "What is machine learning?"
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.14it/s]


[(Document(page_content='This is page 1\nPrinter: Opaque this\n1\nIntroduction\nStatistical learning plays a key role in many areas of science, ﬁnance and\nindustry. Here are some examples of learning problems:\n•Predict whether a patient, hospitalized due to a heart attack, will\nhave a second heart attack. The prediction is to be based on demo-\ngraphic, diet and clinical measurements for that patient.\n•Predict the price of a stock in 6 months from now, on the basis of\ncompany performance measures and economic data.', metadata={'source': 'mixed_data/element_of_SL.pdf', 'page': 19}),
  0.7542145837625359),
 (Document(page_content='This is page 389\nPrinter: Opaque this\n11\nNeural Networks\n11.1 Introduction\nIn this chapter we describe a class of learning methods that was developed\nseparately in diﬀerent ﬁelds—statistics and artiﬁcial intelligence—based\non essentially identical models. The central idea is to extract linear com-\nbinations of the inputs as derived features, and th

In [14]:
# Retrieve with maximal marginal relevance (MMR): fetch 20 candidate chunks
# (fetch_k) and keep 10 of them (k). MMR has to be selected through
# `search_type`; a 'maximal_marginal_relevance' entry in search_kwargs is not
# a FAISS search option, so with the default similarity search type it had no
# effect.
retriever = index.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 10, 'fetch_k': 20},
)

llm = ChatOpenAI()

# Question-answering chain that feeds the retrieved chunks to the chat model.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    verbose=True
)

# Callback handler passed to the chain calls below.
handler = StdOutCallbackHandler()

  warn_deprecated(


In [15]:
# `Chain.run` is deprecated (it emitted a deprecation warning in the recorded
# run); `invoke` is its replacement. `invoke` returns a dict, so select
# 'result' to keep displaying only the answer string.
chain.invoke(
    {'query': 'What is machine learning?'},
    config={'callbacks': [handler]}
)['result']

  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.70it/s]




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
This is page 1
Printer: Opaque this
1
Introduction
Statistical learning plays a key role in many areas of science, ﬁnance and
industry. Here are some examples of learning problems:
•Predict whether a patient, hospitalized due to a heart attack, will
have a second heart attack. The prediction is to be based on demo-
graphic, diet and clinical measurements for that patient.
•Predict the price of a stock in 6 months from now, on the basis of
company performance measures and economic data.

This is page 389
Printer: Opaque this
11
Neural Networks
11.1 Introduction
In this chapter we describe a class of learning methods that was developed
separately in diﬀerent ﬁelds—statistic

'Machine learning is a field that involves developing algorithms and models that allow computers to learn from data and make predictions or decisions without being explicitly programmed to do so. It is a subset of artificial intelligence that focuses on the development of computer programs that can access data and use it to learn for themselves. The goal is to enable computers to learn from data and improve over time without human intervention.'

In [28]:
# pinecone.init(
#     api_key=PINECONE_API_KEY,  # find at app.pinecone.io
#     environment=PINECONE_ENV  # next to api key in console
# )


In [31]:
# Name of the Pinecone index that receives the document embeddings.
# NOTE(review): presumably this index must already exist in the Pinecone
# project and match the embedding dimension; confirm before running.
index_name = "langchain-demo"

# Embed the chunks in `data` and store them in the Pinecone index through the
# langchain_pinecone integration.
db = PineconeVectorStore.from_documents(
    data,
    embedding=embeddings,
    index_name=index_name,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.98s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.52s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.23s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [33]:
# Same question-answering chain as before, now backed by the Pinecone store's
# default retriever. Rebinds `chain`, replacing the FAISS-backed chain.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    verbose=True
)

# `Chain.run` is deprecated; `invoke` is its replacement. `invoke` returns a
# dict, so select 'result' to keep displaying only the answer string.
chain.invoke(
    {'query': 'What is machine learning?'},
    config={'callbacks': [handler]}
)['result']



[1m> Entering new RetrievalQA chain...[0m


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.34it/s]




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
This is page 1
Printer: Opaque this
1
Introduction
Statistical learning plays a key role in many areas of science, ﬁnance and
industry. Here are some examples of learning problems:
•Predict whether a patient, hospitalized due to a heart attack, will
have a second heart attack. The prediction is to be based on demo-
graphic, diet and clinical measurements for that patient.
•Predict the price of a stock in 6 months from now, on the basis of
company performance measures and economic data.

This is page 389
Printer: Opaque this
11
Neural Networks
11.1 Introduction
In this chapter we describe a class of learning methods that was developed
separately in diﬀerent ﬁelds—statistic

'Machine learning is a field of study that focuses on developing algorithms and statistical models that allow computer systems to learn from and make predictions or decisions based on data without being explicitly programmed to do so. It involves creating systems that can automatically learn and improve from experience without being explicitly programmed for every task. It is used in various applications such as predicting stock prices, identifying patterns in data, image recognition, and more.'