In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

In [None]:
# An OpenAI API key is required; you will need to create an OpenAI account to get one.
# Account page: https://platform.openai.com/account/billing/overview
#
# The key is never written into the notebook: an OPENAI_API_KEY that is
# already present in the environment is left untouched, otherwise the key is
# requested through a hidden prompt so it does not end up in the saved file.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# Location of the PDF to index, kept in one place so it is easy to change.
PDF_PATH = '/content/Data-Science-Notes.pdf'
reader = PdfReader(PDF_PATH)

In [None]:
# Display the reader object to confirm the PDF was opened.
reader

<PyPDF2._reader.PdfReader at 0x783bd34584f0>

In [None]:
# Concatenate the extracted text of every page into one string, raw_text.
# Pages for which extract_text() returns nothing (None or '') are skipped.
# The pieces are joined once instead of growing the string with `+=` per page,
# and the page index is not needed, so the pages are iterated directly.
page_texts = (page.extract_text() for page in reader.pages)
raw_text = ''.join(text for text in page_texts if text)

In [None]:
# Preview the first 100 characters of the extracted text as a sanity check.
raw_text[:100]

'Machine Learning Notes\n- Krishna Shinde\nData Science: If is a process of extracting meaningful infor'

In [None]:
# Split the extracted text into smaller, overlapping chunks so that the
# passages used during information retrieval do not hit the token size limits.

CHUNK_SIZE = 1000     # target chunk length, measured with len() (characters)
CHUNK_OVERLAP = 200   # passed as chunk_overlap to the splitter

text_splitter = CharacterTextSplitter(separator="\n", chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len)
texts = text_splitter.split_text(raw_text)

In [None]:
# Number of chunks produced by the splitter.
len(texts)

103

In [None]:
# Inspect the first chunk to check what the split text looks like.
texts[0]

"Machine Learning Notes\n- Krishna Shinde\nData Science: If is a process of extracting meaningful infor-\nmation/knowledge, insight from data by using scienti\x0cc meth-\nods/resources.\nScienti\x0cc Methods/Resources:\n1. Machine Learning(Data sets in the form of CSV, Excel, Mon-\ngoDB, SQLite etc)\n2. Deep Learning(Data sets in the form of images)\n3. Natural Language Processing(NLP)(Data sets in the form of\ntext)\n4. Statistics\n5. Data Visualization(Seaborn, Matplotlib, pandas, Autovis, plotly)\nTypes of Data :\n1.Structured data : Data stored in Excel, CSV etc formats.\n2.Semi-structured data : Data stored in JSON, HTML etc\nformats.\n3.Unstructured Data : Data stored in image, videos, text,\naudio's, pdf etc formats.\nMachine Learning : In the real world, we are surrounded by hu-\nmans who can learn everything from their experiences with their\nlearning capability, and we have computers or machines which\nwork on our instructions. But can a machine also learn from ex-"

In [None]:
# Create the OpenAI embeddings object that will be used to vectorize the text chunks.
# NOTE(review): constructing it does not appear to fetch any vectors by itself;
# the API calls presumably happen when the index is built from `texts` — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Build a FAISS vector store from the text chunks using the embeddings object;
# `docsearch` is what the question cells query with similarity_search().
docsearch = FAISS.from_texts(texts, embeddings)

In [None]:
# Display the vector store object to confirm it was created.
docsearch

<langchain.vectorstores.faiss.FAISS at 0x783bd2e30700>

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# Build the question-answering chain (chain type "stuff") on top of the OpenAI LLM.
llm = OpenAI()
chain = load_qa_chain(llm, chain_type="stuff")

In [None]:
# Look up the chunks most similar to the question, then let the QA chain
# answer the question from those chunks.
query = "Who is author of these notes?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

' The author of these notes is Krishna Shinde.'

In [None]:
# Retrieve the chunks relevant to the question and answer it from them.
query = "What are steps in data science?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

' The steps in data science are: 1. Problem statement/Business statement, 2. Data gathering, 3. Exploratory data analysis (EDA), 4. Feature engineering, 5. Feature selection, 6. Model training (Building), 7. Model Evaluation, 8. Model testing/Optimization/Improvement, 9. Web Development Framework, 10. Project Development.'

In [None]:
# Retrieve the chunks relevant to the question and answer it from them.
query = "What are assumptions of linear regression?"
docs = docsearch.similarity_search(query)
chain.run(
    question=query,
    input_documents=docs,
)

' Linear regression assumes a linear relationship between the dependent and independent variables, small or no multicollinearity between the features, homoscedasticity, normal distribution of error terms, and no autocorrelations.'

In [None]:
# Retrieve the chunks relevant to the question and answer it from them.
# The question text is sent verbatim to both the similarity search and the
# LLM, so the misspelling ("differen") in it has been corrected.
query = "What is the difference between precision and recall?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' Precision is the number of correct outputs provided by the model out of all positive classes that have been predicted correctly by the model. Recall is the number of actual positive classes that have been positively predicted by the model.'

In [None]:
# Retrieve the chunks relevant to the question and answer it from them.
# The acronym was misspelled as "KKN", which the model echoed back in its
# answer; the question now names the KNN classifier correctly.
# Re-run this cell to refresh the recorded answer.
query = "How does the KNN classifier work?"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The KKN classifier works by first selecting the number K of neighbors. Then, it calculates the Euclidean distance of K number of neighbors. It takes the K nearest neighbors as per the calculated Euclidean distance. Among these k neighbors, it counts the number of the data points in each category. It assigns the new data points to that category for which the number of the neighbors is maximum. Finally, it puts the new data point in the required category.'

In [None]:
# Write the versions of all installed packages to requirements.txt
# (overwrites the file if it already exists).
!pip freeze > requirements.txt