## Simple RAG application

In [None]:
import os

from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# --- Configuration -----------------------------------------------------------
# SECURITY: credentials must never be hard-coded in a notebook or committed to
# version control. The key is read from the environment instead; any key that
# was previously embedded in this cell should be treated as leaked and revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Export it in the environment before "
        "running this notebook."
    )

PDF_PATH = "Regression.pdf"
INDEX_DIR = "faiss_index"

# --- Load and split the PDF --------------------------------------------------
pdf_loader = PyPDFLoader(PDF_PATH)
pdf_pages = pdf_loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_documents = text_splitter.split_documents(pdf_pages)

# Fail early with a clear message: building an index from zero chunks would
# otherwise surface as an obscure error inside the vector store.
if not split_documents:
    raise ValueError(f"No text could be extracted from {PDF_PATH!r}.")

# --- Initialize embeddings ---------------------------------------------------
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY,
)

# --- Create the FAISS vector store -------------------------------------------
# Index the Document objects directly (rather than bare strings) so that each
# chunk keeps the metadata attached by the loader alongside its text.
vectorstore = FAISS.from_documents(split_documents, embedding=embeddings)

# Save the vector store
vectorstore.save_local(INDEX_DIR)

# Load the vector store
# NOTE(security): loading a FAISS index unpickles data from disk, which can
# execute arbitrary code. This is acceptable here only because the index was
# written by this very cell; never enable the flag for an index obtained from
# an untrusted source.
vectorstore = FAISS.load_local(
    INDEX_DIR,
    embeddings,
    allow_dangerous_deserialization=True,
)

# Convert to retriever
retriever = vectorstore.as_retriever()

# --- Initialize the Gemini model ---------------------------------------------
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.7,
    google_api_key=GOOGLE_API_KEY,
)

# --- RetrievalQA chain -------------------------------------------------------
# chain_type="stuff" places all retrieved chunks into a single prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
)

question = "assumptions of regression?"
# `Chain.run` is deprecated; `invoke` takes the input under the chain's
# "query" key and returns a dict whose "result" entry is the answer text.
answer = qa_chain.invoke({"query": question})["result"]

In [31]:
# Re-ask the question through the RetrievalQA chain built in the cell above.
# `Chain.run` is deprecated; `invoke` takes the input under the chain's
# "query" key and returns a dict whose "result" entry is the answer text.
question = "assumptions of regression?"
answer = qa_chain.invoke({"query": question})["result"]


In [33]:
# Display the answer obtained from the RetrievalQA chain in the previous cell.
print(answer)

The assumptions of linear regression are:

1.  Linearity: The relationship between inputs (X) and the output (Y) is a straight line.
2.  Independence of Errors: The errors in predictions should not affect each other.
3.  Constant Variance (Homoscedasticity): The errors should have equal spread across all values of the input.
4.  Normality of Errors: The errors should follow a normal (bell-shaped) distribution.
5.  No Multicollinearity (for multiple regression): Input variables shouldn’t be too closely related to each other.
6.  No Autocorrelation: Errors shouldn't show repeating patterns, especially in time-based data.
7.  Additivity: The total effect on Y is just the sum of effects from each X, no mixing or interaction between them.
