In [233]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA, LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

from langchain.chains.question_answering import load_qa_chain
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch


In [234]:
pdf_file = 'data/Resume_Harmanpreet.pdf'

In [235]:
def read_pdf(file_path):
    pdf_reader = PdfReader(file_path)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


In [236]:
text = read_pdf(pdf_file)
print("Document Loaded Successfully!")

# Lowercase the text
text = text.lower()

Document Loaded Successfully!


In [237]:
# Split text into chunks
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

chunks = text_splitter.split_text(text)
print(f"Document split into {len(chunks)} chunks.")

Document split into 12 chunks.


In [238]:
# Generate Embeddings
## We use a pre-trained Sentence Transformer model to convert text chunks into numerical embeddings.

print("Generating embeddings...")
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

Generating embeddings...


In [239]:
# Create a Vector Store (FAISS)
## We store the embeddings in a FAISS vector store for efficient similarity search.

vectorstore = FAISS.from_texts(chunks, embeddings)
print("Embeddings and vector store created.", vectorstore)



Embeddings and vector store created. <langchain_community.vectorstores.faiss.FAISS object at 0x0000024F9C0F1E50>


In [240]:
# Load a Language Model (LLM)
## We load a pre-trained language model from HuggingFace. For demonstration purposes, we'll use a smaller model like gpt2 to ensure it runs smoothly on most machines.
print("Loading language model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
# model_id = 'gpt2'
model_id = 'google/flan-t5-large'

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# Set up the text generation pipeline with max_new_tokens to prevent errors
generation_pipeline = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=tokenizer,
    device=0 if device == 'cuda' else -1,
    max_new_tokens=150,  # Adjust as needed
)

llm = HuggingFacePipeline(pipeline=generation_pipeline)
print("Language model loaded.")


Loading language model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Language model loaded.


In [244]:
#  Set Up the RetrievalQA Chain
## We set up a RetrievalQA chain using LangChain. This chain will handle retrieving relevant documents and generating answers.

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# prompt_template = """Answer the following question based on the context below.

# Context:
# {context}

# Question:
# {question}

# Answer:"""

prompt_template = """You are an AI assistant tasked with answering the question based on the provided context.

Context:
{context}

Question:
{question}

Please provide a concise and accurate answer based solely on the context."""


prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = LLMChain(llm=llm, prompt=prompt)

In [245]:
qa_chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)
qa = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=qa_chain,
    return_source_documents=False
)

In [153]:
# Build the RetrievalQA chain
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="map_reduce",  # Using "map_reduce" to handle longer documents
#     retriever=retriever
# )

# qa = RetrievalQA(
#     retriever=retriever,
#     combine_documents_chain=llm_chain,
#     return_source_documents=False
# )

In [246]:
# # Ask Questions and Generate Answers
# def ask_question(query):
#     print("Generating answer...")

#     max_tokens = 1024
#     tokens = tokenizer(query, return_tensors="pt", max_length=max_tokens, truncation=True).input_ids

#     try:
#         answer = qa.run(query)
#         print("Answer:", answer)
#     except Exception as e:
#         print("Error:", e)

def ask_question(query):
    print(f"Question: {query}")
    print("Generating answer...")
    # Retrieve relevant documents
    retrieved_docs = retriever.get_relevant_documents(query)

    # Print the retrieved documents for debugging
    # print("\n--- Retrieved Documents ---")
    # # for doc in retrieved_docs:
    # #     print(doc.page_content)

    # Combine the retrieved documents into context
    context = "\n\n".join([doc.page_content for doc in retrieved_docs])

    # Determine the maximum input length
    max_input_tokens = tokenizer.model_max_length - 50  # Reserve tokens for the answer and special tokens

    # Tokenize context and question
    input_ids = tokenizer.encode(context + " " + query, return_tensors='pt').to(device)
    input_length = input_ids.shape[1]

    if input_length > max_input_tokens:
        # Truncate the context to fit within max_input_tokens
        print(f"Context is too long ({input_length} tokens), truncating...")
        # Calculate number of tokens to keep
        tokens_to_keep = max_input_tokens - len(tokenizer.encode(query, return_tensors='pt').to(device)[0])
        # Truncate context
        context_ids = tokenizer.encode(context, return_tensors='pt').to(device)[0][:tokens_to_keep]
        context = tokenizer.decode(context_ids, skip_special_tokens=True)
    
    # Prepare inputs for the chain
    inputs = {"context": context, "query": query}

    # Run the chain
    answer = qa.run(inputs)
    print("Answer:")
    print(answer)
    print("-" * 50)




In [247]:
queries = [
    "What is the university name?",
    "What programming languages am I proficient in?",
    "Describe my work experience related to machine learning.",
    "What are my educational qualifications?",
]

for query in queries:
    ask_question(query)

Question: What is the university name?
Generating answer...
Context is too long (711 tokens), truncating...
Answer:
Rutgers
--------------------------------------------------
Question: What programming languages am I proficient in?
Generating answer...
Context is too long (742 tokens), truncating...
Answer:
java, javascript, python, c++, groovy , html/css
--------------------------------------------------
Question: Describe my work experience related to machine learning.
Generating answer...
Context is too long (653 tokens), truncating...
Answer:
developed a language learning web application using next.js and fastapi , with aws services for storage and deployment.
--------------------------------------------------
Question: What are my educational qualifications?
Generating answer...
Context is too long (711 tokens), truncating...
Answer:
masters in computer science
--------------------------------------------------


In [227]:
query = "What is the university name?"


In [228]:
ask_question(query)



Token indices sequence length is longer than the specified maximum sequence length for this model (711 > 512). Running this sequence through the model will result in indexing errors


Question: What is the university name?
Generating answer...

--- Retrieved Documents ---
• comprehensive analysis of enrollment and performance trends over 5 years  in rutgers' foundational computer science courses 
using advanced data analytics and visualization tools.  
• research on gender -based disparities in computer science enrollment and performance, utilizing data science to promote equity 
and inclusion in higher education.  
 
 
academic projects  
 
insightwing: ai -driven web content summarizer         link 
• developed a chrome extension utilizing falconllm and langchain  for efficient 60 -word web content summarization . 
• user-friendly interface with html/css and javascript and integrated a chat feature for interactive content engagement . 
 
global socioeconomic patterns and risk factors in suicide trends       link 
• analyzed the impact of gdp on suicide rates globally using r, revealing key economic correlations . 
• examined age and gender factors affecting suicid

In [196]:
qa.run('what is rutgers')

'computer science'

In [21]:
# Retrieve relevant documents
retrieved_docs = retriever.get_relevant_documents(query)
print(f"Number of documents retrieved: {len(retrieved_docs)}")
for idx, doc in enumerate(retrieved_docs):
    print(f"\nDocument {idx+1} content:\n{doc.page_content}\n")


Number of documents retrieved: 3

Document 1 content:
• comprehensive analysis of enrollment and performance trends over 5 years  in rutgers' foundational computer science courses 
using advanced data analytics and visualization tools.  
• research on gender -based disparities in computer science enrollment and performance, utilizing data science to promote equity 
and inclusion in higher education.  
 
 
academic projects  
 
insightwing: ai -driven web content summarizer         link 
• developed a chrome extension utilizing falconllm and langchain  for efficient 60 -word web content summarization . 
• user-friendly interface with html/css and javascript and integrated a chat feature for interactive content engagement . 
 
global socioeconomic patterns and risk factors in suicide trends       link 
• analyzed the impact of gdp on suicide rates globally using r, revealing key economic correlations . 
• examined age and gender factors affecting suicide, providing insights through data 

  retrieved_docs = retriever.get_relevant_documents(query)


In [None]:
summarization_chain = load_summarize_chain(llm, chain_type="map_reduce")
summary = summarization_chain.run(retrieved_docs)

content = summary

In [None]:
queries = [
    "What programming languages is the person proficient in?",
    "Describe the work experience related to machine learning.",
    "What educational qualifications does the person have?",
]

for query in queries:
    print(f"Question: {query}")
    ask_question(query)
    print("-" * 50)

In [23]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Create a new prompt template that includes truncation logic
prompt_template = """Use the following context to answer the question.

{context}

Question: {question}
Answer:"""

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = LLMChain(llm=llm, prompt=prompt)

# Create a new RetrievalQA chain with the custom llm_chain
qa = RetrievalQA(
    retriever=retriever,
    combine_documents_chain=llm_chain,
)

# Now run the query
answer = qa.run(query)
print("Answer:")
print(answer)


AttributeError: 'LLMChain' object has no attribute 'get'