In [26]:
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM

In [27]:
# Load environment variables
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")

In [28]:
# Set environment variable to handle OpenMP duplicate library issue
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [29]:

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")



In [30]:
# Load and preprocess PDF
pdf_path = 'jira_issues.pdf'
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [32]:
# Text splitting configuration
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=300)
documents = text_splitter.split_documents(docs)

In [33]:
# Initialize embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=google_api_key)

In [34]:
# Create FAISS vector store
try:
    faiss_db = FAISS.from_documents(documents, embeddings)
    print("FAISS vector store created successfully.")
except Exception as e:
    print(f"Error creating FAISS vector store: {e}")

FAISS vector store created successfully.


In [35]:
# Create FAISS vector store
try:
    faiss_db = FAISS.from_documents(documents, embeddings)
    print("FAISS vector store created successfully.")
except Exception as e:
    print(f"Error creating FAISS vector store: {e}")

# Function to generate a response from the language model
def generate_response(prompt, max_length=500):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    
    # Set pad_token_id to eos_token_id if pad_token_id is not set
    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=pad_token_id
    )
    
    # Decode the output
    return tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

FAISS vector store created successfully.


In [47]:
import re

def extract_relevant_info(text, query):
    # Extract relevant information based on the query
    # This example assumes that queries contain specific keywords related to criteria
    keywords = re.findall(r'\b\w+\b', query.lower())
    # Simple regex to find matching content in the text
    pattern = '|'.join([f'\\b{keyword}\\b' for keyword in keywords])
    matches = re.findall(pattern, text.lower())
    return ' '.join(matches) if matches else "No relevant information found."



In [39]:
# Define and process query
query = "What are the acceptence criteria for the Sign n Language Detection?Explain"

In [48]:


try:
    # Perform similarity search with the query
    faiss_results = faiss_db.similarity_search(query)
    
    if faiss_results:
        # Get the first result and process it
        first_result = faiss_results[0]
        first_result_text = (first_result['page_content'] 
                             if isinstance(first_result, dict) and 'page_content' in first_result 
                             else str(first_result))
        
        print("FAISS Results:")
        print(first_result_text)
        print("======================================================")
        # Generate a response using the language model
        response_prompt = (
            f"You are the assitant \n\n"
             f"From the following text, provide a single sentence answering the question:\n\n"
            f"Question: {query}\n\n"
            f"Text:\n{first_result_text}\n\n"
            "Provide a concise, one-sentence response directly addressing the question."
        )
        response = generate_response(response_prompt)
        cleaned_response = extract_relevant_info(response, query)
        
        print("Generated Response:======================================================")
        print(response)
    else:
        print("FAISS similarity search returned no results.")
except Exception as e:
    print(f"Error during FAISS similarity search or response generation: {e}")

FAISS Results:
page_content='Issue Key: KAN-1
Summary: Acceptance Criteria for Sign Language Detection
Description: Acceptance Criteria for Sign Language Detection * The system accurately detects and
translates signs from images with at least 90% accuracy. * The user interface is intuitive and displays' metadata={'source': 'jira_issues.pdf', 'page': 0}
are the the the what are the acceptence criteria for the sign n language detection explain criteria for sign language detection criteria for sign language detection the the the are for the what are the acceptence criteria for the sign language detection the the the for sign for sign for sign the


In [49]:
print(response)

You are the assitant 

From the following text, provide a single sentence answering the question:

Question: What are the acceptence criteria for the Sign n Language Detection?Explain

Text:
page_content='Issue Key: KAN-1
Summary: Acceptance Criteria for Sign Language Detection
Description: Acceptance Criteria for Sign Language Detection * The system accurately detects and
translates signs from images with at least 90% accuracy. * The user interface is intuitive and displays' metadata={'source': 'jira_issues.pdf', 'page': 0}

Provide a concise, one-sentence response directly addressing the question.

A:

I think you are looking for the following:

What are the acceptence criteria for the Sign Language Detection?

The answer is:

The system accurately detects and translates signs from images with at least 90% accuracy.

The system is accurate, but it is not perfect. It is not perfect because it is not perfect for every sign. It is not perfect for every sign because it is not perfect for