In [1]:
import re
import os
# Annual-report PDF file names, in the order the ingestion loop walks them.
filenames = [
    'AR22.pdf',
    'AR21.pdf',
    'AR20.pdf',
    'AR19.pdf',
    'AR18-19.pdf',
    'AR17-18.pdf',
    'AR16-17.pdf',
    'AR15-16.pdf',
    'AR14-15.pdf',
    'AR13-14.pdf',
]

In [3]:
# Embedding the chunks

from transformers import BertModel, BertTokenizer
import torch

# Load the pre-trained BERT encoder and its matching tokenizer once, at module
# level. The functions defined in this notebook read `tokenizer` and `model`
# as globals, so this cell must run before any of them is called.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Encode input text
# text = "This is an example sentence."

def embed(text, nums):
    """Return a mean-pooled embedding of ``text`` from the global BERT model.

    :param text: A string, or a list of strings, to embed. The tokenizer is
        called with ``truncation=True`` and ``padding=True``.
    :param nums: A list that is mutated in place: the tokenized sequence
        length of this call is appended so callers can track token counts.
    :return: The mean of the last-layer token embeddings with size-1
        dimensions squeezed away, i.e. a 1-D tensor for a single string and a
        ``(batch, hidden_size)`` tensor for several strings.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    nums.append(inputs['input_ids'].size(1))  # sequence length after tokenization

    # Inference only: no gradients are needed for embeddings.
    with torch.no_grad():
        outputs = model(**inputs)
    # outputs[0] is the last hidden state: (batch_size, sequence_length, hidden_size)
    token_embeddings = outputs[0]

    # Average over real tokens only. A plain mean over the sequence axis would
    # also average in padding positions when several texts of different
    # lengths are embedded together, skewing the shorter texts' embeddings.
    mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    mean_embedding = summed / counts

    return mean_embedding.squeeze()

def truncate_text_to_max_tokens(text, max_tokens=512):
    """Cut ``text`` down to at most ``max_tokens`` tokenizer tokens.

    The text is tokenized with the global ``tokenizer`` using
    ``max_length=max_tokens`` and ``truncation=True``, then decoded back to a
    string.

    :param text: The text to shorten.
    :param max_tokens: Value passed as the tokenizer's ``max_length``.
    :return: The decoded, truncated text without special-token markers.
    """
    print("truncate")  # trace output kept from the original implementation
    inputs = tokenizer(text, return_tensors='pt', max_length=max_tokens, truncation=True)
    # skip_special_tokens keeps markers such as [CLS]/[SEP] out of the returned
    # text; otherwise they would be re-tokenized as ordinary content later.
    return tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)

In [4]:
import fitz
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK 'punkt' tokenizer data; presumably required by word_tokenize
# below — TODO confirm the resource name for the installed NLTK version.
nltk.download('punkt')

def chunk_text_with_sentences(preprocessed_text, max_tokens=105):
    """Split text into space-joined chunks of ``max_tokens`` word tokens.

    The text is tokenized with ``word_tokenize``; a chunk is closed as soon as
    it holds at least ``max_tokens`` tokens, and any leftover tokens form a
    final, shorter chunk.

    :param preprocessed_text: The text to split.
    :param max_tokens: Number of tokens at which a chunk is closed.
    :return: List of chunk strings (empty when the text has no tokens).
    """
    words = word_tokenize(preprocessed_text)
    pieces = []
    chunk_start = 0

    # Walk the end position of the growing chunk instead of copying tokens
    # one by one into a buffer.
    for chunk_end in range(1, len(words) + 1):
        if chunk_end - chunk_start >= max_tokens:
            pieces.append(" ".join(words[chunk_start:chunk_end]))
            chunk_start = chunk_end

    # Whatever is left after the last full chunk
    if chunk_start < len(words):
        pieces.append(" ".join(words[chunk_start:]))

    return pieces

def extract_and_preprocess_from_pdf(pdf_path):
    """Read a PDF, normalise its text and split it into chunks.

    All page texts are concatenated, every character that is not a word
    character or whitespace is removed, runs of whitespace are collapsed to a
    single space, and the result is passed to ``chunk_text_with_sentences``.

    :param pdf_path: Path of the PDF file to read.
    :return: The list of text chunks. If anything fails, the error is printed
        and an empty list is returned, so callers can always iterate or
        ``extend`` with the result.
    """
    try:
        # Open the PDF file
        doc = fitz.open(pdf_path)
        try:
            # Extract text from each page; join once instead of repeated +=
            text = "".join(page.get_text() for page in doc)
        finally:
            # Release the file handle even if a page fails to extract.
            doc.close()

        # Remove unwanted characters
        text = re.sub(r'[^\w\s\d]', '', text)
        # Remove extra spaces and newlines
        text = re.sub(r'\s+', ' ', text).strip()

        return chunk_text_with_sentences(text)

    except Exception as e:
        # Best effort: report the failure and hand back "no chunks" rather
        # than an implicit None.
        print(f"Failed to extract text from {pdf_path}: {e}")
        return []

# NOTE(review): absolute, machine-specific data directory — adjust before
# running on another machine.
file_path = "C:\\Users\\DELL\\OneDrive\\Desktop\\LLM1\\webs\\data"
allembeddings = []  # one embedding tensor per chunk, parallel to text_chunks
text_chunks = []    # chunk strings, in the same order as allembeddings
nums = []           # token count of every embedded chunk (filled by embed)
for pdf_name in filenames:
    # A failed extraction may yield None; treat that as "no chunks" so the
    # extend() below does not raise.
    chunks = extract_and_preprocess_from_pdf(os.path.join(file_path, pdf_name)) or []
    text_chunks.extend(chunks)
    # A distinct loop name avoids clobbering the outer loop variable.
    for chunk in chunks:
        allembeddings.append(embed(chunk, nums))
    # NOTE(review): this break stops after the first file, so only
    # filenames[0] is indexed. Remove it to index every report.
    break

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Largest token count seen while embedding (less than 350), then the number of
# chunks and the number of embeddings — one value per line.
print(max(nums), len(text_chunks), len(allembeddings), sep="\n")

328
719
719


In [6]:
# First three embeddings (each one is 1-dimensional), followed on the next
# line by the first three text chunks.
print(allembeddings[:3], text_chunks[:3], sep="\n")

[tensor([-9.9691e-02,  1.1687e-01,  4.4535e-02, -2.7082e-01,  3.0088e-01,
        -1.2663e-01,  2.8675e-01,  1.6831e-01, -1.5807e-01,  4.7651e-02,
         1.0723e-01, -2.8799e-01, -1.4011e-02,  3.2013e-01,  1.6453e-01,
         3.6302e-01,  3.1555e-01, -4.0666e-02, -1.8513e-01,  2.5246e-01,
         1.7306e-01,  1.0341e-01,  8.2670e-02,  5.2068e-01,  2.7486e-01,
         1.8743e-01,  3.3532e-01,  5.8422e-03, -7.0791e-02, -1.1902e-01,
         2.6550e-01, -1.2171e-01, -1.1634e-01, -9.3683e-03,  2.6734e-01,
        -1.8235e-01, -1.7221e-01, -6.9656e-02, -9.3757e-02, -3.9746e-03,
        -2.5641e-01, -3.6629e-01, -1.1712e-01,  1.7169e-01, -1.4471e-01,
        -2.0336e-01,  7.8742e-02,  1.7001e-01,  1.9335e-01, -3.2351e-02,
        -1.2870e-01, -2.7809e-02,  1.5384e-02, -2.4317e-01, -4.0297e-02,
         3.3449e-01, -1.1217e-01, -2.0379e-01, -2.0404e-01, -7.6507e-02,
         4.0112e-01, -8.1121e-03, -9.9733e-02, -1.5715e-01, -2.9636e-02,
         1.6436e-01,  1.9232e-01,  4.8833e-01, -4.

In [8]:
# storage

import pickle
# import fitz

# Persist the embeddings and the chunk texts to separate pickle files, in that
# order; the two lists share the same indexing.
for pickle_name, payload in (('allembeddings.pkl', allembeddings),
                             ('text_chunks.pkl', text_chunks)):
    with open(pickle_name, 'wb') as sink:
        pickle.dump(payload, sink)

In [9]:
# import faiss
import pickle
from scipy.spatial.distance import cosine

# Function to find the nearest embedding to the query
def find_nearest_embeddings(query_embedding, embeddings,top_k=5):
    """Rank ``embeddings`` by cosine distance to ``query_embedding``.

    :param query_embedding: The vector to compare against.
    :param embeddings: Iterable of candidate vectors.
    :param top_k: How many of the closest candidates to return.
    :return: Tuple ``(ids, vectors)`` holding the positions of the ``top_k``
        closest candidates and the candidates themselves, nearest first.
        Candidates at equal distance keep their original relative order.
    """
    candidates = list(embeddings)
    gaps = [cosine(query_embedding, candidate) for candidate in candidates]

    # sorted() is stable, so ties are resolved by original position.
    ranking = sorted(range(len(candidates)), key=gaps.__getitem__)[:top_k]

    return (ranking, [candidates[position] for position in ranking])

def query(query_embedding, top_k=5):
    """Find the stored embeddings closest to ``query_embedding``.

    The embeddings are read from ``allembeddings.pkl`` on every call and
    ranked with ``find_nearest_embeddings``.

    :param query_embedding: The vector to search with.
    :param top_k: Number of nearest neighbours to return.
    :return: Whatever ``find_nearest_embeddings`` returns for the loaded
        embeddings, i.e. ``(nearest_ids, nearest_embeddings)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a file that this
    # notebook itself wrote.
    with open('allembeddings.pkl', 'rb') as f:
        loaded_allembeddings = pickle.load(f)
    # Forward top_k — previously the argument was accepted but ignored, so
    # callers always got the callee's default number of results.
    return find_nearest_embeddings(query_embedding, loaded_allembeddings, top_k=top_k)

# Run one retrieval: embed the question with embed() and look up the ids of
# the closest stored chunk embeddings via query(). `query_text` and
# `nearest_ids` are read as globals by later cells.
query_text = "Who is the current director of ICAR-CRIDA, and who were the members of the editorial committee for the 2022 annual report?"
# embed() appends the question's token count to this list.
query_nums = []
query_embedding = embed(query_text, query_nums)
nearest_ids, nearest_embeddings = query(query_embedding)
print(nearest_ids)

[649, 126, 527, 650, 647]


In [11]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Load a pre-trained BERT model (and its tokenizer) for question answering.
# Checkpoint names noted by the author:
#   bert-large-uncased-whole-word-masking-finetuned-squad
#   bert-base-uncased
# `qa_tokenizer` and `qa_model` are module-level globals read by the QA
# function defined below.
qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
qa_tokenizer = BertTokenizer.from_pretrained(qa_model_name)
qa_model = BertForQuestionAnswering.from_pretrained(qa_model_name)

def generate_answer(query, context):
    """Extract an answer span for ``query`` from ``context`` with the QA model.

    The input is laid out as ``[CLS] query [SEP] context [SEP]``; the context
    is truncated so that query plus context stay within 500 tokens. The span
    between the highest-scoring start and end positions is decoded and
    returned.

    :param query: The question text.
    :param context: The passage to search for the answer.
    :return: The decoded answer span; an empty string when the predicted end
        position lies before the predicted start position.
    """
    # Tokenize text chunk and query
    query_tokens = qa_tokenizer.encode(query, add_special_tokens=False)
    text_chunk_tokens = qa_tokenizer.encode(
        context,
        add_special_tokens=False,
        truncation=True,
        max_length=500 - len(query_tokens),
    )

    # Combine text and query tokens with special tokens. Every special token
    # comes from qa_tokenizer so this function does not depend on the
    # unrelated embedding tokenizer being defined.
    input_tokens = (
        [qa_tokenizer.cls_token_id]
        + query_tokens
        + [qa_tokenizer.sep_token_id]
        + text_chunk_tokens
        + [qa_tokenizer.sep_token_id]
    )
    # Segment ids: 0 for "[CLS] query [SEP]", 1 for "context [SEP]". Without
    # them the model cannot tell the question from the passage.
    segment_ids = [0] * (len(query_tokens) + 2) + [1] * (len(text_chunk_tokens) + 1)
    print(len(input_tokens))

    # Convert tokens to tensors (batch size 1)
    input_ids = torch.tensor(input_tokens).unsqueeze(0)
    token_type_ids = torch.tensor(segment_ids).unsqueeze(0)

    # Perform inference; no gradients are needed
    with torch.no_grad():
        outputs = qa_model(input_ids, token_type_ids=token_type_ids)

    # Find the answer span with the highest start / end scores
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))
    if end_index < start_index:
        # No valid span was predicted.
        return ""

    # Get the answer tokens and convert them to string
    answer_tokens = input_tokens[start_index:end_index + 1]
    return qa_tokenizer.decode(answer_tokens)

def query_and_answer(text):
    """Answer ``text`` using the two best-ranked retrieved chunks as context.

    Chunk texts are loaded from ``text_chunks.pkl`` and selected with the
    module-level ``nearest_ids``; the joined context is printed and passed to
    ``generate_answer`` together with the question.

    :param text: The question to answer.
    :return: The value returned by ``generate_answer(text, context)``.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a file that this
    # notebook itself wrote.
    with open('text_chunks.pkl','rb') as f:
        loaded_token_chunks = pickle.load(f)

    # Only the two closest chunks form the context. Slicing (instead of
    # indexing [0] and [1]) keeps this working when fewer than two ids were
    # retrieved.
    top_chunks = [loaded_token_chunks[i] for i in nearest_ids[:2]]
    context = " ".join(top_chunks)
    print(context)

    return generate_answer(text, context)

# Answer the question stored in `query_text` and show the result.
answer = query_and_answer(query_text)
print("Answer:", answer)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


and members of RAC interacted affirmatively on various research issues raised by the scientists Some of the recommendations that emerged out of the deliberations include the Development of protocols for Natural Farming in rainfed ecologies the Development of the Agroclimatic atlas of India the Development of crop cafeteria and the Assessment of GHGs and carbon footprints under integrated farming systems 122 Institute Research Council IRC The IRC meeting was held on 23rd May 4th 7th 8th 11th and 13th July 2022 Project Coordinators Heads of DivisionsSectionsUnits and institute Scientists participated in the meeting The meeting was chaired by Dr V extends online access to foreign journals through a subscription of Agroforestry Abstracts CAB International Under the National Agricultural Innovation Project NAIP ICAR established a Consortium for eResources in ICARCRIDA Annual Report 2022 8 Agriculture CeRA to access 2000 plus scholarly peer reviewed journals from the renowned publishers in t

In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the NLTK 'punkt' data; presumably required by sent_tokenize used in
# this cell — TODO confirm the resource name for the installed NLTK version.
nltk.download('punkt')

def generate_answer(query: str, context: str, model_name: str = 'gpt2', max_length: int = 1024) -> str:
    """
    Generate an answer based on the query and context using a GPT-2 model.

    The prompt is ``"Context: ...\\n\\nQuery: ...\\n\\nAnswer:"``. Only the text
    generated after the prompt is kept, reduced to its first line, and a
    trailing incomplete sentence is dropped when at least one complete
    sentence remains.

    :param query: The query to be answered.
    :param context: The context in which to find the answer.
    :param model_name: The name of the GPT-2 model to use (default is 'gpt2').
    :param max_length: Passed to ``generate`` as ``max_length``.
        NOTE(review): for ``generate`` this presumably caps prompt plus
        generated tokens, not the answer alone — confirm against the
        installed transformers version.
    :return: The generated answer as a string.
    """

    # Load the tokenizer and model (this happens on every call)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Prepare the input text
    input_text = f"Context: {context}\n\nQuery: {query}\n\nAnswer:"

    # Encode the input text; calling the tokenizer also yields the attention
    # mask, which generate() needs because the pad token equals the EOS token.
    encoded = tokenizer(input_text, return_tensors='pt')
    input_ids = encoded['input_ids']
    prompt_length = input_ids.shape[1]

    # Generate the output
    output_ids = model.generate(input_ids,
                                attention_mask=encoded['attention_mask'],
                                max_length=max_length,
                                num_return_sequences=1,
                                no_repeat_ngram_size=2,
                                temperature=0.5,
                                top_p=0.9,
                                do_sample=True,
                                pad_token_id=tokenizer.eos_token_id)

    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" picks the wrong segment whenever the context or the query
    # itself contains that marker.
    generated_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    # Keep the first line of the generated answer
    answer = generated_text.strip().split("\n")[0]

    # Tokenize the answer into sentences
    sentences = sent_tokenize(answer)

    # Reconstruct the answer without the last incomplete sentence
    complete_answer = ' '.join(sentences[:-1]) if not answer.endswith('.') else answer

    print("complete answer: ", complete_answer)
    # Return the trimmed answer (it used to be computed and then discarded).
    # Fall back to the raw first line when trimming would leave nothing, e.g.
    # a single unfinished sentence.
    return complete_answer or answer


def get_context(nearest_ids):
    """Build the context string for the given chunk ids.

    Chunk texts are read from ``text_chunks.pkl``; the chunks at the given
    positions are joined with single spaces in the order given, printed, and
    returned.

    :param nearest_ids: Positions of the chunks to include.
    :return: The joined context string.
    """
    with open('text_chunks.pkl', 'rb') as chunk_file:
        stored_chunks = pickle.load(chunk_file)

    selected = []
    for chunk_id in nearest_ids:
        selected.append(stored_chunks[chunk_id])
    context = " ".join(selected)

    print("context: ", context)
    return context

# Build the context from the retrieved chunk ids and generate an answer.
context = get_context(nearest_ids)
# Pass the question string. `query` is the retrieval function defined earlier,
# so passing it would put a function repr into the prompt instead of the
# question.
answer = generate_answer(query_text, context)
print("answer: ", answer)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


context:  and members of RAC interacted affirmatively on various research issues raised by the scientists Some of the recommendations that emerged out of the deliberations include the Development of protocols for Natural Farming in rainfed ecologies the Development of the Agroclimatic atlas of India the Development of crop cafeteria and the Assessment of GHGs and carbon footprints under integrated farming systems 122 Institute Research Council IRC The IRC meeting was held on 23rd May 4th 7th 8th 11th and 13th July 2022 Project Coordinators Heads of DivisionsSectionsUnits and institute Scientists participated in the meeting The meeting was chaired by Dr V extends online access to foreign journals through a subscription of Agroforestry Abstracts CAB International Under the National Agricultural Innovation Project NAIP ICAR established a Consortium for eResources in ICARCRIDA Annual Report 2022 8 Agriculture CeRA to access 2000 plus scholarly peer reviewed journals from the renowned publi