In [3]:
!pip install --upgrade transformers
!pip install flash_attn
!pip install pdfplumber
!pip install faiss-cpu
!pip install sentence_transformers
!pip install numpy
!pip install datasets
!pip install huggingface_hub
!pip install nltk

Collecting transformers
  Downloading transformers-4.44.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m589.4 kB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.0-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.4
    Uninstalling transformers-4.42.4:
      Successfully uninstalled transformers-4.42.4
Successfully installed transformers-4.44.0
Collecting flash_attn
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->flash_attn)
  Using cached nvidia_c

In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token in the notebook source. The token that
# used to be hardcoded in this cell must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable when it is set;
# otherwise the user is prompted and the input is not echoed.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import pdfplumber
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForCausalLM
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import nltk
from nltk import sent_tokenize
import os
import torch

In [4]:
# sent_tokenize needs the Punkt sentence-boundary models. Older NLTK releases
# load the "punkt" resource, newer ones look for "punkt_tab" instead; since the
# NLTK version is not pinned in this notebook, fetch both so the tokenizer
# works either way. nltk.download() skips resources that are already present.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource, quiet=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Prefer the GPU whenever torch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# A single BERT checkpoint supplies both the tokenizer (used for chunk sizing
# and for encoding) and the encoder that produces the embeddings.
EMBEDDING_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)
model = model.to(device)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a single PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined with newlines. Pages for which no text
        could be extracted (e.g. blank or image-only pages) are skipped, so
        the result is an empty string when nothing is extractable.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # extract_text() may yield None/"" for pages without a text layer;
            # concatenating None would raise a TypeError.
            if page_text:
                page_texts.append(page_text)

    # Join with a newline so the last word of one page is not fused with the
    # first word of the next one.
    return "\n".join(page_texts)

def chunk_text(text, chunk_size=128, overlap=50):
    """Split text into overlapping chunks of roughly ``chunk_size`` tokens.

    The text is split into sentences, each sentence is tokenized with the
    module-level ``tokenizer``, and whole sentences are accumulated until the
    next one would push the chunk past ``chunk_size`` tokens. A chunk is then
    emitted and its last ``overlap`` tokens are carried into the next chunk.

    Args:
        text: The text to split.
        chunk_size: Target maximum number of tokens per chunk. A single
            sentence longer than this is kept whole, so a chunk can exceed it.
        overlap: Number of trailing tokens repeated at the start of the next
            chunk. ``0`` disables the overlap.

    Returns:
        A list of chunk strings; empty when ``text`` contains no sentences.
    """
    sentences = sent_tokenize(text)
    chunks = []
    current_chunk = []

    for sentence in sentences:
        sentence_tokens = tokenizer.tokenize(sentence)

        # Only flush a non-empty chunk: an over-long first sentence must not
        # produce an empty chunk.
        if current_chunk and len(current_chunk) + len(sentence_tokens) > chunk_size:
            # Rebuild readable text from the tokens. Joining sub-word pieces
            # with spaces would leave "##" fragments that do not tokenize back
            # to the same tokens when the chunk is embedded later.
            chunks.append(tokenizer.convert_tokens_to_string(current_chunk))

            # Note that seq[-0:] is the whole list, so overlap == 0 needs an
            # explicit reset.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else []
            # Start the carried-over tokens on a word boundary rather than on
            # a "##" continuation piece.
            while current_chunk and current_chunk[0].startswith("##"):
                current_chunk.pop(0)

        current_chunk.extend(sentence_tokens)

    if current_chunk:
        chunks.append(tokenizer.convert_tokens_to_string(current_chunk))

    return chunks

def process_pdfs_in_directory(directory_path, chunk_size=128, overlap=20):
    """Extract and chunk the text of every PDF in a directory.

    Args:
        directory_path: Directory to scan (not recursive).
        chunk_size: Forwarded to ``chunk_text``.
        overlap: Forwarded to ``chunk_text``.

    Returns:
        One flat list with the chunks of all PDFs, in file-name order.
    """
    all_chunks = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore the vector-index positions) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory_path, filename)
        print(f"Processing {filename}...")

        text = extract_text_from_pdf(pdf_path)
        all_chunks.extend(chunk_text(text, chunk_size=chunk_size, overlap=overlap))

    return all_chunks

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# Folder holding the source PDFs. The original location is the default; set the
# PDF_DATA_DIR environment variable to point elsewhere without editing the cell.
directory_path = os.environ.get("PDF_DATA_DIR", "/Data/")
chunks = process_pdfs_in_directory(directory_path)

# Stop here with a clear message instead of failing later, when the embedding
# matrix is built from an empty list.
if not chunks:
    raise ValueError(f"No text chunks were extracted from PDFs in {directory_path!r}")

Processing ASEAN-Plan-of-Action-in-Combating-TC_Adopted-by-11th-AMMTC-on-20Sept17.pdf...
Processing AML_CFT (1).pdf...
Processing SOC-2018-web-2.pdf...
Processing National-Strategy-to-Counter-Illicit-Financev2-2.pdf...


In [None]:
def get_embedding(text):
    """Embed text with the module-level BERT ``tokenizer`` and ``model``.

    The embedding is the mean of the last-layer hidden states over the real
    (non-padding) tokens.

    Args:
        text: A string, or a list of strings to embed as one batch. Inputs
            longer than the tokenizer's maximum length are truncated.

    Returns:
        A NumPy array of shape ``(n_texts, hidden_size)``; ``(1, hidden_size)``
        for a single string.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # Inference only: skip autograd bookkeeping so no graph is kept in memory
    # for every embedded chunk.
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

        # Weight by the attention mask so padding positions (present when a
        # batch of unequal-length texts is passed) do not dilute the mean.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        embedding = (hidden_states * mask).sum(dim=1) / token_counts

    return embedding.detach().cpu().numpy()

# Embed each chunk individually; the result keeps the same order as `chunks`.
chunk_embeddings = list(map(get_embedding, chunks))

In [None]:
# Stack the per-chunk embeddings into a single (n_chunks, dim) matrix. FAISS
# expects contiguous float32 data, so convert once here.
chunk_embeddings_array = np.ascontiguousarray(np.vstack(chunk_embeddings), dtype="float32")

# Take the vector dimension from the data instead of hard-coding 768, so the
# index stays correct if the embedding model is swapped.
d = chunk_embeddings_array.shape[1]

# Exact (brute-force) L2 index; row i of the index corresponds to chunks[i].
index = faiss.IndexFlatL2(d)
index.add(chunk_embeddings_array)

def retrieve_chunks(query, k=5):
    """Return the chunks whose embeddings are closest to the query.

    Args:
        query: The question text.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` chunk strings, nearest first. Fewer are returned when the
        index holds fewer than ``k`` vectors; an empty list when it is empty.
    """
    # Embed the query with the same function used for the indexed chunks so
    # both live in the same vector space (and so device placement and the
    # tensor-to-NumPy conversion are handled the same way).
    query_embedding = np.asarray(get_embedding(query), dtype="float32").reshape(1, -1)

    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    _distances, neighbour_ids = index.search(query_embedding, k)

    # FAISS pads missing results with -1, which would silently select the last
    # chunk if used as a Python index.
    return [chunks[i] for i in neighbour_ids[0] if i >= 0]

In [None]:
# Load the instruction-tuned Gemma checkpoint and its tokenizer.
tokenizer_llm = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
model_llm = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it")

# Build the generation pipeline on the same device as the embedding model.
# Without `device` the pipeline keeps the model on the CPU even when a GPU is
# available.
text_generation_pipeline = pipeline(
    "text-generation",
    model=model_llm,
    tokenizer=tokenizer_llm,
    device=device,
)

def generate_answer(query, retrieved_chunks):
    """Answer a question with the LLM, grounded in the retrieved chunks.

    Args:
        query: The user question.
        retrieved_chunks: Context passages (strings), most relevant first.

    Returns:
        The generated answer text only; the prompt is not echoed back.
    """
    # Label every passage. str.join() only places its separator *between*
    # items, so using it as a label would leave the first passage unlabelled
    # and glued to the question.
    context = "\n\n".join(
        f"Context {position}: {chunk}"
        for position, chunk in enumerate(retrieved_chunks, start=1)
    )

    query_with_context = (
        "Answer the question using only the context below.\n\n"
        f"{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )

    # return_full_text=False makes the pipeline return just the continuation
    # instead of prompt + continuation.
    output = text_generation_pipeline(
        query_with_context,
        max_new_tokens=256,
        return_full_text=False,
    )

    return output[0]["generated_text"].strip()

# Demo: run a single question through retrieval and then generation.
query = (
    "What are the most effective countermeasures against money laundering "
    "that uses financial instruments?"
)
retrieved_chunks = retrieve_chunks(query, k=8)
answer = generate_answer(query=query, retrieved_chunks=retrieved_chunks)

print("Answer:", answer)