In [8]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.


### Method 1:(PDF Text Processing and RAG Model Training Pipeline using Transformers)

In [1]:
import os
import string
import nltk
from pdfplumber import open as open_pdf
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define the input PDF file path
pdf_path = r'C:\Users\DELL\Downloads\awsgsg-intro.pdf'

# Preprocessing functions
def extract_text_from_pdf(pdf_path):
    with open_pdf(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def preprocess_text(text):
    # Tokenization
    tokenizer = nltk.word_tokenize(text.lower())

    # Cleaning
    table = str.maketrans("", "", string.punctuation)
    tokenizer = [word.translate(table) for word in tokenizer if word.isalnum()]

    # Normalization
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    tokenizer = [lemmatizer.lemmatize(word) for word in tokenizer if word not in stop_words]

    # Conversion to a format suitable for the chosen pretrained model
    # In this example, we will use the Hugging Face tokenizer for preprocessing
    tokenizer = " ".join(tokenizer)
    return tokenizer

def train_rag_model(preprocessed_pdf_text):
    model_name = "t5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Tokenize the preprocessed text
    inputs = tokenizer(preprocessed_pdf_text, return_tensors="pt", truncation=True, padding=True)

    # Create a dummy label since the Trainer expects labels
    # The actual labels are not used during training as RAG is a generative model
    labels = inputs.input_ids.clone()

    # Define a PyTorch dataset
    class PDFDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]
            return item

        def __len__(self):
            return len(self.encodings.input_ids)

    # Create an instance of the dataset
    train_dataset = PDFDataset(inputs, labels)

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        logging_steps=50,
        save_steps=50,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
    )

    # Pass the dataset instance to the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    return model


In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [6]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


### Method 2 :PDF Text Extraction and RAG Model Training with T5 Transformers

In [2]:
import string
import nltk
from pdfplumber import open as open_pdf
from transformers import T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, Trainer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch

# Define the input PDF file path
PDF_PATH = r'C:\Users\DELL\Downloads\awsgsg-intro.pdf'

# Preprocessing functions
def extract_text_from_pdf(pdf_path):
    with open_pdf(pdf_path) as pdf:
        text = "".join([page.extract_text() for page in pdf.pages])
    return text

def preprocess_text(text):
    # Tokenization
    tokenizer = nltk.RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text.lower())

    # Removing stopwords and punctuation
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back to text
    preprocessed_text = " ".join(tokens)
    return preprocessed_text

def train_rag_model(preprocessed_pdf_text):
    model_name = "t5-small"
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Tokenize the preprocessed text
    inputs = tokenizer(preprocessed_pdf_text, return_tensors="pt", padding=True, truncation=True)

    # Create a dummy label since the Trainer expects labels
    labels = inputs.input_ids.clone()

    # Define a PyTorch dataset
    class PDFDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]
            return item

        def __len__(self):
            return len(self.encodings.input_ids)

    # Create an instance of the dataset
    train_dataset = PDFDataset(inputs, labels)

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        logging_steps=50,
        save_steps=50,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
    )

    # Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )

    # Train the model
    trainer.train()

    return model

# Extract text from PDF
pdf_text = extract_text_from_pdf(PDF_PATH)

# Preprocess the text
preprocessed_text = preprocess_text(pdf_text)

# Train RAG model
trained_model = train_rag_model(preprocessed_text)


***** Running training *****
  Num examples = 1
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3
  Number of trainable parameters = 60506624
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


