# Fine-tuning using Microsoft's LoRA

git clone https://github.com/microsoft/LoRA.git
cd LoRA
pip install -e 

In [None]:
#from pdfreader import SimplePDFViewer
import os
import pdfplumber

def pdf_to_txt(pdf_file_path, txt_file_path):
    # Open the PDF file
    with pdfplumber.open(pdf_file_path) as pdf:
        # Initialize an empty string to store the extracted text
        extracted_text = ''

        # Loop over each page in the PDF
        for page in pdf.pages:
            # Extract the text from the page
            page_text = page.extract_text()

            # Append the page text to the extracted text
            extracted_text += ' ' + page_text

    # Write the extracted text to a text file
    with open(txt_file_path, 'w') as txt_file:
        txt_file.write(extracted_text)


#This function does the work but the result is difficult to tokenize
#def convert_pdf_to_txt(file_path):
    # Open the PDF file in read-binary mode
#    with open(file_path, 'rb') as file:
        # Create a PDF file viewer object
#        viewer = SimplePDFViewer(file)

        # Initialize an empty string to hold the extracted text
#        text = ''

        # Loop through each page in the PDF and extract the text
#       for canvas in viewer:
#            viewer.render()
#            text += ''.join(viewer.canvas.strings)

    # Create the Texts directory if it doesn't exist
#    if not os.path.exists('Texts'):
#        os.makedirs('Texts')

    # Write the extracted text to a .txt file in the Texts directory
#    with open('Texts/output1.txt', 'w') as output_file:
#        output_file.write(text)

# Call the function with the path to your PDF file
#convert_pdf_to_txt("PDFs/WUDC_manual.pdf")
pdf_to_txt("PDFs/WUDC_manual.pdf", "Texts/output2.txt")

In [3]:
#Read the resulting text file
with open("Texts/output2.txt", "r") as file:
    text = file.read()
    print(text)

 Debating & Judging Manual
Debating & Judging Manual
A note about the authorship of this manual
The World Universities Debating Championships (‘WUDC’ or ‘Worlds’) Debating and Judging Manual
was initially compiled in advance of the 35th World Championships in Malaysia. Before that, in the
three and a half decade history of Worlds, there had not been a single authoritative document,
beyond the WUDC Constitution, which specified how debating and judging takes place at the World
Championships.
This Manual is the product of the time, work, wisdom and effort of many adjudication cores and debating
intellectuals and academics.
The Korea WUDC 2021 Adjudication Core (Bobbi Leet, Boemo Phirinyane, Connor O’Brien, Dan Lahav,
Milos Marjanovic, Mubarrat Wassey, Sebastian Dasso, Sooyoung Park, Tejal Patwardhan, Teck Wei Tan) has
worked together with the Belgrade WUDC 2022 Adjudication Core (Brent Schmidt, Enting Lee, Hadar
Goldberg, Juanita Hincapie Restrepo, Milos Marjanovic, Noluthando Honono, Ro

In [None]:
from sklearn.model_selection import train_test_split

# Load the data
with open('Texts/output2.txt', 'r') as f:
    sentences = f.read().splitlines()

# Split the data into train and validation sets
train_sentences, val_sentences = train_test_split(sentences, test_size=0.2, random_state=42)

len(train_sentences), len(val_sentences)

In [None]:
import torch
from transformers import RobertaTokenizerFast

# Initialize the tokenizer with a pretrained RoBERTa model
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

def preprocess(sentences):
    # Tokenize the sentences
    input_ids = []
    attention_masks = []

    for sentence in sentences:
        # `encode_plus` will:
        #   (1) Tokenize the sentence
        #   (2) Prepend the `[CLS]` token to the start and append the `[SEP]` token to the end
        #   (3) Map tokens to their IDs
        #   (4) Create the attention mask
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Return a dictionary of outputs
        encoded_dict = tokenizer.encode_plus(
            sentence,                      # Sentence to encode
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
            max_length=64,                 # Pad & truncate all sentences
            pad_to_max_length=True,
            return_attention_mask=True,    # Construct attention masks
            return_tensors='pt',           # Return pytorch tensors
        )
        
        # Add the encoded sentence to the list
        input_ids.append(encoded_dict['input_ids'])
        
        # And its attention mask (simply differentiates padding from non-padding)
        attention_masks.append(encoded_dict['attention_mask'])

    # Convert the lists into tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    return input_ids, attention_masks

# Apply the preprocessing to the training and validation sentences
train_input_ids, train_attention_masks = preprocess(train_sentences)
val_input_ids, val_attention_masks = preprocess(val_sentences)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert the lists into datasets
train_dataset = TensorDataset(train_input_ids, train_attention_masks)
val_dataset = TensorDataset(val_input_ids, val_attention_masks)

# Create the DataLoaders
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset), # Random sampler for training
    batch_size=32
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset), # Sequential sampler for validation
    batch_size=32
)

In [None]:
from transformers import RobertaConfig, RobertaModel
from loralib import Embedding, Linear

# Load the pretrained RoBERTa model
config = RobertaConfig.from_pretrained('roberta-base')
model = RobertaModel(config)

# Replace the embedding layer with a LoRA embedding layer
model.roberta.embeddings.word_embeddings = Embedding(
    config.vocab_size, config.hidden_size, r=32, lora_alpha=1
)

# Replace the pooler layer with a LoRA linear layer
model.roberta.pooler.dense = Linear(
    config.hidden_size, config.hidden_size, r=32, lora_alpha=1
)


In [2]:
import os
from pdf2image import convert_from_path

def convert_pdf_to_images(pdf_file_path, output_folder, base_name):
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Convert the PDF to a list of images
    images = convert_from_path(pdf_file_path)
    
    # Save each image to the output folder
    for i, image in enumerate(images, start=1):
        image.save(os.path.join(output_folder, f"{base_name}_{i}.jpg"), 'JPEG')

# Use the function
convert_pdf_to_images("PDFs/WUDC_manual.pdf", "JPEGs", "Manual")


In [5]:
from transformers import pipeline

# Initialize the pipeline
document_answering_model = pipeline(task='question-answering', model='deepset/roberta-base-squad2')

# Ask a question
# Text defined above
question = "What is the meaning of life?"
answer = document_answering_model(question=question, context=text)

print(answer)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

{'score': 0.02243351750075817, 'start': 66656, 'end': 66685, 'answer': 'save the most number of lives'}


In [7]:
question = "What are the type of motions and how do they work?"
answer = document_answering_model(question=question, context=text)

print(answer)

{'score': 0.4675463140010834, 'start': 72179, 'end': 72201, 'answer': 'value judgment debates'}


In [10]:
import os
import pytesseract
from transformers import pipeline

# Define the directory
directory = "JPEGs"

# List all files in the directory
files = os.listdir(directory)

# Filter for .jpg files and create full paths
image_paths = [os.path.join(directory, file) for file in files if file.endswith(".jpg")]

print(image_paths)

['JPEGs/Manual_30.jpg', 'JPEGs/Manual_24.jpg', 'JPEGs/Manual_18.jpg', 'JPEGs/Manual_19.jpg', 'JPEGs/Manual_25.jpg', 'JPEGs/Manual_31.jpg', 'JPEGs/Manual_27.jpg', 'JPEGs/Manual_33.jpg', 'JPEGs/Manual_32.jpg', 'JPEGs/Manual_26.jpg', 'JPEGs/Manual_22.jpg', 'JPEGs/Manual_36.jpg', 'JPEGs/Manual_37.jpg', 'JPEGs/Manual_23.jpg', 'JPEGs/Manual_35.jpg', 'JPEGs/Manual_8.jpg', 'JPEGs/Manual_21.jpg', 'JPEGs/Manual_9.jpg', 'JPEGs/Manual_20.jpg', 'JPEGs/Manual_34.jpg', 'JPEGs/Manual_53.jpg', 'JPEGs/Manual_47.jpg', 'JPEGs/Manual_46.jpg', 'JPEGs/Manual_52.jpg', 'JPEGs/Manual_44.jpg', 'JPEGs/Manual_50.jpg', 'JPEGs/Manual_51.jpg', 'JPEGs/Manual_45.jpg', 'JPEGs/Manual_41.jpg', 'JPEGs/Manual_55.jpg', 'JPEGs/Manual_54.jpg', 'JPEGs/Manual_40.jpg', 'JPEGs/Manual_56.jpg', 'JPEGs/Manual_42.jpg', 'JPEGs/Manual_43.jpg', 'JPEGs/Manual_48.jpg', 'JPEGs/Manual_49.jpg', 'JPEGs/Manual_11.jpg', 'JPEGs/Manual_4.jpg', 'JPEGs/Manual_39.jpg', 'JPEGs/Manual_38.jpg', 'JPEGs/Manual_5.jpg', 'JPEGs/Manual_10.jpg', 'JPEGs/Manual_

In [12]:

# Initialize the pipeline
document_answering_model = pipeline(task='document-question-answering', model='impira/layoutlm-document-qa')

# Iterate over the images, run OCR, and use the QA model
for image_path in image_paths:
    text = pytesseract.image_to_string(image_path)
    question = "What is the meaning of life?"
    answer = document_answering_model(question=question, context=text, image=image_path)

    # Print the best answer if its score is above 0.75
    if answer[0]['score'] > 0.75:
        print(answer[0])


{'score': 0.9208416938781738, 'answer': 'Iron-personing', 'start': 34, 'end': 34}
{'score': 0.9828732013702393, 'answer': 'Debating', 'start': 9, 'end': 9}


In [13]:
# Iterate over the images, run OCR, and use the QA model
for image_path in image_paths:
    text = pytesseract.image_to_string(image_path)
    question = "How does the WUDC debate format work?"
    answer = document_answering_model(question=question, context=text, image=image_path)

    # Print the best answer if its score is above 0.75
    if answer[0]['score'] > 0.75:
        print(answer[0])

{'score': 0.9556881189346313, 'answer': 'Extending the Debate', 'start': 8, 'end': 10}
{'score': 0.85063636302948, 'answer': 'Debating and Judging', 'start': 5, 'end': 7}
