In [1]:
!pip install transformers datasets torch safetensors pypdf



In [2]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

pdf_path = "/content/LABOUR_ACT.pdf"  # Update with your actual file path
dataset_text = extract_text_from_pdf(pdf_path)

print(dataset_text[:1000])  # Print first 1000 characters to check output





In [13]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Fix padding issue

tokenized_data = tokenizer(
    dataset_text,
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
)

print(tokenized_data["input_ids"].shape)  # Check if tokens are generated


torch.Size([1, 512])


In [14]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
        }

tokenized_dataset = TextDataset(tokenized_data)
print(len(tokenized_dataset))  # Should be > 0


1


In [5]:
from transformers import GPT2LMHeadModel

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Resize the token embeddings because we added a PAD token
model.resize_token_embeddings(len(tokenizer))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Model loaded and ready for training on", device)


Model loaded and ready for training on cuda


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",  # Directory to save the model
    overwrite_output_dir=True,  # Overwrite previous training results
    num_train_epochs=3,  # Number of training epochs (you can increase if needed)
    per_device_train_batch_size=4,  # Batch size for training
    save_steps=500,  # Save the model every 500 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
    prediction_loss_only=True,
    logging_dir="./logs",  # Directory for logging training progress
)

print("Training arguments set!")


Training arguments set!


In [7]:
# Tokenizing the dataset properly
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer

# Define padding token
tokenizer.pad_token = tokenizer.eos_token

class TextDataset(Dataset):
    def __init__(self, text_list, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.input_ids = []

        for text in text_list:
            encoding = tokenizer(text, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")
            self.input_ids.append(encoding["input_ids"].squeeze(0))  # Remove batch dimension

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {"input_ids": self.input_ids[idx]}

# Ensure dataset_text is a list (if it's a single string, wrap it in a list)
if isinstance(dataset_text, str):
    dataset_text = [dataset_text]

# Create tokenized dataset
tokenized_dataset = TextDataset(dataset_text, tokenizer)

print("Dataset tokenized and ready for training!")


Dataset tokenized and ready for training!


In [8]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,  # Reduce if memory issues
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    evaluation_strategy="no",
    save_total_limit=1,
    report_to="none",  # Disable W&B if not needed
)




In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()


Step,Training Loss


TrainOutput(global_step=3, training_loss=0.05162122845649719, metrics={'train_runtime': 7.519, 'train_samples_per_second': 0.399, 'train_steps_per_second': 0.399, 'total_flos': 783876096000.0, 'train_loss': 0.05162122845649719, 'epoch': 3.0})

In [16]:
class TextDataset(Dataset):
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data["input_ids"]
        self.attention_mask = tokenized_data["attention_mask"]
        self.labels = tokenized_data["input_ids"].clone()  # Labels must be identical to input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],  # Add labels
        }

tokenized_dataset = TextDataset(tokenized_data)


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=3, training_loss=4.354833602905273, metrics={'train_runtime': 10.4054, 'train_samples_per_second': 0.288, 'train_steps_per_second': 0.288, 'total_flos': 783876096000.0, 'train_loss': 4.354833602905273, 'epoch': 3.0})

In [20]:
from transformers import pipeline

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
prompt = "Enter a sample prompt here"
output = generator(prompt, max_length=100)
print(output)


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Enter a sample prompt here\n\nAnAn\n\n1 hr. 1209\n\n11.'}]


In [21]:
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")


('fine_tuned_gpt2/tokenizer_config.json',
 'fine_tuned_gpt2/special_tokens_map.json',
 'fine_tuned_gpt2/vocab.json',
 'fine_tuned_gpt2/merges.txt',
 'fine_tuned_gpt2/added_tokens.json')

In [24]:
import os
print(os.listdir("/content"))

['.config', 'fine_tuned_gpt2', 'LABOUR_ACT.pdf', 'results', 'gpt2_finetuned', 'logs', 'sample_data']


In [25]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_path = "/content/fine_tuned_gpt2"  # Update this if needed

# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Move model to CUDA
model.to("cuda")


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [43]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Set model path
model_path = "fine_tuned_gpt2"

# Load tokenizer
try:
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token  # Set pad token to EOS token
except Exception as e:
    print("Error loading tokenizer:", e)
    exit()

# Load model
try:
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.eval()  # Set to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
except Exception as e:
    print("Error loading model:", e)
    exit()

def generate_text(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    output = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        top_p=0.95,  # Balanced randomness
        temperature=0.3,  # Reduces hallucination
        repetition_penalty=2.0,  # Avoids repetition
        do_sample=True,
        num_beams=5  # Enhances factual correctness
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
prompt = "According to the Nigerian Labour Act, employees are entitled to"
generated_text = generate_text(prompt)
print(generated_text)


According to the Nigerian Labour Act, employees are entitled to compensation of up to $1,000 for each day they work in Nigeria and up to $2,000 for each day they do not work in Nigeria.

The law also states that if an employee does not work in Nigeria, he or she is entitled to compensation of up to $1,000 for each day he or she does not work in Nigeria.


In [44]:
test_prompts = [
    "According to the Nigerian Labour Act, employees are entitled to",
    "The rights of Nigerian workers include",
    "Employers in Nigeria must provide"
]

for prompt in test_prompts:
    print("Prompt:", prompt)
    print("Generated:", generate_text(prompt))
    print("-" * 80)


Prompt: According to the Nigerian Labour Act, employees are entitled to
Generated: According to the Nigerian Labour Act, employees are entitled to compensation of up to $1,000 for each day they work in Nigeria and up to $2,000 for each day they do not work in Nigeria.

The Nigerian Labour Act states that if a person is employed in Nigeria for more than one year, he or she may be entitled to compensation of up to $1,000 for each day they work in Nigeria and up to $2,000 for each day they do not work in Nigeria.
--------------------------------------------------------------------------------
Prompt: The rights of Nigerian workers include
Generated: The rights of Nigerian workers include the right to self-determination and the right to a fair trial.

"We are very concerned about the situation in Nigeria," he said.
--------------------------------------------------------------------------------
Prompt: Employers in Nigeria must provide
Generated: Employers in Nigeria must provide a minimum