In [3]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
from pypdf import PdfReader
import json
import os

In [4]:
def extract_pdf(path):
    """Extracts text from a PDF file."""
    reader = PdfReader(path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

def chunk_text(text, max_tokens=400):
    """Chunks text into segments of approximately max_tokens (words)."""
    words = text.split()
    chunks = []
    current = []
    
    overlap = 50 
    
    for i, w in enumerate(words):
        current.append(w)
        if len(current) >= max_tokens:
            chunks.append(" ".join(current))
            current = words[i - (max_tokens - overlap - 1): i + 1]
    
    if current and len(" ".join(current).split()) < max_tokens:
        chunks.append(" ".join(current))
        
    return chunks

In [5]:
pdf_path = "/kaggle/input/tunerpdf/Behavioural entrepreneurial mindset.pdf"
try:
    pdf_text = extract_pdf(pdf_path)
except FileNotFoundError:
    print(f"Error: PDF not found at {pdf_path}. Please check the path.")
    pdf_text = "Placeholder text for a document about an entrepreneurial mindset. It discusses core behavioral traits and how they affect business outcomes."
    
chunks = chunk_text(pdf_text)

qa_dataset = []

for ch in chunks:
    instruction = "Answer the question based on the context."
    question = "What is the main topic of this passage?"
    answer = " ".join(ch.split()[:50])

    qa_dataset.append({
        "instruction": instruction,
        "context": ch,
        "question": question,
        "answer": answer 
    })

jsonl_file = "train_qa.jsonl"
with open(jsonl_file, "w", encoding="utf-8") as f:
    for item in qa_dataset:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

In [None]:
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128



In [None]:
raw_dataset = load_dataset("json", data_files={"train": jsonl_file})

dataset_dict = raw_dataset["train"].train_test_split(test_size=0.1, seed=42)
print(dataset_dict.keys()) 

def tokenize_function(examples):
    prompts = [
        f"### Instruction:\n{inst}\n\n### Context:\n{ctx}\n\n### Question:\n{q}\n\n### Answer: (Must be a concise summary of the key idea)"
        # Note the added explicit instruction at the end
        for inst, ctx, q in zip(examples['instruction'], examples['context'], examples['question'])
    ]
    
    model_inputs = tokenizer(
        prompts, 
        max_length=MAX_INPUT_LENGTH, 
        padding="max_length", 
        truncation=True,
        return_tensors=None 
    )

    labels = tokenizer(
        examples['answer'], 
        max_length=MAX_TARGET_LENGTH, 
        padding="max_length", 
        truncation=True,
        return_tensors=None
    )
    
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

tokenized_dataset = dataset_dict.map(
    tokenize_function, 
    batched=True,
    remove_columns=['instruction', 'context', 'question', 'answer'] # Clean up columns
)
print(tokenized_dataset.keys())

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_qa",
    eval_strategy="epoch",
    save_strategy="epoch",
    # save_steps=50,
    # eval_steps=50,
    logging_steps=5,
    learning_rate=3e-5,
    per_device_train_batch_size=4, 
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    generation_max_length=250,
    
    report_to="none" 
)

In [14]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    model=model,
    padding=True 
)

In [15]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()
print("Training complete.")

Starting training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,8.5746,6.179919
2,4.313,3.233747
3,2.6929,1.915238
4,1.7057,1.469612
5,1.654,1.231827
6,1.272,1.070309
7,1.1994,0.974144
8,1.0593,0.883781
9,1.027,0.802639
10,0.9057,0.727466




Training complete.


In [16]:
test_text = (
    f"### Instruction:\nAnswer the question based on the context.\n\n"
    f"### Context:\n{chunks[0]}\n\n"
    f"### Question:\nWhat is the main gist of the document ?n\n### Answer:"
)

inputs = tokenizer(test_text, return_tensors="pt", truncation=True, padding="max_length", max_length=MAX_INPUT_LENGTH).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=MAX_TARGET_LENGTH)
print("\nQuery Result:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Query Result:
Behavioural entrepreneurial mindset: How entrepreneurial education activity impacts entrepreneurial intention and behaviour Jun Cui & Robin Bell The International Journal of Management Education (2022), 20(2), 100639. Abstract This research investigates how entrepreneurial education activity (EEA) influences entrepreneurial behaviour (EB) by unpacking how EEA influences both entrepreneurial intention (EI) and EB and how behavioural entrepreneurial mindset (BEM) mediates the relationship between EEA and EI. This furthers research into the behavioural subdimension of entrepreneurial mindset and how this impacts the relationship between EEA and EI. Confirm
