Project Overview


Objective: Build a system that generates relevant questions from given text passages using the T5 model.

Dataset

SQuAD (Stanford Question Answering Dataset): Contains passages and corresponding questions and answers.

I  choosed this dataset cause it's very common for this kind of task.

In [11]:
from datasets import load_dataset

dataset = load_dataset('squad')


Data Preparation

In [12]:
def prepare_data(example):
    example['source_text'] = "generate question: " + example['context']
    example['target_text'] = example['question']
    return example

dataset = dataset.map(prepare_data)


Tokenization

In [18]:
def tokenize(batch):
    from transformers import T5Tokenizer  # Import inside the function
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    
    source = tokenizer(
        batch['source_text'],
        padding='max_length',
        truncation=True,
        max_length=512
    )
    target = tokenizer(
        batch['target_text'],
        padding='max_length',
        truncation=True,
        max_length=64
    )

    batch['input_ids'] = source['input_ids']
    batch['attention_mask'] = source['attention_mask']
    batch['labels'] = target['input_ids']
    return batch


# Tokenize the dataset
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    batch_size=1000,       
    num_proc=4,           
    remove_columns=dataset['train'].column_names,
)



Map (num_proc=4): 100%|██████████| 87599/87599 [00:28<00:00, 3113.73 examples/s]
Map (num_proc=4): 100%|██████████| 10570/10570 [00:08<00:00, 1241.83 examples/s]


Fine-Tuning the T5 Model

In [19]:
# Proceed with model initialization and training
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [20]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy='epoch',
    save_steps=10_000,
    save_total_limit=2,
)




In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

In [22]:
trainer.train()


  1%|          | 500/65700 [19:02<41:43:09,  2.30s/it]

{'loss': 1.0469, 'grad_norm': 1.2295717000961304, 'learning_rate': 4.961948249619483e-05, 'epoch': 0.02}


  2%|▏         | 1000/65700 [38:51<43:52:42,  2.44s/it]

{'loss': 0.5507, 'grad_norm': 1.3147540092468262, 'learning_rate': 4.923896499238965e-05, 'epoch': 0.05}


  2%|▏         | 1500/65700 [58:45<42:50:44,  2.40s/it]

{'loss': 0.5289, 'grad_norm': 1.0878664255142212, 'learning_rate': 4.8858447488584476e-05, 'epoch': 0.07}


  3%|▎         | 2000/65700 [1:18:27<48:13:05,  2.73s/it]

{'loss': 0.5288, 'grad_norm': 1.2267041206359863, 'learning_rate': 4.84779299847793e-05, 'epoch': 0.09}


  4%|▍         | 2500/65700 [1:38:40<43:12:41,  2.46s/it]

{'loss': 0.5173, 'grad_norm': 1.1972581148147583, 'learning_rate': 4.8097412480974124e-05, 'epoch': 0.11}


  5%|▍         | 3000/65700 [1:58:41<42:08:41,  2.42s/it]

{'loss': 0.5158, 'grad_norm': 1.1055465936660767, 'learning_rate': 4.7716894977168955e-05, 'epoch': 0.14}


  5%|▌         | 3500/65700 [2:18:31<40:44:39,  2.36s/it]

{'loss': 0.5113, 'grad_norm': 0.6913097500801086, 'learning_rate': 4.733637747336377e-05, 'epoch': 0.16}


  6%|▌         | 4000/65700 [2:37:59<39:48:18,  2.32s/it]

{'loss': 0.5108, 'grad_norm': 1.547231912612915, 'learning_rate': 4.6955859969558604e-05, 'epoch': 0.18}


  7%|▋         | 4500/65700 [2:57:20<39:17:26,  2.31s/it]

{'loss': 0.5048, 'grad_norm': 1.8858858346939087, 'learning_rate': 4.657534246575342e-05, 'epoch': 0.21}


  8%|▊         | 5000/65700 [3:17:39<46:34:01,  2.76s/it]

{'loss': 0.522, 'grad_norm': 0.9628155827522278, 'learning_rate': 4.619482496194825e-05, 'epoch': 0.23}


  8%|▊         | 5500/65700 [3:39:17<42:22:26,  2.53s/it]

{'loss': 0.498, 'grad_norm': 0.85796719789505, 'learning_rate': 4.581430745814308e-05, 'epoch': 0.25}


  9%|▉         | 6000/65700 [4:01:22<44:12:29,  2.67s/it]

{'loss': 0.4953, 'grad_norm': 1.0098198652267456, 'learning_rate': 4.54337899543379e-05, 'epoch': 0.27}


 10%|▉         | 6500/65700 [4:33:41<81:18:48,  4.94s/it]

{'loss': 0.4981, 'grad_norm': 1.1616727113723755, 'learning_rate': 4.5053272450532726e-05, 'epoch': 0.3}


 11%|█         | 7000/65700 [5:14:41<36:14:59,  2.22s/it]

{'loss': 0.5052, 'grad_norm': 1.3076822757720947, 'learning_rate': 4.467275494672755e-05, 'epoch': 0.32}


 11%|█▏        | 7500/65700 [5:33:22<36:39:29,  2.27s/it]

{'loss': 0.4944, 'grad_norm': 1.103482961654663, 'learning_rate': 4.4292237442922375e-05, 'epoch': 0.34}


 12%|█▏        | 8000/65700 [5:52:05<35:30:53,  2.22s/it]

{'loss': 0.4993, 'grad_norm': 1.1078176498413086, 'learning_rate': 4.39117199391172e-05, 'epoch': 0.37}


 13%|█▎        | 8500/65700 [6:10:49<35:25:54,  2.23s/it]

{'loss': 0.5002, 'grad_norm': 1.2114601135253906, 'learning_rate': 4.3531202435312024e-05, 'epoch': 0.39}


 14%|█▎        | 9000/65700 [6:30:07<35:59:22,  2.29s/it]

{'loss': 0.4969, 'grad_norm': 1.2767448425292969, 'learning_rate': 4.3150684931506855e-05, 'epoch': 0.41}


 14%|█▍        | 9500/65700 [6:49:06<35:53:38,  2.30s/it]

{'loss': 0.5003, 'grad_norm': 0.93707674741745, 'learning_rate': 4.277016742770167e-05, 'epoch': 0.43}


 15%|█▌        | 10000/65700 [7:08:07<35:26:00,  2.29s/it]

{'loss': 0.4806, 'grad_norm': 1.260071873664856, 'learning_rate': 4.2389649923896504e-05, 'epoch': 0.46}


 16%|█▌        | 10500/65700 [7:27:06<33:19:55,  2.17s/it]

{'loss': 0.4896, 'grad_norm': 1.475561499595642, 'learning_rate': 4.200913242009132e-05, 'epoch': 0.48}


 17%|█▋        | 11000/65700 [7:45:30<32:55:39,  2.17s/it]

{'loss': 0.4817, 'grad_norm': 1.476589322090149, 'learning_rate': 4.162861491628615e-05, 'epoch': 0.5}


 18%|█▊        | 11500/65700 [8:03:36<32:33:21,  2.16s/it]

{'loss': 0.4994, 'grad_norm': 0.9116871356964111, 'learning_rate': 4.124809741248098e-05, 'epoch': 0.53}


 18%|█▊        | 12000/65700 [8:21:40<32:25:15,  2.17s/it]

{'loss': 0.4925, 'grad_norm': 1.3479228019714355, 'learning_rate': 4.08675799086758e-05, 'epoch': 0.55}


 19%|█▉        | 12500/65700 [8:39:53<32:26:29,  2.20s/it]

{'loss': 0.4812, 'grad_norm': 0.8781104683876038, 'learning_rate': 4.0487062404870626e-05, 'epoch': 0.57}


 20%|█▉        | 13000/65700 [8:58:02<31:19:55,  2.14s/it]

{'loss': 0.4857, 'grad_norm': 1.1671618223190308, 'learning_rate': 4.010654490106545e-05, 'epoch': 0.59}


 21%|██        | 13500/65700 [9:16:08<31:59:11,  2.21s/it]

{'loss': 0.4839, 'grad_norm': 0.9998559355735779, 'learning_rate': 3.9726027397260274e-05, 'epoch': 0.62}


 21%|██▏       | 14000/65700 [9:34:16<30:59:38,  2.16s/it]

{'loss': 0.4757, 'grad_norm': 1.1517494916915894, 'learning_rate': 3.93455098934551e-05, 'epoch': 0.64}


 22%|██▏       | 14500/65700 [9:52:25<31:04:39,  2.19s/it]

{'loss': 0.4817, 'grad_norm': 0.8982422947883606, 'learning_rate': 3.896499238964992e-05, 'epoch': 0.66}


 23%|██▎       | 15000/65700 [10:10:34<30:54:06,  2.19s/it]

{'loss': 0.4964, 'grad_norm': 0.9464734196662903, 'learning_rate': 3.8584474885844754e-05, 'epoch': 0.68}


 24%|██▎       | 15500/65700 [10:28:44<29:57:38,  2.15s/it]

{'loss': 0.4864, 'grad_norm': 0.9132115244865417, 'learning_rate': 3.820395738203957e-05, 'epoch': 0.71}


 24%|██▍       | 16000/65700 [10:46:52<29:50:13,  2.16s/it]

{'loss': 0.4849, 'grad_norm': 1.2029144763946533, 'learning_rate': 3.78234398782344e-05, 'epoch': 0.73}


 25%|██▌       | 16500/65700 [11:05:23<30:17:31,  2.22s/it]

{'loss': 0.4836, 'grad_norm': 1.0474565029144287, 'learning_rate': 3.744292237442922e-05, 'epoch': 0.75}


 26%|██▌       | 17000/65700 [11:23:56<29:47:03,  2.20s/it]

{'loss': 0.4806, 'grad_norm': 1.1400055885314941, 'learning_rate': 3.706240487062405e-05, 'epoch': 0.78}


 27%|██▋       | 17500/65700 [11:42:38<29:54:50,  2.23s/it]

{'loss': 0.4807, 'grad_norm': 0.8473324179649353, 'learning_rate': 3.6681887366818876e-05, 'epoch': 0.8}


 27%|██▋       | 18000/65700 [12:01:19<29:24:57,  2.22s/it]

{'loss': 0.4867, 'grad_norm': 0.981948733329773, 'learning_rate': 3.63013698630137e-05, 'epoch': 0.82}


 28%|██▊       | 18500/65700 [12:20:00<29:20:36,  2.24s/it]

{'loss': 0.4868, 'grad_norm': 0.9347368478775024, 'learning_rate': 3.5920852359208525e-05, 'epoch': 0.84}


 29%|██▉       | 19000/65700 [12:38:41<29:09:20,  2.25s/it]

{'loss': 0.4795, 'grad_norm': 0.8082241415977478, 'learning_rate': 3.554033485540335e-05, 'epoch': 0.87}


 30%|██▉       | 19500/65700 [12:57:23<28:41:07,  2.24s/it]

{'loss': 0.4752, 'grad_norm': 1.0815788507461548, 'learning_rate': 3.5159817351598174e-05, 'epoch': 0.89}


 30%|███       | 20000/65700 [22:11:12<28:08:54,  2.22s/it]      

{'loss': 0.478, 'grad_norm': 0.9095015525817871, 'learning_rate': 3.4779299847793e-05, 'epoch': 0.91}


 31%|███       | 20500/65700 [22:55:19<71:13:47,  5.67s/it]

{'loss': 0.47, 'grad_norm': 0.9896535277366638, 'learning_rate': 3.439878234398782e-05, 'epoch': 0.94}


 32%|███▏      | 21000/65700 [23:30:49<65:08:32,  5.25s/it]

{'loss': 0.4711, 'grad_norm': 1.4654635190963745, 'learning_rate': 3.4018264840182654e-05, 'epoch': 0.96}


 32%|███▏      | 21236/65700 [23:47:51<27:57:05,  2.26s/it] 

KeyboardInterrupt: 

In [None]:
# Save the trained model
trainer.save_model('./trained_t5_question_generator')