Connected to Python 3.11.8

In [2]:
import torch
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_from_disk

# Load the tokenized datasets
tokenized_train = load_from_disk('D:/NLP/data/processed')
tokenized_test = load_from_disk('D:/NLP/data/processed')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',           
    evaluation_strategy='epoch',       
    learning_rate=2e-5,                
    per_device_train_batch_size=16,    
    per_device_eval_batch_size=64,     
    num_train_epochs=3,               
    weight_decay=0.01,                 
)

# Create a Trainer instance
trainer = Trainer(
    model=model,                         
    args=training_args,                  
    train_dataset=tokenized_train,       
    eval_dataset=tokenized_test           
)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
trainer.train()

 11%|█         | 500/4689 [52:07<7:28:46,  6.43s/it]   

{'loss': 0.4074, 'grad_norm': 8.06667423248291, 'learning_rate': 1.7867349114949884e-05, 'epoch': 0.32}


 21%|██▏       | 1000/4689 [1:37:27<5:05:56,  4.98s/it]

{'loss': 0.3483, 'grad_norm': 8.945062637329102, 'learning_rate': 1.5734698229899766e-05, 'epoch': 0.64}


 32%|███▏      | 1500/4689 [2:17:09<4:16:29,  4.83s/it]

{'loss': 0.3111, 'grad_norm': 7.257636070251465, 'learning_rate': 1.3602047344849649e-05, 'epoch': 0.96}


                                                       
 33%|███▎      | 1563/4689 [3:02:43<3:45:17,  4.32s/it]

{'eval_loss': 0.20376761257648468, 'eval_runtime': 2424.4739, 'eval_samples_per_second': 10.312, 'eval_steps_per_second': 0.161, 'epoch': 1.0}


 43%|████▎     | 2000/4689 [3:39:47<4:06:21,  5.50s/it]   

{'loss': 0.2269, 'grad_norm': 4.437220096588135, 'learning_rate': 1.1469396459799531e-05, 'epoch': 1.28}


 53%|█████▎    | 2500/4689 [4:25:54<3:12:04,  5.26s/it]

{'loss': 0.2092, 'grad_norm': 2.313533306121826, 'learning_rate': 9.336745574749414e-06, 'epoch': 1.6}


 64%|██████▍   | 3000/4689 [5:07:05<2:22:05,  5.05s/it]

{'loss': 0.2191, 'grad_norm': 10.264089584350586, 'learning_rate': 7.204094689699297e-06, 'epoch': 1.92}


                                                       
 67%|██████▋   | 3126/4689 [5:57:33<1:50:13,  4.23s/it]

{'eval_loss': 0.10435622185468674, 'eval_runtime': 2406.1835, 'eval_samples_per_second': 10.39, 'eval_steps_per_second': 0.162, 'epoch': 2.0}


 75%|███████▍  | 3500/4689 [6:28:31<1:38:42,  4.98s/it]   

{'loss': 0.1515, 'grad_norm': 2.352600336074829, 'learning_rate': 5.07144380464918e-06, 'epoch': 2.24}


 85%|████████▌ | 4000/4689 [7:07:46<53:15,  4.64s/it]  

{'loss': 0.1334, 'grad_norm': 0.07457254081964493, 'learning_rate': 2.9387929195990615e-06, 'epoch': 2.56}


 96%|█████████▌| 4500/4689 [7:50:01<18:24,  5.85s/it]  

{'loss': 0.14, 'grad_norm': 15.1834716796875, 'learning_rate': 8.061420345489445e-07, 'epoch': 2.88}


                                                     
100%|██████████| 4689/4689 [8:48:42<00:00,  6.77s/it]

{'eval_loss': 0.07582058012485504, 'eval_runtime': 2589.2046, 'eval_samples_per_second': 9.655, 'eval_steps_per_second': 0.151, 'epoch': 3.0}
{'train_runtime': 31722.3668, 'train_samples_per_second': 2.364, 'train_steps_per_second': 0.148, 'train_loss': 0.23439307567737178, 'epoch': 3.0}





TrainOutput(global_step=4689, training_loss=0.23439307567737178, metrics={'train_runtime': 31722.3668, 'train_samples_per_second': 2.364, 'train_steps_per_second': 0.148, 'total_flos': 2483763724800000.0, 'train_loss': 0.23439307567737178, 'epoch': 3.0})

In [5]:
model.save_pretrained('./distilbert-imdb')

In [7]:
model.eval

<bound method Module.eval of DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dr