In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install datasets
!pip install transformers 



In [3]:
import torch
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name reused for both the tokenizer here and the classifier later on.
checkpoint = "bert-base-uncased"

# GLUE / SST-2 splits and the tokenizer matching the checkpoint.
raw_datasets = load_dataset(path="glue", name="sst2")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

2024-06-22 05:28:19.762076: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-22 05:28:19.762221: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-22 05:28:19.864152: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:
# Display the splits, columns and row counts of the loaded dataset.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [5]:
# Work only with the original train split; the following cells carve their own
# validation and test sets out of it.
real_datasets = raw_datasets['train']

In [6]:
# Display the selected split (columns and row count).
real_datasets 

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [7]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
train_test_split = real_datasets.train_test_split(
    test_size=0.1,
    seed=42,
)
train_test_split

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 60614
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 6735
    })
})

In [8]:
# Split the remaining training rows again: 10% of them become the validation set.
val_train_split = train_test_split['train'].train_test_split(test_size=0.1, seed=42)

In [9]:
# Display the train / held-out sizes produced by the second split.
val_train_split

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 54552
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 6062
    })
})

In [10]:
# Collect the three splits under the names used by the rest of the notebook:
# train/validation come from the second split, test from the first one.
new_datasets = DatasetDict(
    train=val_train_split['train'],
    validation=val_train_split['test'],
    test=train_test_split['test'],
)

In [11]:
# Display the final train / validation / test layout.
new_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 54552
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 6062
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 6735
    })
})

In [12]:
def tokenize(exemple):
    """Tokenize the ``sentence`` field of an example (or batch of examples).

    Sequences are truncated by the tokenizer but deliberately NOT padded here.
    When this function is used with ``map(..., batched=True)``, padding at this
    stage would pad every example to the longest sentence of the whole map
    batch. Padding is left to ``DataCollatorWithPadding``, which pads each
    DataLoader batch only to that batch's longest sequence.

    Args:
        exemple: mapping with a ``'sentence'`` entry holding the text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences.
    """
    return tokenizer(exemple['sentence'], truncation=True)

In [13]:
# Collator that pads each batch when it is assembled by a DataLoader.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply the tokenize function to every split, processing examples in batches.
tokinazed_dataset = new_datasets.map(function=tokenize, batched=True)

Map:   0%|          | 0/54552 [00:00<?, ? examples/s]

Map:   0%|          | 0/6062 [00:00<?, ? examples/s]

Map:   0%|          | 0/6735 [00:00<?, ? examples/s]

In [14]:
# Drop the columns that are not passed to the model and rename 'label' to
# 'labels', the keyword the training loop forwards to the model.
tokinazed_dataset = (
    tokinazed_dataset
    .remove_columns(['sentence', 'idx'])
    .rename_column('label', 'labels')
)

In [15]:
# Have the dataset return PyTorch tensors instead of Python lists.
tokinazed_dataset.set_format("torch")

In [16]:
from torch.utils.data import DataLoader

# Training batches are reshuffled each epoch; validation batches keep dataset order.
# Both use the padding collator to build batches of 8 examples.
train_dataloader = DataLoader(
    dataset=tokinazed_dataset["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokinazed_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [17]:
from transformers import AutoModelForSequenceClassification

# Pretrained encoder with a freshly initialised 2-class classification head.
# NOTE(review): output_hidden_states=True makes every forward pass also return
# the per-layer hidden states; no visible cell reads them - confirm it is needed.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    output_hidden_states=True,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation and
# is absent from newer transformers releases, so import it from torch.optim.
from torch.optim import AdamW

# eps and weight_decay are pinned to the defaults of the former
# transformers.AdamW (1e-6 and 0.0) so the update rule stays the same;
# torch.optim.AdamW would otherwise default to eps=1e-8, weight_decay=0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [19]:
from transformers import get_scheduler

num_epochs = 5
# Total number of optimizer steps: batches per epoch times number of epochs.
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule over all training steps, with no warmup phase.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)
print(num_training_steps)

34095


In [20]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [21]:
from tqdm.auto import tqdm
# A single progress bar covers every optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the collated batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # The batch carries 'labels', so the model output exposes a loss.
        loss = model(**batch).loss
        loss.backward()
        # Update the weights, advance the LR schedule, then clear the gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/34095 [00:00<?, ?it/s]

In [22]:
!pip install evaluate

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [23]:
# Loader over the held-out test split: batches of 8, dataset order, padded by the collator.
test_dataloader = DataLoader(
    dataset=tokinazed_dataset["test"],
    batch_size=8,
    collate_fn=data_collator,
)

In [24]:
import evaluate
# NOTE(review): the metric is loaded with the "mrpc" configuration although the
# data comes from SST-2; it is kept as-is because the recorded result reports
# both accuracy and F1 - confirm this is the intended metric.
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in test_dataloader:
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        logits = model(**batch).logits
    # The predicted class is the index of the largest logit.
    metric.add_batch(
        predictions=logits.argmax(dim=-1),
        references=batch['labels'],
    )
metric.compute()

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.9510022271714922, 'f1': 0.9554415338914394}

In [25]:
# Save only the model's state dict (not the whole model object) to the
# Kaggle working directory.
PATH = "/kaggle/working/bertSenti.pth"
torch.save(obj=model.state_dict(), f=PATH)

In [26]:
# Switch to the output directory so the link below can use a relative path.
%cd /kaggle/working
from IPython.display import FileLink
# Render a download link for the saved weights file in the cell output.
FileLink('bertSenti.pth')

/kaggle/working
