In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import string
import torch
import re
import sklearn
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Abort early unless TensorFlow reports the first GPU under its expected name.
# NOTE(review): this only checks TensorFlow's view of the hardware; the PyTorch
# training code further down selects its device on its own.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


## Pre-processing

In [None]:
# Load both splits; each file is read as JSON Lines (one record per line).
train, dev = (pd.read_json(path, lines=True) for path in ("train.jsonl", "dev.jsonl"))
train.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [None]:
# Drop the identifier and image-path columns so only the label and text remain.
# `errors='ignore'` keeps the cell re-runnable once the columns are already gone
# (a plain `del` raises KeyError on the second run).
train = train.drop(columns=['id', 'img'], errors='ignore')
train.head()

Unnamed: 0,label,text
0,0,its their character not their color that matters
1,0,don't be afraid to love again everyone is not ...
2,0,putting bows on your pet
3,0,i love everything and everybody! except for sq...
4,0,"everybody loves chocolate chip cookies, even h..."


In [None]:
# Drop the identifier and image-path columns so only the label and text remain.
# `errors='ignore'` keeps the cell re-runnable once the columns are already gone
# (a plain `del` raises KeyError on the second run).
dev = dev.drop(columns=['id', 'img'], errors='ignore')
dev.head()

Unnamed: 0,label,text
0,1,white people is this a shooting range
1,1,bravery at its finest
2,1,your order comes to $37.50 and your white priv...
3,1,it is time.. to send these parasites back to t...
4,1,mississippi wind chime


In [None]:
# Functions to pre-process the text data:
# 1. remove stopwords
# 2. remove punctuation

def stop_words(df, column, new_column):
  """Write a copy of `column` with English stopwords removed into `new_column`.

  Each value is split on whitespace, tokens found in NLTK's English stopword
  list are discarded, and the remaining tokens are re-joined with single spaces.

  Args:
    df: DataFrame holding the text; it is modified in place.
    column: name of the column containing the source strings.
    new_column: name of the column receiving the filtered strings
      (may be the same as `column`).

  Returns:
    The same `df` object, with `new_column` added or overwritten.
  """
  # Build the lookup once: calling stopwords.words() per token re-creates the
  # whole list every time, and a set gives constant-time membership tests.
  english_stopwords = frozenset(stopwords.words('english'))
  df[new_column] = df[column].apply(
      lambda text: ' '.join(word for word in text.split() if word not in english_stopwords))
  return df

def punctuation(df, column, new_column):
  """Write a copy of `column` with punctuation characters removed into `new_column`.

  Every character listed in `string.punctuation` is dropped; all other
  characters (including whitespace) are kept in their original order.

  Args:
    df: DataFrame holding the text; it is modified in place.
    column: name of the column containing the source strings.
    new_column: name of the column receiving the stripped strings
      (may be the same as `column`).

  Returns:
    The same `df` object, with `new_column` added or overwritten.
  """
  def _strip(text):
    kept = [ch for ch in text if ch not in string.punctuation]
    return "".join(kept)

  df[new_column] = df[column].apply(_strip)
  return df


In [None]:
# Clean the training texts: stopwords are removed first, then punctuation.
# Both helpers return the frame they were given, so `cleaned_train` is the
# same object as `train`.
cleaned_train = punctuation(
    stop_words(train, 'text', 'cleaned_text'), 'cleaned_text', 'cleaned_text')
# Only the cleaned column is kept alongside the label.
cleaned_train.drop(columns='text', inplace=True)
cleaned_train.head()

Unnamed: 0,label,cleaned_text
0,0,character color matters
1,0,afraid love everyone like ex
2,0,putting bows pet
3,0,love everything everybody except squirrels hat...
4,0,everybody loves chocolate chip cookies even hi...


In [None]:
# Clean the dev texts: stopwords are removed first, then punctuation.
# Both helpers return the frame they were given, so `cleaned_dev` is the
# same object as `dev`.
cleaned_dev = punctuation(
    stop_words(dev, 'text', 'cleaned_text'), 'cleaned_text', 'cleaned_text')
# Only the cleaned column is kept alongside the label.
cleaned_dev.drop(columns='text', inplace=True)
cleaned_dev.head()

Unnamed: 0,label,cleaned_text
0,1,white people shooting range
1,1,bravery finest
2,1,order comes 3750 white privilege discount brin...
3,1,time send parasites back desert
4,1,mississippi wind chime


In [None]:
# Pull the cleaned sentences and their labels out of the frame as arrays.
text_train, labels_train = cleaned_train['cleaned_text'].values, cleaned_train['label'].values

In [None]:
# Pull the cleaned sentences and their labels out of the frame as arrays.
text_dev, labels_dev = cleaned_dev['cleaned_text'].values, cleaned_dev['label'].values

In [None]:
# Longest cleaned training text. `len` is applied to each entry of `text_train`,
# so for strings this is a character count, not a token count.
max_len_train = max((len(sentence) for sentence in text_train), default=0)

print('Max sentence length :', max_len_train)

Max sentence length : 336


In [None]:
# Longest cleaned dev text. `len` is applied to each entry of `text_dev`,
# so for strings this is a character count, not a token count.
max_len_dev = max((len(sentence) for sentence in text_dev), default=0)

print('Max sentence length :', max_len_dev)

Max sentence length : 151


In [None]:
# Padding/truncation length handed to the tokenizer. It is derived from the
# maxima computed above instead of being hard-coded, so it follows the data.
# The cap of 512 is the longest sequence the BERT checkpoints accept.
max_len = min(max(max_len_train, max_len_dev), 512)

## Tokenization

In [None]:
from transformers import BertTokenizer

# NOTE(review): `do_lower_case=True` is combined with the *cased* checkpoint;
# confirm this pairing is intentional.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)

# Tokenize one cleaned sentence to inspect what the model will receive.
example_text = cleaned_train['cleaned_text'][0]
bert_input = tokenizer(
    example_text,
    padding='max_length',
    max_length=max_len,
    truncation=True,
    return_tensors="pt",
)

print(example_text)
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
  print(bert_input[field])

character color matters
tensor([[ 101, 1959, 2942, 5218,  102,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
    

Création de la classe Dataset qui permet de mettre les datasets avec le texte tokenisé au bon format pour BERT

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Dataset pairing each tokenized sentence with its label.

    All sentences are tokenized once, at construction time, with the
    notebook-level `tokenizer` and `max_len` (padding to `max_len`,
    truncating longer inputs, returning PyTorch tensors).
    """

    def __init__(self, df):
        """Read labels from `df['label']` and tokenize `df['cleaned_text']`."""
        self.labels = list(df['label'])
        self.texts = [
            tokenizer(sentence, padding='max_length', max_length=max_len,
                      truncation=True, return_tensors="pt")
            for sentence in df['cleaned_text']
        ]

    def classes(self):
        """Return the stored list of labels."""
        return self.labels

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by `idx`, wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by `idx`."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return a `(tokenized_text, label)` pair for `idx`."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

Création de la classe BertClassifier qui permet la classification avec BERT + ajout d'une couche linéaire pour notre classification binaire 

In [None]:
from torch import nn
from transformers import BertModel #, BertForSequenceClassification

class BertClassifier(nn.Module):
    """Pretrained BERT followed by dropout and a one-unit linear layer.

    The forward pass returns the raw output of the linear layer; no sigmoid
    is applied inside the model.
    """

    def __init__(self, dropout=0.5):
        """Load 'bert-base-cased' and build the classification head.

        Args:
            dropout: drop probability applied to BERT's pooled output.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Linear layer for binary classification: 768 input features, 1 output.
        self.linear = nn.Linear(768, 1)
        # A sigmoid layer was tried here and left disabled (the author noted it did not work):
        # self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return the linear layer's output for a batch.

        Args:
            input_id: token ids, passed to BERT as `input_ids`.
            mask: passed to BERT as `attention_mask`.
        """
        # With return_dict=False BERT returns a tuple; only its second element
        # (the pooled output) is used.
        _sequence_output, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False)
        # final_layer = self.sigmoid(linear_output)  # disabled together with the sigmoid layer
        return self.linear(self.dropout(pooled_output))

In [None]:
from torch.optim import Adam
from tqdm import tqdm  #  to show a smart progress meter when the model is training


def _count_correct(logits, labels):
    """Count predictions that match `labels`, deciding at probability 0.5.

    The model emits raw logits (the loss is BCEWithLogitsLoss), and
    sigmoid(z) > 0.5 is equivalent to z > 0, so the threshold on logits is 0.
    """
    predictions = (logits > 0).float()
    return (predictions == labels).float().sum().item()


def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune `model` and print per-epoch loss and accuracy for both splits.

    Args:
        model: module called as `model(input_id, mask)` and returning one logit per example.
        train_data: frame passed to `Dataset` for training.
        val_data: frame passed to `Dataset` for validation.
        learning_rate: learning rate given to Adam.
        epochs: number of passes over the training data.

    Notes:
        * `batch_size` is read from the notebook's global namespace and must be
          defined before this function is called.
        * The printed losses are per-example means: each batch's mean loss is
          weighted by its batch size before dividing by the dataset length.
        * The model is left in eval mode when the function returns.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model before building the optimizer so Adam tracks the on-device parameters.
    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        # Training mode enables dropout.
        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):

            train_label = train_label.unsqueeze(1).float().to(device)
            # The tokenizer output keeps a singleton dimension per example;
            # squeeze it from both tensors so they share one shape.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # batch_loss is a mean over the batch; weight it by the batch size
            # so the epoch total is a sum over examples.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += _count_correct(output, train_label)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Eval mode disables dropout so validation metrics are deterministic.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.unsqueeze(1).float().to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += _count_correct(output, val_label)

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
# Hyper-parameters passed to `train` by the cells below.
LR = 1e-6
EPOCHS = 12
# Fresh classifier instance to be fine-tuned.
model = BertClassifier()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
batch_size = 16
# `train` chooses the PyTorch device itself (CUDA when available), so the
# TensorFlow device scope that used to wrap this call had no effect on it.
train(model, cleaned_train, cleaned_dev, LR, EPOCHS)

# Example output for LR = 1e-6

100%|██████████| 532/532 [07:28<00:00,  1.18it/s]


Epochs: 1 | Train Loss:  0.042                 | Train Accuracy:  0.637                 | Val Loss:  0.048                 | Val Accuracy:  0.500


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 2 | Train Loss:  0.041                 | Train Accuracy:  0.641                 | Val Loss:  0.048                 | Val Accuracy:  0.508


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 3 | Train Loss:  0.039                 | Train Accuracy:  0.644                 | Val Loss:  0.051                 | Val Accuracy:  0.516


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 4 | Train Loss:  0.037                 | Train Accuracy:  0.677                 | Val Loss:  0.051                 | Val Accuracy:  0.524


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 5 | Train Loss:  0.035                 | Train Accuracy:  0.713                 | Val Loss:  0.053                 | Val Accuracy:  0.520


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 6 | Train Loss:  0.032                 | Train Accuracy:  0.742                 | Val Loss:  0.055                 | Val Accuracy:  0.530


In [None]:
batch_size = 16
# The output recorded below is for LR = 1e-5 on a freshly initialised model.
# Set both here so the cell reproduces it on a top-to-bottom run instead of
# silently continuing the previous run with the previous learning rate.
LR = 1e-5
model = BertClassifier()
# `train` chooses the PyTorch device itself, so no TensorFlow device scope is needed.
train(model, cleaned_train, cleaned_dev, LR, EPOCHS)
# Example output for LR = 1e-5

100%|██████████| 532/532 [07:29<00:00,  1.18it/s]


Epochs: 1 | Train Loss:  0.039                 | Train Accuracy:  0.673                 | Val Loss:  0.052                 | Val Accuracy:  0.510


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 2 | Train Loss:  0.033                 | Train Accuracy:  0.735                 | Val Loss:  0.051                 | Val Accuracy:  0.532


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 3 | Train Loss:  0.028                 | Train Accuracy:  0.778                 | Val Loss:  0.050                 | Val Accuracy:  0.522


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 4 | Train Loss:  0.022                 | Train Accuracy:  0.804                 | Val Loss:  0.073                 | Val Accuracy:  0.540


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 5 | Train Loss:  0.018                 | Train Accuracy:  0.821                 | Val Loss:  0.091                 | Val Accuracy:  0.544


100%|██████████| 532/532 [07:31<00:00,  1.18it/s]


Epochs: 6 | Train Loss:  0.016                 | Train Accuracy:  0.835                 | Val Loss:  0.097                 | Val Accuracy:  0.544


In [None]:
# Inspection cell: steps through the batches the way the training loop does, to
# show what the inputs, masks, labels and outputs look like.
# There is no need to run it.
from tqdm import tqdm
model = BertClassifier()
# Named *_ds so the `train` function defined earlier is not overwritten by a Dataset object.
train_ds, val_ds = Dataset(cleaned_train), Dataset(cleaned_dev)

train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=8)

loss = nn.BCEWithLogitsLoss()

# No TensorFlow device scope here: it does not affect PyTorch tensors or modules.
for train_input, train_label in tqdm(train_dataloader):
  train_label = train_label.unsqueeze(1).float()
  # Squeeze the per-example singleton dimension from both tensors so they share one shape.
  mask = train_input['attention_mask'].squeeze(1)
  input_id = train_input['input_ids'].squeeze(1)
  output = model(input_id, mask)
  bce = loss(output, train_label)
  print(train_label)
  print(mask)
  print(input_id)
  print(output)
  print(bce.item())
  # The model outputs logits, so probability 0.5 corresponds to a threshold of 0.
  output = (output > 0).float()
  print(output)
  acc = (output == train_label).float().sum().item()
  print(acc)