<a href="https://colab.research.google.com/github/kla55/Pytorch_learning/blob/main/Sentiment_analysis_with_BERT/Sentiment_analysis_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import pandas as pd
import numpy as np
import random

from tqdm.notebook import tqdm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score

pd.options.mode.chained_assignment = None

In [2]:
dset_smile_url='https://archive.org/download/misc-dataset/smile-annotations-final.csv'
df=pd.read_csv(dset_smile_url,header=None)
df.columns=['id','text','category']
# df.set_index('id', inplace=True)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3085 entries, 0 to 3084
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3085 non-null   int64 
 1   text      3085 non-null   object
 2   category  3085 non-null   object
dtypes: int64(1), object(2)
memory usage: 72.4+ KB


Unnamed: 0,id,text,category
0,611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,614877582664835073,@Sofabsports thank you for following me back. ...,happy
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy
...,...,...,...
3080,613678555935973376,MT @AliHaggett: Looking forward to our public ...,happy
3081,613294681225621504,@britishmuseum Upper arm guard?,nocode
3082,615246897670922240,@MrStuchbery @britishmuseum Mesmerising.,happy
3083,613016084371914753,@NationalGallery The 2nd GENOCIDE against #Bia...,not-relevant


In [3]:
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [4]:
df = df[~df.category.str.contains('\|')]
df = df[df.category != 'nocode']

In [5]:
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [6]:
possible_labels = df.category.unique()

In [7]:


label_encoder = preprocessing.LabelEncoder()
df['label']= label_encoder.fit_transform(df['category'])
df

Unnamed: 0,id,text,category,label
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,2
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,2
3,614877582664835073,@Sofabsports thank you for following me back. ...,happy,2
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,2
5,611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,2
...,...,...,...,...
3078,611258135270060033,@_TheWhitechapel @Campaignforwool @SlowTextile...,not-relevant,3
3079,612214539468279808,“@britishmuseum: Thanks for ranking us #1 in @...,happy,2
3080,613678555935973376,MT @AliHaggett: Looking forward to our public ...,happy,2
3082,615246897670922240,@MrStuchbery @britishmuseum Mesmerising.,happy,2


In [8]:
print(df['label'].unique())
print(df['category'].unique())

[2 3 0 1 4 5]
['happy' 'not-relevant' 'angry' 'disgust' 'sad' 'surprise']


In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size = 0.15,
    random_state = 17,
    stratify = df.label.values
)

In [10]:
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [11]:
df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,text
category,label,data_type,Unnamed: 3_level_1,Unnamed: 4_level_1
angry,0,train,48,48
angry,0,val,9,9
disgust,1,train,5,5
disgust,1,val,1,1
happy,2,train,966,966
happy,2,val,171,171
not-relevant,3,train,182,182
not-relevant,3,val,32,32
sad,4,train,27,27
sad,4,val,5,5


In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [14]:
tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [15]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    truncation=True,
    padding=True,
    max_length = 256,
    return_tensors = 'pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'].text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    truncation=True,
    padding=True,
    max_length = 256,
    return_tensors = 'pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [29]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = df['category'].nunique(),
    output_attentions = False,
    output_hidden_states = False
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
batch_size = 4 #32
dataloader_train = DataLoader(
    dataset_train,
    sampler = RandomSampler(dataset_train),
    batch_size = 32
)

dataloader_val = DataLoader(
    dataset_val,
    sampler = SequentialSampler(dataset_val),
    batch_size = 32
)

In [31]:
optimizer = AdamW(
    model.parameters(),
    lr = 2e-5,
    eps = 1e-8
)



In [32]:
epochs = 10
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs
)


In [34]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average="weighted")

def accuracy_per_class(preds, labels):
    labels_dict_inverse = {v: k for k,v in label_dict.items()}
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print('Class: {}'.format(labels_dict_inverse[label]))
        print('Accuracy: {}\n'.format( len(y_preds[y_preds == label]) / len(y_true)) )


In [35]:


seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [39]:
pprint(dataloader_val)

Pretty printing has been turned OFF


In [45]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()

    loss_train_total = 0
    progress_bar = tqdm(dataloader_train,
                        desc="Epoch {:1d}".format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids'       : batch[0],
            'attention_mask'  : batch[1],
            'labels'          : batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix( {'training_loss': '{:3f}'.format(loss.item() / len(batch))} )

    # torch.save(model.state_dict(), 'Models/BERT_ft_epoch{}.model'.format(epoch))
    # tqdm.write('\nEpoch {epoch}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write('Training loss: {}'.format(loss_train_avg))

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)

    tqdm.write('Validation loss: {}'.format(val_loss))
    tqdm.write('f1 score (weighted): {}'.format(val_f1))

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Training loss: 0.06332377216313034


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5986340514251164
f1 score (weighted): 0.8383114832492714


Epoch 2:   0%|          | 0/40 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Training loss: 0.06372757719364017


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5986340514251164
f1 score (weighted): 0.8383114832492714


Epoch 3:   0%|          | 0/40 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Training loss: 0.06438940591178835


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5986340514251164
f1 score (weighted): 0.8383114832492714


Epoch 4:   0%|          | 0/40 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


Training loss: 0.06262137508019805


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5986340514251164
f1 score (weighted): 0.8383114832492714


Epoch 5:   0%|          | 0/40 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)


In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
