In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
df=pd.read_csv("smile-annotations-final.csv",names=["id","text","category"])

In [3]:
df.head()

Unnamed: 0,id,text,category
0,611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
1,614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,614877582664835073,@Sofabsports thank you for following me back. ...,happy
4,611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [4]:
df["category"].value_counts()

category
nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: count, dtype: int64

In [5]:
df=df[["text","category"]]

In [6]:
df.head()

Unnamed: 0,text,category
0,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
1,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,@Sofabsports thank you for following me back. ...,happy
4,@britishmuseum @TudorHistory What a beautiful ...,happy


In [7]:
df["category"].value_counts()

category
nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: count, dtype: int64

In [8]:
df.isnull().sum()

text        0
category    0
dtype: int64

In [9]:
selected_categories=['happy', 'not-relevant', 'angry', 'surprise', 'sad', 'disgust']

In [10]:
df = df[df["category"].isin(selected_categories)]

In [11]:
df["category"].value_counts()

category
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64

In [12]:
df.head(10)

Unnamed: 0,text,category
1,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
2,@SelectShowcase @Tate_StIves ... Replace with ...,happy
3,@Sofabsports thank you for following me back. ...,happy
4,@britishmuseum @TudorHistory What a beautiful ...,happy
5,@NationalGallery @ThePoldarkian I have always ...,happy
9,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,happy
12,Yr 9 art students are off to the @britishmuseu...,happy
15,@RAMMuseum Please vote for us as @sainsbury #s...,not-relevant
16,#AskTheGallery Have you got plans to privatise...,not-relevant
18,@BarbyWT @britishmuseum so beautiful,happy


In [13]:
df["category"].value_counts()

category
happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: count, dtype: int64

In [14]:
possible_labels = df.category.unique()

In [15]:
possible_labels

array(['happy', 'not-relevant', 'angry', 'disgust', 'sad', 'surprise'],
      dtype=object)

# This is used to convert categorical value to numerical values. Alternatives of LabelEncoder, OneHotEncoding etc.

In [16]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

###### The enumerate function is used to iterate over the (possible_labels) list along with its index. It returns pairs of index and corresponding values. For each pair (index, possible_label), the code assigns the label (possible_label) as the key in the dictionary (label_dict), and the index (index) as the corresponding value.

In [17]:
label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

###### map() function replaces the values in the 'category' column with the corresponding values from a dictionary (label_dict). 

In [18]:
df.category = df['category'].map(label_dict)

In [19]:
df.head()

Unnamed: 0,text,category
1,Dorian Gray with Rainbow Scarf #LoveWins (from...,0
2,@SelectShowcase @Tate_StIves ... Replace with ...,0
3,@Sofabsports thank you for following me back. ...,0
4,@britishmuseum @TudorHistory What a beautiful ...,0
5,@NationalGallery @ThePoldarkian I have always ...,0


In [20]:
X=df.index.values
Y=df["category"].values

In [21]:
print(df.index.isin(X).all())

True


In [22]:
from sklearn.model_selection import train_test_split

###### The stratify parameter in the train_test_split function is used to ensure that the distribution of the target variable (in this case, the 'category' column) is preserved in both the training and validation sets. This is particularly important when dealing with imbalanced classes.

In [23]:
X_train, X_val, y_train, y_val = train_test_split(X,Y,test_size=0.15,random_state=42,
                                                  stratify=df.category.values)

In [24]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(1258,)
(1258,)
(223,)
(223,)


In [25]:
print(df.index.isin(X_train).all())

False


In [26]:
print(df.index)

Index([   1,    2,    3,    4,    5,    9,   12,   15,   16,   18,
       ...
       3068, 3070, 3071, 3076, 3077, 3078, 3079, 3080, 3082, 3083],
      dtype='int64', length=1481)


In [27]:
df['data_type'] = ['not_set']*df.shape[0]

In [28]:
df.loc[X_train, 'data_type'] = 'train'
print(X_train)

[2934 2882 1791 ...  632 1311 2844]


In [29]:
df.loc[X_val, 'data_type'] = 'val'
print(X_val)

[2875 1843 1928 1372  436  322 1954 2349 2671 1676   98  152 2902  366
   33 1489 2150 1379 1668 2333  889 1426 1907 2136 1702 2115  592 2727
 3054 1277   36  250 2751  650  556 1809  937 3076 1591  523  978 1296
 2993 1865 3019  357 1507 1628   92 2381  832 2929 1014 1990  672  743
  866  999 2071  445 1560 3021 1252 1998  486 1720  226  872 2701 1328
  883 1714  657 3051 1332  175 3068 3064 1066 1667 1032 1696  574 2842
 1001 1636  462 1250 2431  647  942 2785 2140 1453 1517  980 1812 2799
 1996 2796 2722  527 1943 2127 1336 2781 2199 1020   52 2724 1015 1853
 3070  696 2610  239  734 1475 1818 1596  495  624 1845 3014  988 1007
 1980 1772 1174   69  103 2805 1208 1483 2551  114  753 1045  255  589
  826 1923 2052 2152  245  676 2619 2078  262  579 1212 1254  500 1480
 2170 1079 3020 2353 2087  497 2343 2267 1621 1844  345 2822 2681 1318
 2254  533 1656 1401 2274 2779 2363  772   15 1387  899 2605   55  368
 1452 1472 1196 1476 2684 2118 2133 1762 1924 2454  913  621  230 1807
 2450 

In [30]:
df.head()

Unnamed: 0,text,category,data_type
1,Dorian Gray with Rainbow Scarf #LoveWins (from...,0,train
2,@SelectShowcase @Tate_StIves ... Replace with ...,0,train
3,@Sofabsports thank you for following me back. ...,0,train
4,@britishmuseum @TudorHistory What a beautiful ...,0,train
5,@NationalGallery @ThePoldarkian I have always ...,0,train


In [31]:
df.groupby(['category', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
category,data_type,Unnamed: 2_level_1
0,train,966
0,val,171
1,train,182
1,val,32
2,train,48
2,val,9
3,train,5
3,val,1
4,train,27
4,val,5


In [32]:
df

Unnamed: 0,text,category,data_type
1,Dorian Gray with Rainbow Scarf #LoveWins (from...,0,train
2,@SelectShowcase @Tate_StIves ... Replace with ...,0,train
3,@Sofabsports thank you for following me back. ...,0,train
4,@britishmuseum @TudorHistory What a beautiful ...,0,train
5,@NationalGallery @ThePoldarkian I have always ...,0,train
...,...,...,...
3078,@_TheWhitechapel @Campaignforwool @SlowTextile...,1,train
3079,“@britishmuseum: Thanks for ranking us #1 in @...,0,train
3080,MT @AliHaggett: Looking forward to our public ...,0,train
3082,@MrStuchbery @britishmuseum Mesmerising.,0,train


In [33]:
from transformers import BertTokenizer





###### "TensorDataset" is a PyTorch dataset wrapper that allows you to create a dataset from a list of tensors. It's commonly used for creating datasets for training and validation in PyTorch.


In [34]:
from torch.utils.data import TensorDataset

In [35]:
tokenizer=BertTokenizer.from_pretrained("bert-base-uncased")

In [65]:
tokenizer.vocab_size

30522

In [36]:
# Filter the DataFrame based on the condition
train_texts = df[df["data_type"] == 'train'].text.values

In [37]:
len(train_texts)

1258

# Preparing Training and Validation Dataset

###### Prepare training , validation data  --> convert reviews into tokens --> convert tokens into token ids --> set max len --> creating padding --> all this will done by tokenizer.encode_plus


In [38]:
encoded_train_data=tokenizer.batch_encode_plus(train_texts,
                                              add_special_tokens=True,
                                              return_attention_mask=True,
                                              pad_to_max_length=True,
                                              max_length=256,
                                              return_tensors="pt"
                                              )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [39]:
print(encoded_train_data)

{'input_ids': tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
        [  101,  1030, 27034,  ...,     0,     0,     0],
        [  101,  1030, 10682,  ...,     0,     0,     0],
        ...,
        [  101, 11047,  1030,  ...,     0,     0,     0],
        [  101,  1030,  3680,  ...,     0,     0,     0],
        [  101,  1030,  2120,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [40]:
encoded_val_data=tokenizer.batch_encode_plus(df[df["data_type"]=="val"].text.values,
                                            add_special_tokens=True,
                                            return_attention_mask=True,
                                            pad_to_max_length=True,
                                            max_length=256,
                                            return_tensors="pt")

In [41]:
input_train_ids=torch.tensor(encoded_train_data["input_ids"])
train_attention_mask=torch.tensor(encoded_train_data["attention_mask"])
train_labels=torch.tensor(df[df.data_type=="train"].category.values)

  input_train_ids=torch.tensor(encoded_train_data["input_ids"])
  train_attention_mask=torch.tensor(encoded_train_data["attention_mask"])


In [42]:
print(input_train_ids)

tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
        [  101,  1030, 27034,  ...,     0,     0,     0],
        [  101,  1030, 10682,  ...,     0,     0,     0],
        ...,
        [  101, 11047,  1030,  ...,     0,     0,     0],
        [  101,  1030,  3680,  ...,     0,     0,     0],
        [  101,  1030,  2120,  ...,     0,     0,     0]])


In [43]:
print(train_attention_mask)

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [44]:
input_val_ids=torch.tensor(encoded_val_data["input_ids"])
val_attention_mask=torch.tensor(encoded_val_data["attention_mask"])
val_labels=torch.tensor(df[df.data_type=="val"].category.values)

  input_val_ids=torch.tensor(encoded_val_data["input_ids"])
  val_attention_mask=torch.tensor(encoded_val_data["attention_mask"])


In [45]:
print(input_train_ids.shape)
print(train_attention_mask.shape)
print(train_labels.shape)

torch.Size([1258, 256])
torch.Size([1258, 256])
torch.Size([1258])


In [46]:
print(input_val_ids.shape)
print(val_attention_mask.shape)
print(val_labels.shape)

torch.Size([223, 256])
torch.Size([223, 256])
torch.Size([223])


In [47]:
print(val_labels)

tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 2, 4, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 4, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 4, 0, 2, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 2, 1, 1, 0, 3, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        5, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 4, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0,
        5, 0, 0, 0, 0, 0, 0])


In [48]:
train_dataset=TensorDataset(input_train_ids,
                            train_attention_mask,
                            train_labels)

In [49]:
val_dataset=TensorDataset(input_val_ids,
                          val_attention_mask,
                          val_labels)

In [50]:
print(len(train_dataset))
print(len(val_dataset))

1258
223


In [51]:
train_dataset.tensors

(tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
         [  101,  1030, 27034,  ...,     0,     0,     0],
         [  101,  1030, 10682,  ...,     0,     0,     0],
         ...,
         [  101, 11047,  1030,  ...,     0,     0,     0],
         [  101,  1030,  3680,  ...,     0,     0,     0],
         [  101,  1030,  2120,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 0,  ..., 0, 0, 1]))

# Setting BERT Pre-trained Model

In [52]:
from transformers import BertForSequenceClassification

In [53]:
Model=BertForSequenceClassification.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Creating Data Loaders

In [54]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [55]:
train_dataloaders=DataLoader(train_dataset,
                            batch_size=64,
                            sampler=RandomSampler(train_dataset))

In [56]:
val_dataloaders=DataLoader(val_dataset,
                          batch_size=32,
                          sampler=RandomSampler(val_dataset))

# Setting Up Optimizer and Scheduler

In [57]:
from transformers import AdamW,get_linear_schedule_with_warmup

2024-01-20 22:55:32.092581: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-20 22:55:32.694849: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [58]:
optimizer=AdamW(Model.parameters(),
               lr=1e-5,
               eps=1e-8)



In [59]:
epochs=10
scheduler=get_linear_schedule_with_warmup(optimizer,
                                         num_warmup_steps=0,
                                         num_training_steps=len(train_dataloaders)*epochs)

In [60]:
from sklearn.metrics import f1_score

In [61]:
def f1_score_func(preds,labels):
    preds_flat=np.argmax(preds,axis=1).flatten()
    labels_flat=labels.flatten()
    return f1_score_func(labels_flat,preds_flat,average="weighted")

In [62]:
def accuracy_per_class(preds,labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

# Creating Training Loop

In [63]:
import random

In [64]:
seed_val=100
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [709]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
Model.to(device)
print(device)

cpu


In [710]:
def evaluate(val_dataloaders):

    Model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(val_dataloaders):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(val_dataloaders) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals



In [711]:
from tqdm.notebook import tqdm

In [None]:
for epoch in tqdm(range(1, epochs+1)):
    Model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(train_dataloaders, 
                        desc='Epoch {:1d}'.format(epoch), 
                        leave=False, 
                        disable=False)
    
    for batch in progress_bar:
        Model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = Model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(Model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})     
    
    #torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')
    
    tqdm.write('\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(train_dataloaders)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(val_dataloaders)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/20 [00:00<?, ?it/s]

In [1]:
accuracy_per_class(predictions, true_vals)

NameError: name 'accuracy_per_class' is not defined

In [None]:
help(get_linear_schedule_with_warmup)