In [1]:
!pip install torch



In [2]:
import torch
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv('smile-annotations-final.csv',
                names=['id', 'text', 'category'])
df.set_index('id', inplace=True)

In [4]:
df

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy
...,...,...
613678555935973376,MT @AliHaggett: Looking forward to our public ...,happy
613294681225621504,@britishmuseum Upper arm guard?,nocode
615246897670922240,@MrStuchbery @britishmuseum Mesmerising.,happy
613016084371914753,@NationalGallery The 2nd GENOCIDE against #Bia...,not-relevant


In [5]:
set(df.category)

{'angry',
 'disgust',
 'disgust|angry',
 'happy',
 'happy|sad',
 'happy|surprise',
 'nocode',
 'not-relevant',
 'sad',
 'sad|angry',
 'sad|disgust',
 'sad|disgust|angry',
 'surprise'}

In [6]:
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [7]:
possible_labels=df.category.unique()

In [8]:
possible_labels

array(['nocode', 'happy', 'not-relevant', 'angry', 'disgust|angry',
       'disgust', 'happy|surprise', 'sad', 'surprise', 'happy|sad',
       'sad|disgust', 'sad|angry', 'sad|disgust|angry'], dtype=object)

In [9]:
df = df[df.category.isin(['happy', 'not-relevant', 'angry', 'surprise', 'sad', 'disgust'])] #change

In [10]:
possible_labels=df.category.unique()

In [11]:
possible_labels

array(['happy', 'not-relevant', 'angry', 'disgust', 'sad', 'surprise'],
      dtype=object)

In [12]:
# we defien number for each category

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [13]:
label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [14]:
df.category = df['category'].map(label_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.category = df['category'].map(label_dict)


In [15]:
df.category.unique()

array([0, 1, 2, 3, 4, 5])

In [16]:
df.head(15)


Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,0
614877582664835073,@Sofabsports thank you for following me back. ...,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,0
614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,0
613601881441570816,Yr 9 art students are off to the @britishmuseu...,0
613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,1
610746718641102848,#AskTheGallery Have you got plans to privatise...,1
612648200588038144,@BarbyWT @britishmuseum so beautiful,0


## 2: Training/Validation Split

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.category.values,
                                                  test_size=0.15,
                                                  random_state=42,
                                                  stratify=df.category.values)     #  based on row associated with category

In [19]:
len(df)

1481

In [20]:
df['data_type'] = ['not_set']*df.shape[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['data_type'] = ['not_set']*df.shape[0]


In [21]:
df.head()

Unnamed: 0_level_0,text,category,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,0,not_set
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,0,not_set
614877582664835073,@Sofabsports thank you for following me back. ...,0,not_set
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,0,not_set
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,0,not_set


In [22]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [23]:
df.head(15)

Unnamed: 0_level_0,text,category,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,0,train
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,0,train
614877582664835073,@Sofabsports thank you for following me back. ...,0,train
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,0,train
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,0,train
614499696015503361,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,0,train
613601881441570816,Yr 9 art students are off to the @britishmuseu...,0,train
613696526297210880,@RAMMuseum Please vote for us as @sainsbury #s...,1,val
610746718641102848,#AskTheGallery Have you got plans to privatise...,1,train
612648200588038144,@BarbyWT @britishmuseum so beautiful,0,train


In [24]:
df.groupby(['category', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
category,data_type,Unnamed: 2_level_1
0,train,966
0,val,171
1,train,182
1,val,32
2,train,48
2,val,9
3,train,5
3,val,1
4,train,27
4,val,5


In [None]:
# Now we are encoding text to token by using bert tokenizer

# 3. Loading Tokenizer and Encoding our Data

In [25]:
!pip install transformers



In [26]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [27]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', #bert-base-uncased using small bert model for simple data , bert-large-uncased fo large data
    do_lower_case=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Now we send our train test data to tokenize

In [28]:
encoded_data_train = tokenizer.batch_encode_plus(

    df[df.data_type=='train'].text.values,        # IV  here which is text
    add_special_tokens=True,                       # for special token
    return_attention_mask=True,                     # for imp word
    pad_to_max_length=True,                          # maxm  padding
    max_length=256,                                   # max length of sentence
    return_tensors='pt'                      # pt is particular dataformat for tensor , bert accept data interm of tensor
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# through that token we require 'input_ids'  'attention_mask'

In [29]:

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].category.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].category.values)

In [30]:
input_ids_train

tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
        [  101,  1030, 27034,  ...,     0,     0,     0],
        [  101,  1030, 10682,  ...,     0,     0,     0],
        ...,
        [  101, 11047,  1030,  ...,     0,     0,     0],
        [  101,  1030,  3680,  ...,     0,     0,     0],
        [  101,  1030,  2120,  ...,     0,     0,     0]])

In [None]:
# in tensor dataset our whole info for training  , val get store in one variable

In [31]:
#TensorDataset create a single variable which stores info of inputids , attentionmsk, labels

dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val,
                            attention_masks_val,
                           labels_val)

In [32]:
len(dataset_train)

1258

In [33]:
dataset_train.tensors

(tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
         [  101,  1030, 27034,  ...,     0,     0,     0],
         [  101,  1030, 10682,  ...,     0,     0,     0],
         ...,
         [  101, 11047,  1030,  ...,     0,     0,     0],
         [  101,  1030,  3680,  ...,     0,     0,     0],
         [  101,  1030,  2120,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 0,  ..., 0, 0, 1]))

In [None]:
# Now we converted our text into number
# now we require bert classification model

# 4. Setting up BERT Pretrained Model

In [34]:
from transformers import BertForSequenceClassification         # classification bert model

In [35]:
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased',
                                      num_labels = len(label_dict),#6       # <-- here we have to mention our count dv category
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# now inorder to  pass data interm of particular bert we require data_loaders , it load data batchwise and load data randomly

# to feed our data to model we require data loaders

# 5. Creating Data Loaders

In [36]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [37]:
batch_size = 4

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32
)

# 6. Setting Up Optimizer and Scheduler

In [38]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [39]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)



In [40]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(
    optimizer,                                           # here it is try to give best weight
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

# 7. Defining our Performance Metrics

In [41]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# now we have to use prewritten code

In [42]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [43]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

# 8. Creating our Training Loop

In [44]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [45]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [46]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [47]:
for epoch in tqdm(range(1, epochs+1)):
    model.train() #forward propagation
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()
        loss.backward() #backwardprop

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    torch.save(model, f'BERT_ft_Epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')


  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.7577714336178606


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5771544235093253
F1 Score (weighted): 0.7846916359543217


Epoch 2:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.4796300903760961


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5118397559438433
F1 Score (weighted): 0.802006915437987


Epoch 3:   0%|          | 0/315 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.3640910215409739


  0%|          | 0/7 [00:00<?, ?it/s]

Validation loss: 0.5850088042872292
F1 Score (weighted): 0.8205051240125366


In [None]:
# always use model which has best f1 score

# Evalution

In [48]:
headline="The sun slowly descended behind the mountains, casting a warm golden hue across the tranquil lake."

In [49]:
from transformers import BertTokenizer

In [50]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

In [51]:
device = torch.device('cpu')

In [52]:
print(device)

cpu


In [53]:
encoded_headline = tokenizer(headline, return_tensors = 'pt')

In [54]:
encoded_headline

{'input_ids': tensor([[  101,  1996,  3103,  3254,  9287,  2369,  1996,  4020,  1010,  9179,
          1037,  4010,  3585, 20639,  2408,  1996, 25283, 26147,  2697,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# we require two things Ip id  and attension mask # bcoz we train it one that two thing
# we require model to predict from these

In [55]:
input_ids = encoded_headline['input_ids'].to(device)
attention_msk = encoded_headline['attention_mask'].to(device)

In [56]:
# take the model which have best f1 score

In [58]:
'/content/BERT_ft_Epoch3.model'   # <-- best model

'/content/BERT_ft_Epoch3.model'

In [59]:
path = '/content/BERT_ft_Epoch3.model'
model = torch.load(path, map_location = torch.device('cpu'))

In [60]:
model   # internally model train on huge parameter , it create various layer bcoz it is pretrained archeiture

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# in model we sending our ip id and attention mask which will performe good role to predict

In [61]:
model_output = model(input_ids,attention_msk)

In [62]:
model_output

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.0402,  0.2587, -1.2243, -0.8876, -0.6022, -0.8207]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
#  our final output it is in  "logits" so we take only logits

In [63]:
model_output_tensor = torch.tensor(model_output.logits)

  model_output_tensor = torch.tensor(model_output.logits)


In [64]:
model_output_tensor  # which ever have higher val these label is our final op

tensor([[ 2.0402,  0.2587, -1.2243, -0.8876, -0.6022, -0.8207]])

In [65]:
model_output_tensor_categoryIndex = int(torch.argmax(model_output_tensor))

In [66]:
model_output_tensor_categoryIndex

0

In [67]:
classes = { 0:'happy' ,
 1:'not-relevant',
  2:'angry',
  3:'disgust',
  4:'sad',
  5:'surprise'}

In [68]:
classes[model_output_tensor_categoryIndex]

'happy'