#Build dataset

In [5]:
import pandas as pd
import re
import string
from sklearn.model_selection import train_test_split

In [6]:
dataset_path = "/content/drive/My Drive/coding/data/unique_tweets_7k.csv"

In [7]:
def load_dataset(path):
  data = pd.read_csv(path)
  tweet = data.drop_duplicates(subset=["text"])
  tweet = tweet[["text","sentiment"]].dropna()
  return tweet

In [8]:
def data_cleaning(df):
  def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)
  
  def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'',text)
  
  def remove_emoji(text):
    emoji_pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
  
  def remove_punct(text):
        table = str.maketrans('', '', string.punctuation)
        return text.translate(table)

  def remove_multi_spaces(text):
        space = re.compile(' +')
        line = re.compile('\n')
        return space.sub(r' ', line.sub(r' ', text))

  def remove_hashtags_mentions(text):
        hashtags = re.compile(r"^#\S+|\s#\S+")
        mentions = re.compile(r"^@\S+|\s@\S+")
        text = hashtags.sub(' hashtag', text)
        text = mentions.sub(' entity', text)
        return text.strip().lower()

  df.text = df.text.apply(lambda x: remove_URL(x))
  df.text = df.text.apply(lambda x: remove_html(x))
  df.text = df.text.apply(lambda x: remove_emoji(x))
  df.text = df.text.apply(lambda x: remove_punct(x))
  df.text = df.text.apply(lambda x: remove_multi_spaces(x))
  df.text = df.text.apply(lambda x: remove_hashtags_mentions(x))
  return df

In [9]:
def balance_data(df):
  df = df.drop(df.query('sentiment == 0').sample(frac=0.7).index)
  df = df.drop(df.query('sentiment == 4').sample(frac=0.6).index)
  return df

In [10]:
def set_split(df, test_size = 0.2):
    train, test = train_test_split(df, test_size = test_size, random_state = 42)
    return train, test

In [11]:
def prepare_train_test_from_file(path):
    tweets = load_dataset(path)
    tweets = data_cleaning(tweets)
    tweets = balance_data(tweets)
    return set_split(tweets)

In [12]:
tweets = load_dataset(dataset_path)
tweets = data_cleaning(tweets)
tweets = balance_data(tweets)
train, test = set_split(tweets)

In [13]:
train.head()

Unnamed: 0,text,sentiment
3412,what has happened to the userid userid wear a ...,1.0
1496,holy fuck this day got bad canceled debit card...,2.0
2192,lol my tl real quiet today since lebron won hi...,1.0
1239,fuck you believe bill down the do blessed brot...,0.0
612,anybody wanna go half on rent and fuck afterwo...,0.0


In [14]:
len(train)

1286

In [15]:
len(test)

322

In [41]:
train = pd.DataFrame(train).reset_index()

In [44]:
test = pd.DataFrame(test).reset_index()

In [61]:
test['sentiment'] = test['sentiment'].astype(int)

In [62]:
train['sentiment'] = train['sentiment'].astype(int)

#load tokenizer adn encoding data

In [2]:
!pip install transformers
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [46]:
# Print the original sentence.
print(' Original: ', train.text[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(train.text[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train.text[0])))

 Original:  what has happened to the userid userid wear a damn mask and respect the health of those around you userid stop the spread of covid userid i bet someone in this pic has it userid sad url
Tokenized:  ['what', 'has', 'happened', 'to', 'the', 'user', '##id', 'user', '##id', 'wear', 'a', 'damn', 'mask', 'and', 'respect', 'the', 'health', 'of', 'those', 'around', 'you', 'user', '##id', 'stop', 'the', 'spread', 'of', 'co', '##vid', 'user', '##id', 'i', 'bet', 'someone', 'in', 'this', 'pic', 'has', 'it', 'user', '##id', 'sad', 'ur', '##l']
Token IDs:  [2054, 2038, 3047, 2000, 1996, 5310, 3593, 5310, 3593, 4929, 1037, 4365, 7308, 1998, 4847, 1996, 2740, 1997, 2216, 2105, 2017, 5310, 3593, 2644, 1996, 3659, 1997, 2522, 17258, 5310, 3593, 1045, 6655, 2619, 1999, 2023, 27263, 2038, 2009, 5310, 3593, 6517, 24471, 2140]


In [20]:
import torch

In [48]:
max_len = 0

# For every sentence...
for sent in train.text:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  167


In [63]:
encoded_data_train = tokenizer.batch_encode_plus(
    train.text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensor ='pt'
)

encoded_data_test = tokenizer.batch_encode_plus(
    test.text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword arguments {'return_tensor': 'pt'} not recognized.
Keyword argume

In [64]:
input_ids_train = torch.tensor(encoded_data_train['input_ids'])
attention_masks_train = torch.tensor(encoded_data_train['attention_mask'])
labels_train = torch.tensor(train.sentiment.values)

input_ids_test = torch.tensor(encoded_data_test['input_ids'])
attention_masks_test = torch.tensor(encoded_data_test['attention_mask'])
labels_test = torch.tensor(test.sentiment.values)

  input_ids_test = torch.tensor(encoded_data_test['input_ids'])
  attention_masks_test = torch.tensor(encoded_data_test['attention_mask'])


In [65]:
dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train)

dataset_test = TensorDataset(input_ids_test, 
                            attention_masks_test,
                           labels_test)

In [66]:
dataset_test.tensors

(tensor([[ 101, 5310, 3593,  ...,    0,    0,    0],
         [ 101, 5310, 3593,  ...,    0,    0,    0],
         [ 101, 3109, 2030,  ...,    0,    0,    0],
         ...,
         [ 101, 2748, 2138,  ...,    0,    0,    0],
         [ 101, 2023, 2003,  ...,    0,    0,    0],
         [ 101, 2681, 2033,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([1, 1, 0, 0, 2, 0, 0, 1, 4, 0, 4, 4, 4, 4, 4, 0, 0, 1, 4, 4, 0, 4, 1, 1,
         4, 0, 3, 4, 4, 0, 4, 3, 2, 1, 4, 4, 0, 0, 2, 1, 1, 4, 0, 4, 1, 0, 0, 0,
         4, 4, 0, 0, 1, 4, 0, 0, 1, 1, 4, 0, 4, 4, 2, 0, 0, 1, 1, 1, 1, 2, 4, 4,
         1, 1, 4, 0, 1, 4, 3, 4, 1, 4, 4, 1, 2, 1, 0, 2, 4, 2, 4, 3, 1, 4, 2, 2,
         0, 4, 1, 0, 1, 2, 1, 4, 4, 4, 0, 4, 1, 0, 1, 1, 1, 4, 0, 4, 0, 4, 1, 0,
         0, 2, 0, 2, 4, 4, 0, 4, 2,

#build BERT pretrained model

In [67]:
from transformers import BertForSequenceClassification

In [72]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 5,
    output_attentions = False,
    output_hidden_states = False
)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [73]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [74]:
batch_size = 4

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_test = DataLoader(
    dataset_test,
    sampler=RandomSampler(dataset_test),
    batch_size=batch_size
)

In [75]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [76]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)



In [77]:
epochs = 10

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

In [86]:
import numpy as np
from sklearn.metrics import f1_score

In [87]:
def f1_score_func(preds, labels):
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return f1_score(labels_flat, preds_flat, average='weighted')

In [79]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [80]:
from tqdm.notebook import tqdm

In [83]:
import numpy as np

In [84]:
def evaluate(dataloader_test):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_test):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_test) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [88]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train, 
                        desc='Epoch {:1d}'.format(epoch), 
                        leave=False, 
                        disable=False)
    
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})     
    
    #torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')
    
    tqdm.write('\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_test)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.32852701671994206


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.43048501943157597
F1 Score (weighted): 0.8606270834623789


Epoch 2:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.19206949663406342


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.5294446125540331
F1 Score (weighted): 0.8728874967801652


Epoch 3:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.1070070486701591


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.5402724408159424
F1 Score (weighted): 0.878518840803957


Epoch 4:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.058221638900074556


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.5660974960026826
F1 Score (weighted): 0.8952998787609494


Epoch 5:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.03216147205433721


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.5932992152283313
F1 Score (weighted): 0.8890191373063946


Epoch 6:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.018764605679511676


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.6111976297879103
F1 Score (weighted): 0.8795464131598585


Epoch 7:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.015719101481538798


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.6306773815214214
F1 Score (weighted): 0.8742190351381632


Epoch 8:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.010863384865340053


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.634346254948199
F1 Score (weighted): 0.8738242384091961


Epoch 9:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.007728069440406741


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.6344012852595045
F1 Score (weighted): 0.8738242384091961


Epoch 10:   0%|          | 0/322 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.007198209174936175


  0%|          | 0/81 [00:00<?, ?it/s]

Validation loss: 0.6343447566804918
F1 Score (weighted): 0.8738242384091961


In [91]:
sentiments_dict = {0: "positive", 1: "sad", 2: "angry", 3: "fear", 4: "sarcasm"}

In [94]:
def accuracy_per_class(preds, labels):
    #label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {sentiments_dict[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [95]:
accuracy_per_class(predictions, true_vals)

Class: positive
Accuracy:65/79

Class: sad
Accuracy:74/86

Class: angry
Accuracy:22/33

Class: fear
Accuracy:16/19

Class: sarcasm
Accuracy:104/105

