# BERT - Twitter Sentiment Classifier

In [None]:
# Mount Google Drive at /content/drive so files under that path can be read and written from this session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 59.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 71.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
from transformers import BertTokenizer, BertConfig,AdamW, BertForSequenceClassification,get_linear_schedule_with_warmup


In [None]:

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import accuracy_score,matthews_corrcoef

from tqdm import tqdm, trange,tnrange,tqdm_notebook
import random
import os
import io


In [None]:
# One fixed seed shared by every random number generator used in this notebook.
SEED = 19

# Run on the GPU when CUDA is available and on the CPU otherwise; the training
# loop later moves each batch onto this device.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
n_gpu = torch.cuda.device_count()
# torch.cuda.get_device_name(0)

# Seed PyTorch, NumPy and Python's own generator, plus every CUDA device when on GPU.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)



In [None]:
# Location of the tweet dataset inside the Colab runtime.
DATA_PATH = "/content/fullData.csv"
df_train = pd.read_csv(DATA_PATH)

In [None]:
# Count the missing values in each column.
df_train.isnull().sum()

username     0
age          0
depressed    0
count        0
tweets       0
dtype: int64

#### Observation - Requires data cleaning

In [None]:
# Preview the first rows of the dataset.
df_train.head()

Unnamed: 0,username,age,depressed,count,tweets
0,MadsSheahan,21,0,667,hollywood insider post reliable gossip celebri...
1,squeakehsaurus,24,1,126,thats big old yikes see later babe sorry life ...
2,lauren_paige100,15,1,194,something customers come my work make me my jo...
3,Shamzz92,20,1,1156,best feel cap get attention fahima smile face ...
4,ChrisDooks,30,0,146,awesome news well do use hcx face vmotion erro...


## Target Distribution

In [None]:
# List the distinct values of the target column.
df_train['depressed'].unique()

array([0, 1])

In [None]:
# Count how many rows fall into each target class.
df_train['depressed'].value_counts()

1    378
0    307
Name: depressed, dtype: int64

# Data cleaning

### Ignoring the null values

In [None]:
# Keep only the rows that have a target value.
df_train = df_train.dropna(subset=['depressed'])

In [None]:
# Keep only the rows that have tweet text.
df_train = df_train.dropna(subset=['tweets'])

# Target Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of target values, then store their integer codes in a new `category_1` column.
labelencoder = LabelEncoder()
labelencoder.fit(df_train['depressed'])
df_train['category_1'] = labelencoder.transform(df_train['depressed'])

In [None]:
# Show each distinct (depressed, category_1) pair to check how the target was encoded.
df_train[['depressed','category_1']].drop_duplicates(keep='first')

Unnamed: 0,depressed,category_1
0,0,0
1,1,1


In [None]:
# The encoded target is referred to as `label` from here on.
df_train = df_train.rename(columns={'category_1': 'label'})

# Data Preparation for BERT model

In [None]:
## Tweet text of every row, as an array
sentences = df_train["tweets"].values

# Show how many examples carry each label
print("Distribution of data based on labels: ",df_train["label"].value_counts())

# Sequence length, in tokens, that every example is padded or truncated to when it is encoded.
# For reference, the original BERT paper used a length of 512.
MAX_LEN = 256

## Load the BERT tokenizer for the `bert-base-uncased` checkpoint; it converts text into the token ids the model expects
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Distribution of data based on labels:  1    378
0    307
Name: label, dtype: int64


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Encode every text into exactly MAX_LEN token ids: special tokens are added, longer
# texts are truncated and shorter ones are padded up to MAX_LEN.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True` argument.
input_ids = [
    tokenizer.encode(
        sent,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
    )
    for sent in sentences
]



In [None]:

labels = df_train.label.values

print("Actual sentence before tokenization: ",sentences[2])
print("Encoded Input from dataset: ",input_ids[2])

## Create attention mask
# 1.0 marks a real token and 0.0 marks padding. The comparison uses the tokenizer's own
# padding id rather than assuming that padding is always encoded as 0.
pad_id = tokenizer.pad_token_id
attention_masks = [[float(token_id != pad_id) for token_id in seq] for seq in input_ids]
print(attention_masks[2])

Actual sentence before tokenization:  something customers come my work make me my job sit right me movie hit little hard me grimace face good wrong face tear joyskull opeupside face nobody me every night look wed dress pinterest although i never boyfrientongue stick cheeky playful blow raspberry eyemoutheye one win sparkle chris evans john mulaney two men i would never question permanently pass vibe check sparkle agree thank much fold handslight skin tone since i clinically diagnose depression i say i go big sad let me tell big sad literally kill me happy face smiley please sign donate anything i think everyone i wanna go lake plead facebackhand index point rightbackhand index point leave recur mood i know we really know i hope everything go okayred heart red heart i accept my fate point me oh i get give me plenty time get stuff do my brain ha dumb bitch gonna drive around work listen sad music me brain uh cause i fuckin say happy face smiley hm i felt nobody literally nobody me litera

In [None]:
# Split ids, masks and labels in ONE call so the three stay aligned row-for-row by
# construction (instead of relying on two separate calls sharing a random_state).
# 10% of the examples are held out for validation.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=41, test_size=0.1)

In [None]:
# convert all our data into torch tensors, required data type for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs,train_masks,train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,sampler=train_sampler,batch_size=batch_size)

validation_data = TensorDataset(validation_inputs,validation_masks,validation_labels)
validation_sampler = RandomSampler(validation_data)
validation_dataloader = DataLoader(validation_data,sampler=validation_sampler,batch_size=batch_size)

### Let's see what the training data looks like

In [None]:
# Inspect the first training example: a tuple of (input ids, attention mask, label).
train_data[0]

(tensor([  101,  2026,  6045,  2417,  2540,  2417,  2540,  2417,  2540,  2417,
          2540,  6084,  9126, 10376,  4904,  2033, 27767, 21007,  2033,  9333,
          6279, 19707,  4183, 22794,  6826, 13874,  4757,  2034,  9333,  1058,
          2078,  5148, 13469,  2102, 13129, 13469,  2102, 13129, 10126,  1045,
          2467,  3422,  2417,  2540,  2417,  2540,  2417,  2540,  2204,  5302,
          6826,  2075,  2868,  2227,  2540,  2159,  6491, 16281,  2227,  2540,
          2159,  6491, 16281,  2227,  2540,  2159,  6491, 16281,  2227,  2540,
          3239,  6084,  9126, 10376,  4904,  2033,  9333,  6279,  4523,  4183,
          8502,  2026,  3336,  2417,  2540,  2417,  2540,  2417,  2540,  1045,
          2113,  2417,  2540,  2417,  2540,  2417,  2540,  1045,  3080,  2095,
          9928,  5390,  2227, 23743, 18718,  5390,  2227, 23743, 18718,  5390,
          2227,  2033, 27767, 21007,  2293,  3124,  2651,  2026,  5798,  1045,
          4299,  1045,  2272,  5470,  3113, 22794,  

## Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
# The classification head needs exactly one output per class in the data. The target
# here is binary (labels 0 and 1), so the count is taken from the label column instead
# of being hard-coded; an extra output would be a class no example can be labelled with.
num_labels = int(df_train['label'].nunique())
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels).to(device)

# Parameters:
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# The scheduler is stepped once per training batch, so the schedule spans every batch of every epoch.
num_warmup_steps = 0
num_training_steps = len(train_dataloader)*epochs

### In Transformers, the optimizer and the learning-rate schedule are created separately:
optimizer = AdamW(model.parameters(), lr=lr,eps=adam_epsilon,correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
# Linear schedule with warmup; with no warmup steps the learning rate goes from `lr` down to 0 over `num_training_steps`.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Training & Inference 

In [None]:
## Store the average training loss and the learning rate of every epoch for plotting
train_loss_set = []
learning_rate = []

# Gradients accumulate by default, so clear anything left over before training starts
model.zero_grad()

# trange is a tqdm wrapper around the normal python range
for epoch in trange(1,epochs+1,desc='Epoch'):
  print("<" + "="*22 + F" Epoch {epoch} "+ "="*22 + ">")
  # Running sum of the per-batch training losses for this epoch
  batch_loss = 0

  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  for batch in train_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch

    # Forward pass; because labels are passed, the first output is the loss
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]

    # Backward pass
    loss.backward()

    # Clip the norm of the gradients to 1.0
    # Gradient clipping is not in AdamW anymore
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update learning rate schedule
    scheduler.step()

    # Clear the previous accumulated gradients
    optimizer.zero_grad()

    # Update tracking variables
    batch_loss += loss.item()

  # Calculate the average loss over the training data.
  avg_train_loss = batch_loss / len(train_dataloader)

  #store the current learning rate
  for param_group in optimizer.param_groups:
    print("\n\tCurrent Learning rate: ",param_group['lr'])
    learning_rate.append(param_group['lr'])

  train_loss_set.append(avg_train_loss)
  print(F'\n\tAverage Training loss: {avg_train_loss}')

  # Validation

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()

  # Labels and predictions of EVERY validation batch of this epoch. Collecting them
  # first lets the metrics (and df_metrics) describe the whole validation set, not
  # just whichever batch happened to be processed last.
  epoch_labels, epoch_preds = [], []

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = outputs[0].to('cpu').numpy()
    label_ids = b_labels.to('cpu').numpy()

    # The predicted class is the one with the largest logit
    epoch_preds.append(np.argmax(logits, axis=1).flatten())
    epoch_labels.append(label_ids.flatten())

  labels_flat = np.concatenate(epoch_labels)
  pred_flat = np.concatenate(epoch_preds)

  # One row per validation example, tagged with the epoch that produced the prediction.
  # After the loop finishes this holds the final epoch's predictions.
  df_metrics=pd.DataFrame({'Epoch':epoch,'Actual_class':labels_flat,'Predicted_class':pred_flat})

  # Score the full validation set at once; averaging per-batch scores would give a
  # small final batch the same weight as a full one.
  eval_accuracy = accuracy_score(labels_flat,pred_flat)
  eval_mcc_accuracy = matthews_corrcoef(labels_flat, pred_flat)

  print(F'\n\tValidation Accuracy: {eval_accuracy}')
  print(F'\n\tValidation MCC Accuracy: {eval_mcc_accuracy}')

  for _ in tnrange(1,epochs+1,desc='Epoch'):


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]


	Current Learning rate:  1.3333333333333333e-05

	Average Training loss: 0.7486691415309906

	Validation Accuracy: 0.75625

	Validation MCC Accuracy: 0.5362098852867465

	Current Learning rate:  6.666666666666667e-06

	Average Training loss: 0.4950724825263023

	Validation Accuracy: 0.90625

	Validation MCC Accuracy: 0.8210867974758118

	Current Learning rate:  0.0

	Average Training loss: 0.3361028768122196

	Validation Accuracy: 0.7937500000000001

	Validation MCC Accuracy: 0.5145878852163578


In [None]:
from sklearn.metrics import confusion_matrix,classification_report
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map on the current figure.

    cm        -- 2-D NumPy array of counts; rows are shown as true labels, columns as predicted labels.
    classes   -- sequence of class names used as the tick labels of both axes.
    normalize -- when True, every row is divided by its row sum before printing and
                 plotting, and the cells are rendered with two decimals.
    title     -- title of the plot.
    cmap      -- matplotlib colormap passed to `plt.imshow`.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the largest value get white text, the rest black text.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
## Display name of each class mapped to its integer label id
label2int = {name: idx for idx, name in enumerate(("Not depressed", "Depressed"))}

In [None]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:


# Empty results table; one row per evaluated classifier is added later.
RESULT_COLUMNS = ['classifiers', 'fpr', 'tpr', 'auc']
result_table = pd.DataFrame(columns=RESULT_COLUMNS)

In [None]:
# Compare the stored validation labels with the predicted classes.
# NOTE(review): `Predicted_class` holds hard class ids (argmax of the logits), not
# scores, so the ROC curve and AUC are computed from thresholded predictions —
# confirm this is intended.
y_true = df_metrics['Actual_class'].values
y_pred = df_metrics['Predicted_class'].values
fpr, tpr, _  = roc_curve(y_true, y_pred)
auc = roc_auc_score(y_true, y_pred)

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new row is
# built as a one-row frame and concatenated onto the table instead.
new_row = pd.DataFrame([{'classifiers':"BERTWeek",
                         'fpr':fpr,
                         'tpr':tpr,
                         'auc':auc}])
result_table = pd.concat([result_table, new_row], ignore_index=True)

In [None]:
# Use the classifier name as the row index of the results table.
result_table = result_table.set_index('classifiers')

In [None]:
# Order the class names by their integer id and pass the ids explicitly, so every name
# is tied to the right label even if a class is missing from the evaluated rows.
class_names = sorted(label2int, key=label2int.get)
class_ids = [label2int[name] for name in class_names]
print(classification_report(
    df_metrics['Actual_class'].values,
    df_metrics['Predicted_class'].values,
    labels=class_ids,
    target_names=class_names,
    digits=2,          # decimals shown in the report (previously tied to the number of classes by accident)
    zero_division=0,   # report 0 for a class that was never predicted, without emitting a warning
))

               precision    recall  f1-score   support

Not depressed       0.60      1.00      0.75         3
    Depressed       0.00      0.00      0.00         2

     accuracy                           0.60         5
    macro avg       0.30      0.50      0.37         5
 weighted avg       0.36      0.60      0.45         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Destination of the results table on the mounted Google Drive.
RESULTS_CSV = '/content/drive/MyDrive/Colab Notebooks/NLP_Project/result_full.csv'
result_table.to_csv(RESULTS_CSV)

In [None]:
# model_save_folder = 'model/'
# tokenizer_save_folder = 'tokenizer/'

# path_model = F'/kaggle/working/{model_save_folder}'
# path_tokenizer = F'/kaggle/working/{tokenizer_save_folder}'

# ##create the dir

# !mkdir -p {path_model}
# !mkdir -p {path_tokenizer}

# ### Now let's save our model and tokenizer to a directory
# model.save_pretrained(path_model)
# tokenizer.save_pretrained(path_tokenizer)

# model_save_name = 'fineTuneModel.pt'
# path = path_model = F'/kaggle/working/{model_save_folder}/{model_save_name}'
# torch.save(model.state_dict(),path);

# Conclusion

#### - Using a transfer-learning approach, we fine-tune a pretrained BERT model to classify each user's tweets as Depressed or Not depressed. Hope you find this kernel useful.

### Kindly upvote if you like it