Install all necessary dependencies and import necessary libraries

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import requests
import regex as re
from tqdm import tqdm

import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from google.colab import files

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

Download the kaggle dataset for twitter sentiments and unzip it into the data directory

In [3]:
!pip install -q kaggle
!mkdir ~/.kaggle/
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/kaggle/kaggle.json
!kaggle competitions download -c tweet-sentiment-extraction
!mkdir data
!unzip -n -d data tweet-sentiment-extraction.zip

mkdir: cannot create directory ‘/root/.kaggle/’: File exists
chmod: cannot access '/root/kaggle/kaggle.json': No such file or directory
tweet-sentiment-extraction.zip: Skipping, found more recently modified local copy (use --force to force download)
mkdir: cannot create directory ‘data’: File exists
Archive:  tweet-sentiment-extraction.zip


Download Sentiment140 dataset that only contains positive and negative samples

In [4]:
!mkdir -p data1
!wget -nc http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip -P data1
!unzip -n -d data1 /content/data1/trainingandtestdata.zip

--2022-09-29 00:04:23--  http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip [following]
--2022-09-29 00:04:23--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘data1/trainingandtestdata.zip’


2022-09-29 00:04:26 (33.2 MB/s) - ‘data1/trainingandtestdata.zip’ saved [81363704/81363704]

Archive:  /content/data1/trainingandtestdata.zip
  inflating: data1/testdata.manual.2009.06.14.csv  
  inflating: data1/training.1600000.processed.noemoticon.csv  


Build data preprocessing function to put the text into a format the model can learn from and put the outputs into a list format for BERT pretrained model.

In [5]:
TRAIN_CSV = '/content/data/train.csv'
TEST_CSV = '/content/data/test.csv'

In [6]:
def data_preprocess(df, text_col, label_col, num_labels, label_encodings=None):
  if label_encodings is not None:
    df[label_col] = df[label_col].apply(lambda x: label_encodings[x])
  df[text_col] = df[text_col].apply(lambda x: re.sub('@\w*', '', str(x)).strip())
  def build_list(x):
    res = [0 for i in range(num_labels)]
    res[x] = 1.0
    return res

  df[label_col] = df[label_col].apply(lambda x: build_list(x))
  return df

In [7]:
TEXT = 'text'
SENTIMENT = 'sentiment'

In [8]:
df = pd.read_csv(TRAIN_CSV).drop('selected_text', axis=1)
df = data_preprocess(df, 'text', 'sentiment', 3, {'negative':0, 'neutral':1, 'positive':2})
df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","[0, 1.0, 0]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,"[1.0, 0, 0]"
2,088c60f138,my boss is bullying me...,"[1.0, 0, 0]"
3,9642c003ef,what interview! leave me alone,"[1.0, 0, 0]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on th...","[1.0, 0, 0]"


Find information on the dataset to find the corrent length to set the BERT tokenizer to.

In [9]:
df['text'].apply(lambda x: len(x.split())).describe()

count    27481.000000
mean        12.892326
std          6.922524
min          0.000000
25%          7.000000
50%         12.000000
75%         18.000000
max         33.000000
Name: text, dtype: float64

Build the torch dataset and neural network architecture using the pretrained BERT model.

In [10]:
class TwitterSentimentDataset(Dataset):
  def __init__(self,
               text,
               polarity,
               max_len=64,
               model_name = 'bert-base-uncased'):
    self.text = text
    self.polarity = polarity
    self.max_len = max_len
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
  
  def __len__(self):
    return len(self.text)

  def __getitem__(self, index):
    input = self.tokenizer.encode_plus(text = self.text[index],
                                        add_special_tokens=True,
                                        padding='max_length',
                                        max_length = self.max_len,
                                        return_tensors='pt',
                                        truncation=True,
                                        return_attention_mask=True)
    output = self.polarity[index]
    return torch.LongTensor(input['input_ids']), torch.LongTensor(input['attention_mask']), torch.FloatTensor(output)

In [11]:
class TwitterSentimentModel(nn.Module):
  def __init__(self, model_name = 'bert-base-uncased', num_classes=3):
    super(TwitterSentimentModel, self).__init__()
    self.bert = BertModel.from_pretrained(model_name)
    self.dropout1 = nn.Dropout(0.3)
    self.dropout2 = nn.Dropout(0.3)
    self.lin = nn.Linear(768, 64)
    self.relu = nn.ReLU()
    self.classifier = nn.Linear(64, num_classes)
    self.softmax = nn.Softmax(-1)

  def forward(self, input, attention_mask):
    x = self.bert(input, attention_mask)[1]
    x = self.dropout1(x)
    x = self.lin(x)
    x = self.relu(x)
    x = self.dropout2(x)
    x = self.classifier(x)
    x = self.softmax(x)
    return x

Split the data into train and validation datasets so we can see how the model is performing during training and identify when it begins to overfit.

In [12]:
train, val = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

Initialize the torch datasets and wrap them into the DataLoader class wtih batch size of 64, 2 workers, shuffling the data on each epoch, and not dropping the remaining data after splitting into batches.

In [13]:
train_dataset = TwitterSentimentDataset(train[TEXT].tolist(), train[SENTIMENT].tolist())
val_dataset = TwitterSentimentDataset(val[TEXT].tolist(), val[SENTIMENT].tolist())

train_loader = DataLoader(train_dataset, batch_size=64, num_workers=2, shuffle=True, pin_memory=False, drop_last=False)
val_loader = DataLoader(val_dataset, batch_size=64, num_workers=2, shuffle=True, pin_memory=False, drop_last=False)

In [14]:
def read_data(data):
  '''
  Load the data into the cuda device
  '''
  return tuple(x.cuda() for x in data[0:-1]), data[-1].cuda()

In [15]:
def get_pred(data):
  '''
  Set the prediction to 1 if the prediction exceeds 0.5 probability
  '''
  results = []
  for d in data:
    temp = []
    for i in d:
      if i > 0.5:
        temp.append(1)
      else:
        temp.append(0)
    results.append(temp)
  return results

Train the neural network here

In [16]:
def train(model, train_loader, val_loader, epochs, learning_rate):
  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
  loss_fn = torch.nn.CrossEntropyLoss()

  loss_saves = []
  accuracy_saves = []

  for e in range(epochs):
    tbar = tqdm(train_loader)
    loss_temp = []
    acc_temp = []
    for batch, (X, mask, Y) in enumerate(tbar):
      data, target = read_data((X, mask, Y))
      optimizer.zero_grad()
      preds = model(data[0].squeeze(), data[1].squeeze())
      loss = loss_fn(preds, target)
      loss.backward()
      optimizer.step()

      loss_temp.append(loss.detach().cpu().numpy().ravel())

      predicts = get_pred(preds.detach().cpu().numpy())
      acc = accuracy_score(predicts, target.detach().cpu().numpy())
      acc_temp.append(acc)

      tbar.set_description('Epoch: %i  Loss: %f  Accuracy %f' % (e, np.round(np.mean(loss_temp), 4), np.round(np.mean(acc_temp), 4)))

    model.eval()
    val_loss_temp = []
    val_acc_temp = []
    vbar = tqdm(val_loader)
    with torch.no_grad():
      for batch, (X, mask, y) in enumerate(vbar):
        data, target = read_data((X, mask, y))
        preds = model(data[0].squeeze(), data[1].squeeze())
        loss = loss_fn(preds, target)
        val_loss_temp.append(loss.detach().cpu().numpy().ravel())
        predicts = get_pred(preds.detach().cpu().numpy())
        acc = accuracy_score(predicts, target.detach().cpu().numpy())
        val_acc_temp.append(acc)
        vbar.set_description('Epoch: %i  Val Loss: %f  Val Accuracy %f' % (e, np.round(np.mean(val_loss_temp), 4), np.round(np.mean(val_acc_temp), 4)))
    
    torch.save(model.state_dict(), '/content/checkpoints/' + str(e) + '_checkpoint.pt')

In [17]:
model = TwitterSentimentModel('bert-base-uncased')
model.cuda()
!mkdir checkpoints
train(model, train_loader, val_loader, 5, 0.000005)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


mkdir: cannot create directory ‘checkpoints’: File exists


Epoch: 0  Loss: 0.977800  Accuracy 0.409900: 100%|██████████| 344/344 [01:03<00:00,  5.42it/s]
Epoch: 0  Val Loss: 0.856400  Val Accuracy 0.732400: 100%|██████████| 86/86 [00:05<00:00, 17.02it/s]
Epoch: 1  Loss: 0.797800  Accuracy 0.776600: 100%|██████████| 344/344 [01:01<00:00,  5.56it/s]
Epoch: 1  Val Loss: 0.794500  Val Accuracy 0.765900: 100%|██████████| 86/86 [00:05<00:00, 16.86it/s]
Epoch: 2  Loss: 0.733600  Accuracy 0.829400: 100%|██████████| 344/344 [01:01<00:00,  5.57it/s]
Epoch: 2  Val Loss: 0.779400  Val Accuracy 0.771600: 100%|██████████| 86/86 [00:05<00:00, 16.99it/s]
Epoch: 3  Loss: 0.698200  Accuracy 0.860800: 100%|██████████| 344/344 [01:01<00:00,  5.57it/s]
Epoch: 3  Val Loss: 0.774700  Val Accuracy 0.774800: 100%|██████████| 86/86 [00:05<00:00, 16.98it/s]
Epoch: 4  Loss: 0.677900  Accuracy 0.878400: 100%|██████████| 344/344 [01:01<00:00,  5.58it/s]
Epoch: 4  Val Loss: 0.782100  Val Accuracy 0.766000: 100%|██████████| 86/86 [00:05<00:00, 16.95it/s]


Load the test / holdout dataset to evaluate how well our model performs

In [18]:
test_df = pd.read_csv(TEST_CSV)
test_df = data_preprocess(test_df, 'text', 'sentiment', 3, {'negative':0, 'neutral':1, 'positive':2})
test_df.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,"[0, 1.0, 0]"
1,96d74cb729,Shanghai is also really exciting (precisely --...,"[0, 0, 1.0]"
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...","[1.0, 0, 0]"
3,01082688c6,happy bday!,"[0, 0, 1.0]"
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,"[0, 0, 1.0]"


In [19]:
test_dataset = TwitterSentimentDataset(test_df[TEXT].tolist(), test_df[SENTIMENT].tolist())

test_loader = DataLoader(test_dataset, batch_size=64, num_workers=2, shuffle=True, pin_memory=False, drop_last=False)

In [20]:
test_acc = []
for (X, mask, y) in test_loader:
  data, target = read_data((X, mask, y))
  model.eval()
  with torch.no_grad():
    preds = model(data[0].squeeze(), data[1].squeeze())
    predicts = get_pred(preds.detach().cpu().numpy())
    acc = accuracy_score(predicts, target.detach().cpu().numpy())
    test_acc.append(acc)

In [21]:
print('The accuracy on the test set is: ' + str(np.round(np.mean(test_acc), 4)))

The accuracy on the test set is: 0.7769


Handwritten sentences to ensure the model works

In [25]:
x = ['This food is disgusting',
     'This app is very well made',
     'This repository looks good',
     'I never want to eat this food again',
     'This outfit is very ugly',
     'When I bought the level it worked really poorly for me',
     'Apple does not care about their customers anymore and the new product is proof of that',
     'Amazon works really hard and makes getting shipments much easier',
     'The food was good when the restaurant first opened but now it tastes really bad',
     'The phone sucks now but it used to be good but now it is bad again',
     'The guitar used to sound bad but now it sounds beautiful good amazing',
     'Are you stupid?',
     'You are so wrong it hurts my head',
     'Awesome I also talked to the ta and we are getting full credit on hw3 and hw4',
     'Sounds good. Also, I am going to go ahead and try to resubmit hw3 today after I do the test cases so let me know when someone approves the pr i throw up later',
     'I am going to go to the grocery store']
def run_model(x):
  decode = {0:'NEGATIVE', 1:'NEUTRAL', 2:'POSITIVE'}
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  input = tokenizer.encode_plus(text = x,
                                          add_special_tokens=True,
                                          padding='max_length',
                                          max_length = 64,
                                          return_tensors='pt',
                                          truncation=True,
                                          return_attention_mask=True)
  preds = model(input['input_ids'].cuda(), input['attention_mask'].cuda()).detach().cpu().numpy().ravel()
  p = np.argmax(preds)
  print(preds)
  return decode[p]

for i in x:
  print('Sentence ------- Prediction')
  print(i + ': ' + run_model(i))

Sentence ------- Prediction
[0.98529214 0.01268106 0.00202685]
This food is disgusting: NEGATIVE
Sentence ------- Prediction
[0.00109776 0.00397609 0.9949261 ]
This app is very well made: POSITIVE
Sentence ------- Prediction
[0.00152129 0.00410953 0.9943692 ]
This repository looks good: POSITIVE
Sentence ------- Prediction
[0.98225844 0.01502151 0.00272002]
I never want to eat this food again: NEGATIVE
Sentence ------- Prediction
[0.9851163  0.01246835 0.0024154 ]
This outfit is very ugly: NEGATIVE
Sentence ------- Prediction
[0.98575586 0.01215103 0.00209305]
When I bought the level it worked really poorly for me: NEGATIVE
Sentence ------- Prediction
[0.75181055 0.24224073 0.0059487 ]
Apple does not care about their customers anymore and the new product is proof of that: NEGATIVE
Sentence ------- Prediction
[0.00110285 0.00706861 0.9918285 ]
Amazon works really hard and makes getting shipments much easier: POSITIVE
Sentence ------- Prediction
[0.982128   0.01621334 0.00165862]
The foo