# Sentiment Analysis with BERT for Beginners

# The original code is from Jay Alammar 

https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb

### Internal: GPU selection (can be ignored)

In [1]:
# Best-effort: expose only the GPU with the most free memory to this process.
# This has to run before any library initialises CUDA, which is why it sits
# ahead of the cell that imports torch.
try:
    import os
    import subprocess

    import numpy as np

    # Argument list instead of a shell string: no shell is spawned, and a
    # missing or failing nvidia-smi raises a descriptive exception right here.
    smi_output = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.free", "--format=csv,nounits,noheader"]
    ).decode()
    # The output is parsed as one integer per line, one line per GPU.
    gpu_free_memories = [int(line) for line in smi_output.strip().splitlines()]
    most_free_gpu = str(np.argmax(gpu_free_memories))
    # nvidia-smi numbers GPUs in PCI bus order, whereas CUDA enumerates them
    # "fastest first" by default. Pin the ordering so the index chosen above
    # refers to the same physical device inside CUDA.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = most_free_gpu
    print("Using GPU #" + most_free_gpu)
except Exception as e:
    # Deliberately non-fatal: the notebook also runs without a GPU.
    print("Could not select GPU. Exception:", e)


Using GPU #0


In [2]:
#install transformers

!pip3 install transformers

Collecting transformers
  Using cached https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl
Collecting regex!=2019.12.17 (from transformers)
  Using cached https://files.pythonhosted.org/packages/63/ae/057469010c579f6f6f37fc9b63b2d70af4ec20f2a7e2f3a1c4f364b4dde0/regex-2020.1.8-cp36-cp36m-manylinux1_x86_64.whl
Collecting requests (from transformers)
  Using cached https://files.pythonhosted.org/packages/51/bd/23c926cd341ea6b7dd0b2a00aba99ae0f828be89d72b2190f27c11d4b7fb/requests-2.22.0-py2.py3-none-any.whl
Collecting sentencepiece (from transformers)
  Using cached https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl
Collecting tqdm (from transformers)
  Using cached https://files.pythonhosted.org/packages/72/c9/7fc20feac72e79032a7c8138fd0d395dc6d8812b5b9edf53c3afd0b31017/tqdm-4.41.1-py2.py3-n

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

I0115 09:51:55.027056 139667892651840 file_utils.py:35] PyTorch version 1.2.0 available.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Load data and inspect it (show how many examples are labeled "positive" (1) and how many are labeled "negative" (0))

In [3]:
# Read the tab-separated file. It has no header row, so the columns are
# labelled with the integers 0 and 1.
df = pd.read_csv('train.tsv.txt', sep='\t', header=None)

In [4]:
# `train_data` is a second name for the same DataFrame object as `df` (no copy is made).
train_data = df
# Bare last expression: the notebook renders the frame as the cell output.
train_data

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
6915,"painful , horrifying and oppressively tragic ,...",1
6916,take care is nicely performed by a quintet of ...,0
6917,"the script covers huge , heavy topics in a bla...",0
6918,a seriously bad film with seriously warped log...,0


In [5]:
# Column 1 holds the labels; count how many rows carry each label value.
train_data[1].value_counts()

1    3610
0    3310
Name: 1, dtype: int64

In [6]:
# DistilBERT is used here. For full BERT, switch these three values to
# ppb.BertModel, ppb.BertTokenizer and 'bert-base-uncased'.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

# Load the pretrained tokenizer and model for the chosen checkpoint name.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

I0115 09:51:56.273807 139667892651840 tokenization_utils.py:398] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/vogel/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0115 09:51:56.977018 139667892651840 configuration_utils.py:185] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at /home/vogel/.cache/torch/transformers/a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.1ccd1a11c9ff276830e114ea477ea2407100f4a3be7bdc45d37be9e37fa71c7e
I0115 09:51:56.979455 139667892651840 configuration_utils.py:199] Model config {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": null,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 

## Tokenize the text data

BERT breaks the input into words and subwords. Then special tokens for sentence classification are added: [CLS] at the beginning of the text and [SEP] at the end of the sentence. In the last step the tokenizer replaces each token with its id in the vocabulary (the index used to look up the token's embedding).


In [7]:
# Encode every sentence in column 0 into a list of token ids, special tokens included.
tokenized_text = train_data[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

## That's what the input text looks like after tokenization

In [8]:
# Display the Series of token-id lists (one list per sentence).
tokenized_text

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
6915    [101, 9145, 1010, 7570, 18752, 14116, 1998, 28...
6916    [101, 2202, 2729, 2003, 19957, 2864, 2011, 103...
6917    [101, 1996, 5896, 4472, 4121, 1010, 3082, 7832...
6918    [101, 1037, 5667, 2919, 2143, 2007, 5667, 2561...
6919    [101, 1037, 12090, 2135, 2512, 5054, 19570, 23...
Name: 0, Length: 6920, dtype: object

## Padding all sentences to the same length with the token id 0

In [9]:
# Length of the longest tokenized sentence (0 when there are no sentences).
max_len = max(map(len, tokenized_text.values), default=0)

# Right-pad every shorter sentence with zeros so all rows have max_len entries.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized_text.values
])
padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  1996,  5896, ...,     0,     0,     0],
       [  101,  1037,  5667, ...,     0,     0,     0],
       [  101,  1037, 12090, ...,     0,     0,     0]])

## The padded input is now a 2-d array with the following dimensions: number of input examples and number of tokens

In [10]:
# `padded` is already a NumPy array, so its shape can be read directly.
padded.shape

(6920, 67)

## `attention_mask`: It's a mask to be used if the input sequence length is smaller than the max input sequence length in the current batch. It's the mask that we typically use for attention when a batch has varying length sentences.
 

### The “Attention Mask” is simply an array of 1s (tokens) and 0s (padded zeros) indicating which tokens are padding and which aren’t 

[Attention Mask](https://mccormickml.com/2019/07/22/BERT-fine-tuning/#sentence-length--attention-mask)
<img src="masking.PNG" height=10 width=500>

In [11]:
# 1 where `padded` holds a real token id, 0 where it holds the padding value 0.
attention_mask = (padded != 0).astype(int)
print(attention_mask, attention_mask.shape, sep="\n")

[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]
(6920, 67)


## The model() function runs the sentences through DistilBERT

### last_hidden_states are the outputs of DistilBERT. 
The output of DistilBERT is a tuple whose first element is a 3-d tensor with the shape / dimensions --> number of examples (6920), max number of tokens in the sequence (67), number of hidden units in the DistilBERT model (768). 


In [12]:
# Convert the NumPy arrays into tensors for the model.
input_ids = torch.tensor(padded)  
# NOTE: this rebinds `attention_mask` from the NumPy array to a tensor.
attention_mask = torch.tensor(attention_mask)

# Inference only: no_grad() turns off gradient tracking for the forward pass.
with torch.no_grad():
    # All rows are passed to the model in one single call.
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

## The output is a vector for each input token. And each vector contains 768 numbers (hidden units in the DistilBERT model).

In [13]:
# Display the raw model output.
last_hidden_states

(tensor([[[-0.2159, -0.1403,  0.0083,  ..., -0.1369,  0.5867,  0.2011],
          [-0.2471,  0.2468,  0.1008,  ..., -0.1631,  0.9349, -0.0715],
          [ 0.0558,  0.3573,  0.4140,  ..., -0.2430,  0.1770, -0.5080],
          ...,
          [ 0.1864,  0.0193,  0.1864,  ..., -0.2175,  0.1604, -0.4050],
          [-0.1004,  0.0651,  0.1240,  ..., -0.1649,  0.3568,  0.1218],
          [-0.0114,  0.3297,  0.2317,  ..., -0.2362,  0.4217,  0.0895]],
 
         [[-0.1726, -0.1448,  0.0022,  ..., -0.1744,  0.2139,  0.3720],
          [ 0.0022,  0.1684,  0.1269,  ..., -0.1888, -0.0195, -0.0283],
          [ 0.0257, -0.2458,  0.0717,  ..., -0.4339,  0.1622,  0.0133],
          ...,
          [ 0.0466,  0.0850,  0.1801,  ..., -0.0279,  0.1878,  0.4022],
          [-0.2325,  0.0746,  0.1298,  ..., -0.1292,  0.0904,  0.3647],
          [-0.0655, -0.2214,  0.1827,  ..., -0.1624,  0.1421,  0.0963]],
 
         [[-0.0506,  0.0720, -0.0296,  ..., -0.0715,  0.7185,  0.2623],
          [ 0.0536,  0.3136,

## Only the first position ([CLS]) is needed for classification (the 768 float numbers from DistilBERT that correspond to each sentence in the dataset)

In [14]:
# [0]       -> first element of the model output
# [:, 0, :] -> all sentences, position 0 only (the [CLS] token), all hidden units
features = last_hidden_states[0][:,0,:].numpy()

In [15]:
# Display the extracted feature array (one row per sentence).
features

array([[-0.21593462, -0.14028919,  0.00831093, ..., -0.13694865,
         0.58670056,  0.20112726],
       [-0.17262723, -0.14476144,  0.00223403, ..., -0.17442508,
         0.21386462,  0.37197468],
       [-0.05063348,  0.07203963, -0.02959663, ..., -0.07148966,
         0.71852344,  0.26225492],
       ...,
       [-0.06550992, -0.05184716, -0.14094445, ..., -0.06450661,
         0.60223   ,  0.21347886],
       [-0.08523144, -0.04869819, -0.08137506, ..., -0.13589351,
         0.39505625,  0.22889729],
       [-0.29436848, -0.0923472 , -0.00831686, ..., -0.05159127,
         0.43497843,  0.28891596]], dtype=float32)

## Labels

In [16]:
# Column 1 of the data holds the 0/1 sentiment label of each sentence.
labels = train_data[1]

## Logistic Regression Model

## Train (75%) / Test Split (25% of data)

In [17]:
# Hold out 25% of the examples for evaluation. `random_state` fixes the shuffle,
# so the split (and any score computed from it) is the same on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.25, random_state=42
)

## Train Logistic Regression Model

In [18]:
# Logistic regression with default settings, fit on the training split.
lr_clf = LogisticRegression()
# `fit` is the last expression, so the fitted estimator is shown as the cell output.
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Evaluation of Model

In [19]:
# Score the fitted classifier on the held-out test features and labels.
lr_clf.score(test_features, test_labels)

0.8445086705202313

## Try out your own examples

In [21]:
# vectorize input

def vectorize_input(list_of_strings):
    """Turn raw sentences into [CLS] feature vectors for the classifier.

    Each sentence is tokenized with the notebook-level ``tokenizer``, the batch
    is right-padded with zeros to its own longest sentence, and the result is
    run through the notebook-level ``model`` without gradient tracking.

    Padding to the batch's longest sentence (rather than to the training-time
    ``max_len``) means sentences longer than any training sentence no longer
    produce ragged rows, and the function does not depend on the padding cell
    having been run. Padded positions are excluded through the attention mask.

    Parameters
    ----------
    list_of_strings : str or iterable of str
        One sentence, or several. A bare string is treated as a single
        sentence.

    Returns
    -------
    numpy.ndarray
        2-d array with one row per input sentence: position 0 of the first
        element of the model output.

    Raises
    ------
    ValueError
        If no sentence is given.
    """
    if isinstance(list_of_strings, str):
        list_of_strings = [list_of_strings]
    sentences = list(list_of_strings)
    if not sentences:
        raise ValueError("vectorize_input needs at least one sentence")

    token_ids = [tokenizer.encode(text, add_special_tokens=True) for text in sentences]
    pad_to = max(len(ids) for ids in token_ids)

    # Explicit int64 keeps the tensor dtype independent of the platform default.
    padded_ids = np.zeros((len(token_ids), pad_to), dtype=np.int64)
    attention = np.zeros_like(padded_ids)
    for row, ids in enumerate(token_ids):
        padded_ids[row, :len(ids)] = ids
        # Mask is built from the true length: 1 for real tokens, 0 for padding.
        attention[row, :len(ids)] = 1

    with torch.no_grad():
        last_hidden_state = model(
            torch.tensor(padded_ids), attention_mask=torch.tensor(attention)
        )

    # First output element, position 0 ([CLS]) of every sentence, all hidden units.
    return last_hidden_state[0][:, 0, :].numpy()

In [27]:
# Example review to classify. To try a negative review instead, use e.g.
# "Joker is the worse movie of our times so not worth watching This movie is so bad"
pos = ["Joker is the best movie of our times worth watching This movie earns the audience applause"]

vectorize_pos = vectorize_input(pos)
prediction = lr_clf.predict(vectorize_pos)

print("Pos prediction for the sentence {}:{}".format(pos, prediction))

Pos prediction for the sentence ['Joker is the best movie of our times worth watching This movie earns the audience applause']:[1]
