<a href="https://colab.research.google.com/github/hamzajb95/CE888/blob/main/Assignment/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Hugging Face Transformers: imported below as `ppb`, it supplies the BERT model and tokenizer.
!pip install transformers

# Additional packages installed as BERT requirements:
!pip install tqdm boto3 requests regex sentencepiece sacremoses

# Disabled: nothing in this notebook imports PyTorch Lightning.
# NOTE(review): the PyPI package is spelled "pytorch-lightning"; fix the name before re-enabling.
#!pip install pytorch-lightening==1.1.0

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 10.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 50.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 42.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=5ad0

In [4]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb

from pathlib import Path
import urllib
import re
import string
import math
from nltk.corpus import stopwords as stopwords
from nltk.stem.porter import *
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# NLTK resources fetched at runtime:
#  - 'stopwords': the English stop-word list read by preProcess via stopwords.words('english')
#  - 'punkt': tokenizer models; presumably what nltk.word_tokenize in preProcess relies on
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
#You pass the url to the function and you are returned a list of its lines
def getText(url):
    """Download a UTF-8 text resource and return it as a list of lines.

    Parameters
    ----------
    url : str
        Address of the resource; it is opened with ``urllib.request.urlopen``.

    Returns
    -------
    list of str
        The decoded content split on newline characters. Empty lines inside
        the file are preserved so that parallel files (e.g. texts and labels)
        stay aligned row by row. Only the single empty string produced by a
        trailing newline is dropped, so the last real line is kept even when
        the file does not end with a newline.

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be opened.
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        text = response.read().decode("utf-8")

    lines = text.split('\n')
    # Drop only the artefact of a trailing newline, never a real final line.
    if lines and lines[-1] == '':
        lines.pop()
    return lines

# Show the full text of every tweet instead of truncating wide DataFrame columns.
pd.set_option('display.max_colwidth', None)

# TweetEval "emotion" dataset files, in the order preProcess expects them:
# label mapping, training tweets, training labels, test tweets, test labels.
emotion_list = [
    'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/mapping.txt',
    'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/train_text.txt',
    'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/train_labels.txt',
    'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/test_text.txt',
    'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/test_labels.txt',
]

# Individual names for each URL, kept for cells that refer to them directly.
(url_mapping,
 url_trainText,
 url_trainLabel,
 url_testText,
 url_testLabel) = emotion_list

def _cleanTweets(frame, stop_words, min_word_len=5):
  """Clean the 'tweet' column of `frame` and add a 'tokenized_tweets' column.

  Steps, in order: remove '@user' tags, lower-case, replace every
  non-letter character with a space, drop stop words and words shorter than
  `min_word_len`, then tokenize the cleaned text with nltk.word_tokenize.

  Parameters
  ----------
  frame : pandas.DataFrame
      Must contain a string column named 'tweet'. It is modified and returned.
  stop_words : collection of str
      Words to remove; a set gives constant-time membership tests.
  min_word_len : int
      Minimum number of characters a word needs in order to be kept.

  Returns
  -------
  pandas.DataFrame
      `frame`, with 'tweet' cleaned and 'tokenized_tweets' added.
  """
  tweets = frame['tweet'].str.replace('@user', '', regex=False)
  tweets = tweets.str.lower()
  # regex=True must be explicit: from pandas 2.0 the default is a literal
  # match, which would silently leave digits and punctuation in place.
  tweets = tweets.str.replace("[^a-zA-Z]", " ", regex=True)
  # One pass removes both stop words and short words.
  tweets = tweets.apply(
      lambda text: ' '.join(
          word for word in text.split()
          if word not in stop_words and len(word) >= min_word_len))

  frame['tweet'] = tweets
  frame['tokenized_tweets'] = tweets.apply(nltk.word_tokenize)
  return frame


def preProcess(urlList):
  """Download a TweetEval-style dataset and return cleaned train/test frames.

  Parameters
  ----------
  urlList : sequence of str
      Five URLs in this order: mapping, train text, train labels, test text,
      test labels. The mapping URL (index 0) is accepted for compatibility
      but is not downloaded, because nothing in this function uses it.

  Returns
  -------
  tuple of (pandas.DataFrame, pandas.DataFrame)
      `(df_train, df_test)`, each with columns 'tweet' (cleaned text),
      'label' (the label strings exactly as read from the file) and
      'tokenized_tweets' (list of tokens per tweet).

  Notes
  -----
  Only words of five or more characters are kept (the original filter was
  ``len(w) > 4``), so four-letter words are removed as well.
  """
  train_text = getText(urlList[1])
  train_labels = getText(urlList[2])
  test_text = getText(urlList[3])
  test_labels = getText(urlList[4])

  # Build the stop-word set once; it is shared by both splits.
  stop_words = frozenset(stopwords.words('english'))

  # The same cleaning pipeline is applied to the train and the test split.
  df = _cleanTweets(
      pd.DataFrame({"tweet": train_text, "label": train_labels}), stop_words)
  df_test = _cleanTweets(
      pd.DataFrame({"tweet": test_text, "label": test_labels}), stop_words)

  return df, df_test

In [10]:
# Download and clean the emotion dataset, then split the result into its
# training and test DataFrames.
preProcessed = preProcess(emotion_list)
emot_train, emot_test = preProcessed[0], preProcessed[1]

# Inspect the cleaned training data.
print(emot_train)

                                                                         tweet  ...                                                                    tokenized_tweets
0          worry payment problem never joyce meyer motivation leadership worry  ...       [worry, payment, problem, never, joyce, meyer, motivation, leadership, worry]
1                          roommate spell autocorrect terrible firstworldprobs  ...                           [roommate, spell, autocorrect, terrible, firstworldprobs]
2                                                probably photos cherry helped  ...                                                  [probably, photos, cherry, helped]
3     rooneys fucking untouchable fucking dreadful depay looked decent tonight  ...  [rooneys, fucking, untouchable, fucking, dreadful, depay, looked, decent, tonight]
4                                      pretty depressing favourite highlighter  ...                                        [pretty, depressing, favourite, highl

In [5]:
# Select the BERT encoder class, its matching tokenizer class, and the name of
# the pretrained checkpoint that both are loaded from.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'

In [15]:
# Load the tokenizer that belongs to the chosen pretrained checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)


def _encodeTweet(text):
  """Return the token ids for one tweet, special tokens included."""
  return tokenizer.encode(text, add_special_tokens=True)


# One list of token ids per cleaned training tweet.
tokenized = emot_train['tweet'].apply(_encodeTweet)
tokenized

0                                       [101, 4737, 7909, 3291, 2196, 11830, 11527, 14354, 4105, 4737, 102]
1                          [101, 18328, 6297, 8285, 27108, 2890, 6593, 6659, 2034, 11108, 21572, 5910, 102]
2                                                                        [101, 2763, 7760, 9115, 3271, 102]
3       [101, 24246, 2015, 8239, 19662, 10875, 3085, 8239, 21794, 2139, 4502, 2100, 2246, 11519, 3892, 102]
4                                                          [101, 3492, 2139, 24128, 8837, 12944, 2121, 102]
                                                       ...                                                 
3252                  [101, 22585, 8239, 2086, 3967, 5190, 1056, 28394, 3215, 2052, 1056, 28394, 2102, 102]
3253                                                             [101, 18974, 9936, 3842, 13267, 4064, 102]
3254                                                              [101, 3507, 13277, 8467, 2791, 9918, 102]
3255                        

In [20]:
### Every list in `tokenized` must have the same length before the lists can
### be stacked into one 2-D array.
### First find the length of the longest encoded tweet,
### then right-pad every shorter list with 0 up to that length.
maxLen = max((len(ids) for ids in tokenized.values), default=0)

paded = np.array([ids + [0]*(maxLen-len(ids)) for ids in tokenized.values])
print(paded.shape) # (number of tweets, maxLen); in the recorded run maxLen was 35.

# Attention mask with the same shape as `paded`:
# 1 marks a real token, 0 marks a padding position to be ignored.
attention_mask = np.where(paded != 0,1,0)
print(attention_mask.shape)


(3257, 35)
(3257, 35)


In [None]:
## Run the BERT model on the padded token ids and their attention mask.
# The model has to be instantiated before it can be called; load the
# pretrained weights selected in `pretrained_weights`.
model = model_class.from_pretrained(pretrained_weights)

# The padded id array is named `paded` in the padding cell.
input_ids = torch.tensor(paded)
attention_mask = torch.tensor(attention_mask)

# torch.no_grad() is a context manager that disables gradient calculation;
# gradients are not needed because the model is only used for inference here.
# Calling model(...) runs all sentences through BERT.
with torch.no_grad():
  last_states = model(input_ids, attention_mask = attention_mask)

Let's slice out only the part of the output that we need: the output corresponding to the first token of each sentence. For sentence classification, BERT adds a special token called `[CLS]` (for classification) at the beginning of every sentence. The output corresponding to that token can be thought of as an embedding for the entire sentence.

<img src="https://jalammar.github.io/images/distilBERT/bert-output-tensor-selection.png" />

We'll save those in the `features` variable, as they'll serve as the features for our logistic regression model.

In [None]:
#Function to create Word Cloud
# def createWC(df):
#     wordbank = [' '.join(map(str,x)) for x in df['tokenized_tweets']]
#     wordbank = "".join(wordbank)
#     wordbank.strip()
    
#     wordCloud = WordCloud(width=800,height=500, random_state=1, background_color='salmon').generate(wordbank)
#     plt.figure(figsize=(10, 7))
#     plt.imshow(wordCloud)
#     plt.axis('off')
#     plt.show()