<a href="https://colab.research.google.com/github/hamzajb95/CE888/blob/main/Assignment/BERTLog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
 !pip install transformers

#BERT requirements:
!pip install tqdm boto3 requests regex sentencepiece sacremoses

#!pip install pytorch-lightening==1.1.0



In [6]:
import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.linear_model import LogisticRegression
from pathlib import Path
import urllib
import re
import string
import math
from nltk.corpus import stopwords as stopwords
from nltk.stem.porter import *
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt


nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
#You pass the url to the function and you are returned a list
def getText(url):
    response = urllib.request.urlopen(url)
    Text = response.read()
    Text=Text.decode("utf-8")
    lines = Text.split('\n')
    return lines[:-1]

pd.set_option('display.max_colwidth', None)

#Importing data for emotion dataset
url_mapping = 'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/mapping.txt'
url_trainText = 'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/train_text.txt'
url_trainLabel = 'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/train_labels.txt'
url_testText = 'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/test_text.txt'
url_testLabel = 'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/test_labels.txt'

emotion_list = [url_mapping,url_trainText,url_trainLabel,url_testText,url_testLabel]

def preProcess(urlList):
  mapping = getText(urlList[0])
  train_text = getText(urlList[1])
  train_labels = getText(urlList[2])
  test_text = getText(urlList[3])
  test_labels = getText(urlList[4])

  #Turn those train and test lists into Dataframes.
  df=pd.DataFrame({"tweet":train_text,"label":train_labels}) #change name to df_train
  df_test = pd.DataFrame({"tweet":test_text, "label":test_labels})
  
  frames = [df,df_test]
  new_df = pd.concat(frames)

  #We are now going to remove the @user tags from tweets for both train and test sets.
  new_df['tweet']=new_df['tweet'].apply(lambda x: re.sub('@user','', x))
  df_test['tweet']=df_test['tweet'].apply(lambda x: re.sub('@user','', x))

  #We are going to change all words to lower case
  new_df['tweet']=new_df['tweet'].apply(lambda x: x.lower())
  df_test['tweet']=df_test['tweet'].apply(lambda x: x.lower())
  
  #Remove special character from train and test set
  new_df['tweet'] = new_df['tweet'].str.replace("[^a-zA-Z]", " ")
  df_test['tweet'] = df_test['tweet'].str.replace("[^a-zA-Z]", " ")

  #Remove all stopwords from the Tweet columns of the datasets
  stop = stopwords.words('english')
  new_df['tweet'] = new_df['tweet'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))
  df_test['tweet'] = df_test['tweet'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))
  
  #Remove any words that have a length less than 4
  new_df['tweet'] = new_df['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>4]))
  df_test['tweet'] = df_test['tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>4]))
  
  #We now tokenize the dataframe in an added column
  new_df['tokenized_tweets'] = new_df.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)
  df_test['tokenized_tweets'] = df_test.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)

  return new_df, df_test

In [8]:
emot_data = preProcess(emotion_list)
emot_train = emot_data[0]
emot_test = emot_data[1]


In [9]:
#Calling the BERT model, the BERT Tokenizer and choosing the model with the hyperparameters.
model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

In [10]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

def BertTok(df):
  #tokenize
  tokenized = df['tweet'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

  ### We need each list in tokenized to have the same length
  ### We do this by first getting the tweet with the biggest length
  ### and Padding the rest as per required to get each sentence to maxLen

  maxLen = 0
  for i in tokenized.values:
    if len(i) > maxLen:
      maxLen = len(i)

  paded = np.array([i + [0]*(maxLen-len(i)) for i in tokenized.values])
  #print(paded.shape) #So just to be clear the maxLen was 35 as that is not the length of each numpy array.

  #We need to create another numpy array which gives 0 to ignore the mask and 1 to accept the Token
  attention_mask = np.where(paded != 0,1,0)
  #print(attention_mask.shape)

  ##Lets run the BERT model by providing the model with the padded and attention mask
  input_ids = torch.tensor(paded)
  attention_mask = torch.tensor(attention_mask)
  #Context Manager that disables gradient calculations (torch.no_grad())
  #model() function runs our sentences through BERT
  with torch.no_grad():
    last_states = model(input_ids, attention_mask = attention_mask)

  features = last_states[0][:,0,:].numpy()

  return features


emot_trainFeat = BertTok(emot_train) #Encoded features
emot_trainLab = emot_train['label']

# emot_tr_pad = emot_trainTok[0]
# emot_tr_mask = emot_trainTok[1]

emot_testFeat = BertTok(emot_test) #Encoded features
emot_testLab = emot_test['label']
# emot_ts_pad = emot_testTok[0]
# emot_ts_mask = emot_testTok[1]

print(emot_testFeat)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


[[-0.04321018  0.28974223 -0.2092067  ... -0.750844   -0.21634468
   0.58594793]
 [ 0.14174147  0.2754376  -0.05503751 ... -0.459177    0.21613123
   0.24762696]
 [-0.53892416 -0.06289501  0.08318505 ... -0.48963243  0.0078131
   0.21471159]
 ...
 [-0.02147441  0.3795352   0.10550389 ... -0.2581239   0.23106357
   0.25429776]
 [-0.24132793  0.32521775 -0.2545765  ... -0.18256706  0.121466
   0.18432385]
 [ 0.04160715  0.02231806  0.0599119  ... -0.14999266  0.05901751
   0.06774562]]


In [11]:
log_clf = LogisticRegression()
log_clf.fit(emot_trainFeat, emot_trainLab)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
log_clf.score(emot_testFeat,emot_testLab)

0.7241379310344828

Let's slice only the part of the output that we need. That is the output corresponding the first token of each sentence. The way BERT does sentence classification, is that it adds a token called `[CLS]` (for classification) at the beginning of every sentence. The output corresponding to that token can be thought of as an embedding for the entire sentence.

<img src="https://jalammar.github.io/images/distilBERT/bert-output-tensor-selection.png" />

We'll save those in the `features` variable, as they'll serve as the features to our logitics regression model.

In [12]:
#Function to create Word Cloud
# def createWC(df):
#     wordbank = [' '.join(map(str,x)) for x in df['tokenized_tweets']]
#     wordbank = "".join(wordbank)
#     wordbank.strip()
    
#     wordCloud = WordCloud(width=800,height=500, random_state=1, background_color='salmon').generate(wordbank)
#     plt.figure(figsize=(10, 7))
#     plt.imshow(wordCloud)
#     plt.axis('off')
#     plt.show()

In [13]:
emot_train['label']

0       2
1       0
2       1
3       0
4       3
       ..
1416    1
1417    3
1418    0
1419    0
1420    1
Name: label, Length: 4678, dtype: object