In [None]:
#==============================================================================
# CellStrat Hub Pack - Natural Language Processing
# Compatible tier : Free Tier or above 
# Kernel : conda_pytorch_latest_p36 
#==============================================================================

In [1]:

# -*- coding: utf-8 -*-

#==============================================================================================================
# Install Trasnformers
# Transformers  provides general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…)
# for Natural Language Understanding (NLU) and Natural Language Generation (NLG)
#with over 32+ pretrained models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.
#================================================================================================================
!pip install transformers



In [2]:
#==============================================================================================================
# Import the necessary libraries
# NOTE(review): GridSearchCV, cross_val_score and the `ppb` alias are not referenced in the
# cells visible in this notebook; confirm whether they are still needed.
#==============================================================================================================
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by the libraries above are hidden too.
warnings.filterwarnings('ignore')


In [3]:
#==============================================================================================================
# Load the training dataset
# Each row holds a sentence and its sentiment label (0 -> Negative, 1 -> Positive).
# The file is tab-separated and read without a header row, so the columns are named 0 and 1.
#==============================================================================================================
df = pd.read_csv(
    'https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv',
    sep='\t',
    header=None,
)

In [4]:
#==============================================================================================================
# Take the first 500 rows as the working batch and display them
#==============================================================================================================

batch = df.iloc[:500]
batch

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1
...,...,...
495,while the frequent allusions to gurus and dosh...,1
496,"the movie is hardly a masterpiece , but it doe...",1
497,unless you come in to the film with a skateboa...,0
498,an energetic and engaging film that never pret...,1


In [5]:
#==============================================================================================================
# DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base.
# It has 40% fewer parameters than bert-base-uncased and runs 60% faster while preserving over 95% of
# BERT's performance, as measured on the GLUE language understanding benchmark.
#==============================================================================================================
from transformers import DistilBertModel, DistilBertConfig,DistilBertTokenizer

In [6]:
#==============================================================================================================
# Pick the checkpoint name, and the tokenizer and model classes used to load it
#==============================================================================================================
pretrained_weights = 'distilbert-base-uncased'
tokenizer_class = DistilBertTokenizer
model_class = DistilBertModel

In [7]:
#==============================================================================================================
# Load the pretrained tokenizer for the chosen checkpoint
# (this cell loads the tokenizer only, not the model weights)
#==============================================================================================================
tokenizer=tokenizer_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Load the pretrained model for the chosen checkpoint
model=model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
#==============================================================================================================
# Turn every sentence (column 0) into a list of token ids.
# add_special_tokens=True is forwarded by apply() to tokenizer.encode for each sentence.
#==============================================================================================================
tokenized = batch[0].apply(tokenizer.encode, add_special_tokens=True)

In [10]:
#==============================================================================================================
# Pad every token-id list to a common length
#
# 1. Find the length of the longest tokenized sentence.
# 2. Right-pad each shorter list with zeros up to that length and stack the rows into one array.
#=============================================================================================================
max_len = max((len(ids) for ids in tokenized.values), default=0)

padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

In [11]:
# Display the shape of the padded array: (number of sentences, max_len)
padded.shape

(500, 54)

In [12]:
#==============================================================================================================
# Attention mask - a 0/1 array with the same shape as `padded`.
# 1 marks a real token and 0 marks a padding position, so that the model does not attend to the padding.
# Padding was written as 0, so every non-zero id is treated as a real token.
#=============================================================================================================
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(500, 54)

In [13]:
# Display the mask (1 where `padded` is non-zero, 0 elsewhere)
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [14]:
# Convert the padded token-id array into a torch tensor to pass to the model, then display it
input_ids=torch.tensor(padded)
input_ids

tensor([[  101,  1037, 18385,  ...,     0,     0,     0],
        [  101,  4593,  2128,  ...,     0,     0,     0],
        [  101,  2027,  3653,  ...,     0,     0,     0],
        ...,
        [  101,  4983,  2017,  ...,     0,     0,     0],
        [  101,  2019, 18114,  ...,     0,     0,     0],
        [  101,  1037, 17075,  ...,     0,     0,     0]])

In [15]:
# Convert the attention-mask array to a torch tensor.
# NOTE(review): this rebinds `attention_mask` from a NumPy array to a tensor, so the name
# refers to different types before and after this cell.
attention_mask=torch.tensor(attention_mask)

In [16]:
#==============================================================================================================
# Run the padded sentences through the model to obtain their embeddings; the attention mask
# marks the padding positions so that the model does not attend to them.
# Each token is encoded as a 768-dimensional vector.
# Element [0] of the output is a 3-D tensor laid out as
# (number of sentences, tokens per sentence, embedding dimension).
# For the (500, 54) input built in the cells above, that is (500, 54, 768).
# torch.no_grad() is used because the model is only run forward here; no gradients are needed.
#=============================================================================================================


with torch.no_grad():
    last_hidden_states=model(input_ids,attention_mask=attention_mask)
 

In [25]:
# Keep only the embedding at position 0 of every sentence and use it as that sentence's
# feature vector, converted to a NumPy array for scikit-learn
first_token_states = last_hidden_states[0][:, 0, :]
features = first_token_states.numpy()

In [26]:
# Sentiment labels for the batch: column 1 of the data (0 -> Negative, 1 -> Positive)
labels=batch[1]

In [27]:
# Split the features and labels into train and test sets.
# random_state pins the shuffle so that a fresh run reproduces the same split, and therefore
# the same accuracy and confusion matrix in the cells below. No test_size is given, so
# scikit-learn's default split proportion applies.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [28]:
#==============================================================================================================
# Train a classifier on the sentence embeddings
# A logistic regression model takes the embedding vectors as input features and learns to
# predict the sentiment label.
#=============================================================================================================
lr_clf = LogisticRegression(C=5.2).fit(train_features, train_labels)

# fit() returns the fitted estimator; end the cell with it so its repr is still displayed
lr_clf

LogisticRegression(C=5.2)

In [29]:
# Evaluate the classifier on the held-out test split.
# NOTE(review): the exact figure depends on which rows land in the test split; runs of this
# notebook have produced values between roughly 0.72 and 0.77.
lr_clf.score(test_features, test_labels)

0.72

In [30]:
#==============================================================================================================
# Predict the sentiment label (0 -> Negative, 1 -> Positive) of each example in the test split
#=============================================================================================================
y_pred = lr_clf.predict(test_features)

In [31]:
# Display the predicted labels for the test split
y_pred

array([1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0])

In [31]:
#==============================================================================================================
# Print the confusion matrix
# scikit-learn puts the true labels on the rows and the predicted labels on the columns, in the
# order given by `labels` (0 = Negative, 1 = Positive):
#
#     [[TN FP]
#      [FN TP]]
#
# For example, [[47 12], [16 50]] means 47 true negatives, 12 false positives,
# 16 false negatives and 50 true positives.
#=============================================================================================================
# Import under an alias so that the assignment below does not overwrite the imported function.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix

# labels=[0, 1] fixes the 2x2 layout even if one class is missing from the test split.
# The result keeps the name `confusion_matrix`, as before, so anything reading it is unaffected.
confusion_matrix = compute_confusion_matrix(test_labels, y_pred, labels=[0, 1])
print(confusion_matrix)

[[47 12]
 [16 50]]
