In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the SST-2 training file: tab-separated, no header row, with the two
# columns named 'sentence' and 'label' here.
sentences_df = pd.read_csv(
    './data/SST2/train.tsv',
    sep='\t',
    header=None,
    names=['sentence', 'label'],
)

In [None]:
sentences_df.head()

In [None]:
# Work on a random 2000-row subset of the full frame. The fixed random_state
# makes the draw identical on every run, so every number computed from this
# subset further down is reproducible.
new_df = sentences_df.sample(n=2000, random_state=42)

In [None]:
new_df.head()

In [None]:
new_df['label'].value_counts()

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
pretrained_weights = 'distilbert-base-uncased'

# DistilBERT is the default architecture.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer

## To use BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer and the model from the selected checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Encode every sentence into a list of token ids, asking the tokenizer to add
# its special tokens. Extra keyword arguments to Series.apply are forwarded to
# the applied function.
tokenized = new_df['sentence'].apply(tokenizer.encode, add_special_tokens=True)

In [None]:
tokenized.head()

In [None]:
# Number of tokens in the longest encoded sentence (0 when there are none).
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

In [None]:
# Right-pad each token-id list with zeros up to max_len so that all rows have
# the same length and stack into one rectangular array.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])

In [None]:
# Show the padded array's shape together with its first row. A notebook cell
# only renders its final expression, so the two are combined into one tuple;
# as separate statements the shape was computed and silently discarded.
padded.shape, padded[0]

In [None]:
# 1 wherever padded holds a non-zero token id, 0 at the zero-filled positions.
attention_mask = (padded != 0).astype(int)

In [None]:
attention_mask.shape

In [None]:
attention_mask[0]

In [None]:
# Convert the padded ids and their mask from NumPy arrays to tensors.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Forward pass only: gradient tracking is switched off because the pretrained
# model is used here purely to produce outputs, not to be trained.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [None]:
last_hidden_states[0].size()

In [None]:
# From the model's first output keep, for every sentence, the vector at
# sequence position 0 and hand it over as a NumPy array.
# NOTE(review): position 0 is presumably the [CLS] token added by the
# tokenizer's special tokens; confirm for the chosen tokenizer.
features = last_hidden_states[0][:, 0].numpy()

In [None]:
# Target column of the sampled frame; row order matches the rows that were
# tokenized from new_df['sentence'].
labels = new_df['label']

In [None]:
# Split features and labels into train and test parts (scikit-learn's default
# proportions). The fixed random_state keeps the split, and therefore the
# score reported below, the same on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

In [None]:
len(train_features)

In [None]:
# Fit a logistic-regression classifier, with scikit-learn's default settings,
# on the training part of the split. The fit call is the cell's last
# expression, so the notebook also echoes the fitted estimator.
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
lr_clf.score(test_features, test_labels)

In [None]:
# Let's feed it some of our own data (tricky!)
# Each entry is [sentence, expected label].
# NOTE(review): 1 looks like "positive" and 0 like "negative" judging by the
# last example; confirm against the dataset's label convention.
s = [['I hate people who hate this movie',1],
     ['I hate people who hate this movie, because I love it',1],
     ['I love people who do not love this movie',0],
     ['This movie is great',1]]

In [None]:
# No column names are given, so the columns get the integer labels 0 (the
# sentence) and 1 (the expected label); later cells index them as df2[0] and
# df2[1].
df2 = pd.DataFrame(data=s)

In [None]:
df2

In [None]:
# Encode the hand-written sentences the same way as the sampled ones: token
# ids with the tokenizer's special tokens added. Extra keyword arguments to
# Series.apply are forwarded to the applied function.
tokens2 = df2[0].apply(tokenizer.encode, add_special_tokens=True)

In [None]:
tokens2

In [None]:
# Right-pad the new sentences with zeros to a common length.
# The target length is at least max_len (so the result is unchanged whenever
# all new sentences are short enough) but grows if a new sentence is longer.
# Padding to max_len alone breaks in that case: the pad count goes negative,
# [0] * negative is an empty list, and the rows come out with unequal lengths.
pad_to = max([max_len] + [len(token_ids) for token_ids in tokens2.values])
padded2 = np.array([token_ids + [0] * (pad_to - len(token_ids)) for token_ids in tokens2.values])

In [None]:
padded2

In [None]:
# Mask is 1 at non-zero token ids and 0 at the zero padding; both the mask and
# the padded ids are converted to tensors for the model.
attention_mask2 = torch.tensor(np.where(padded2 != 0, 1, 0))
input_ids2 = torch.tensor(padded2)

# Forward pass without gradient tracking, as for the sampled sentences.
with torch.no_grad():
    last_hidden_states2 = model(input_ids=input_ids2, attention_mask=attention_mask2)

In [None]:
# For each new sentence keep the vector at sequence position 0 of the model's
# first output, as a NumPy array (same extraction as for `features`).
features2 = last_hidden_states2[0][:, 0].numpy()

In [None]:
# Column 1 of df2 holds the expected label of each hand-written sentence.
labels2 = df2[1]

In [None]:
labels2

In [None]:
# Predicted sentiment label for each of the hand-written reviews.
lr_clf.predict(features2)
# Compare the output with labels2. The original author reported 2 of 4 (50%)
# correct on their run.
# NOTE(review): that figure is not computed by this cell; re-check it against
# the output after re-running.

In [None]:
features2.shape

### Let's create a neural net that gets better results!

In [None]:
# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Put our input data onto device.
# The tensor is rebuilt from `features`, the NumPy array holding the
# position-0 vectors, instead of re-indexing last_hidden_states. The values are
# the same, but the cell is now safe to re-run: indexing the name it has just
# overwritten with [0][:,0,:] would fail the second time.
last_hidden_states = torch.from_numpy(features).to(device)

In [None]:
last_hidden_states.size()

In [None]:
# Define neural network class to be trained
# Structure:
# input -> fc1 -> sigmoid -> out -> log_softmax
import torch.nn as nn
import torch.nn.functional as F
class Shallow_Network(nn.Module):
    """One-hidden-layer classifier that outputs per-class log-probabilities.

    Data flow: input -> fc1 -> sigmoid -> out -> log_softmax.

    Args:
        in_features: Width of each input vector (default 768).
        hidden_features: Width of the hidden layer (default 1000).
        num_classes: Number of output classes (default 2).
    """

    def __init__(self, in_features=768, hidden_features=1000, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.out = nn.Linear(hidden_features, num_classes)

    def forward(self, input):
        """Map a (batch, in_features) tensor to (batch, num_classes) log-probabilities."""
        # torch.sigmoid is the supported replacement for the deprecated
        # F.sigmoid; the values it produces are the same.
        hidden = torch.sigmoid(self.fc1(input))
        # Normalise over dimension 1 (the class dimension of a 2-D batch) and
        # return logs of the class probabilities.
        return F.log_softmax(self.out(hidden), dim=1)

In [None]:
# Build the network and move its parameters to the selected device
# (Module.to returns the module itself, so the two steps chain).
net = Shallow_Network().to(device)

In [None]:
# Turn the label column into a tensor via its underlying NumPy array.
labels_tensor = torch.tensor(labels.to_numpy())

In [None]:
# Put on device
# The labels must live on the same device as the network's outputs for the
# loss computation in the training loop.
labels_tensor = labels_tensor.to(device)

In [None]:
labels_tensor

In [None]:
import torch.optim as optim

# Loss: negative log-likelihood, applied to the network's log_softmax output.
loss_func = nn.NLLLoss()
# Optimizer: Adam over all of the network's parameters, learning rate 0.001.
adam = optim.Adam(params=net.parameters(), lr=0.001)
# Kept as the cell's last expression, so the notebook also echoes the loss
# module after moving it to the device.
loss_func.to(device)

In [None]:
# Train network
# Full-batch training: each of the 100 epochs performs a single optimizer step
# on the whole last_hidden_states tensor against labels_tensor. There are no
# mini-batches and no held-out data in this loop.
cnt = 0
average_losses = []
average_val_losses = []
acc = []
cur_loss = []  # one loss value per epoch, in order
for epoch in range(100):
    net.train()
    # Clear the gradients left over from the previous step
    adam.zero_grad()
    # Forward pass: the network returns log-probabilities
    probs = net(last_hidden_states)
    # Compute the loss against the true labels
    loss = loss_func(probs,labels_tensor)
    # Backpropagate and let the optimizer update the parameters
    loss.backward()
    adam.step()
    # Record this epoch's loss
    cur_loss.append(loss.detach().cpu().numpy())
    print("epoch ",epoch)
    # Report the loss of *this* epoch. Printing the mean of cur_loss showed the
    # average over every epoch so far, which lags behind the real training
    # loss because the list is never reset.
    print("training loss: ", cur_loss[-1])

In [None]:
# Get class probabilities for every row of last_hidden_states.
# The network returns log-probabilities (log_softmax), and a softmax over them
# recovers the probabilities. dim=1 names the class dimension explicitly;
# calling F.softmax without dim relies on a deprecated implicit choice.
# NOTE(review): this is the same tensor the training loop used, so the
# accuracy derived from it is training-set accuracy, not held-out accuracy.
with torch.no_grad():
    probs = net(last_hidden_states)
    softprobs = F.softmax(probs, dim=1)

In [None]:
softprobs

In [None]:
# Per row: the highest class probability and the index of the class holding it.
values, indices = softprobs.max(dim=1)

In [None]:
# Predicted labels
indices

In [None]:
# Take original labels
labels_tensor

In [None]:
# Calculate number of sample points where prediction failed.
# Counting label != prediction is a true mismatch count for any number of
# classes; summing abs(label - prediction) only equals the count when labels
# are 0/1 (a miss between classes 0 and 2 would be counted twice).
nums = torch.sum(labels_tensor != indices).detach().cpu().numpy()

In [None]:
# Number of correct predictions: total samples minus the failures. The total
# is read from labels_tensor rather than hard-coded as 2000, so it stays right
# if the sample size changes.
numcorrect = len(labels_tensor) - nums

In [None]:
# Accuracy of prediction: fraction of samples predicted correctly. The
# denominator is read from labels_tensor rather than hard-coded as 2000.
accuracy = numcorrect / len(labels_tensor)

In [None]:
accuracy

In [None]:
# NumPy copies (on the CPU) of the predicted and the true labels. .numpy()
# already returns an ndarray, so no further conversion is needed.
idx = indices.detach().cpu().numpy()
lbls = labels_tensor.detach().cpu().numpy()

In [None]:
# Rows of the sampled frame whose predicted label differs from the true label,
# selected by their positions.
new_df.iloc[np.where(idx != lbls)[0]]