In [None]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import transformers as ppb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

In [None]:
# Load the tab-separated SST2 training file; it has no header row, so name the columns here
sentences_df = pd.read_csv(
    './data/SST2/train.tsv',
    sep='\t',
    header=None,
    names=['sentence', 'label'],
)

In [None]:
# Preview the first rows of the loaded data
sentences_df.head()

In [None]:
# Draw a 2000-row working sample; the fixed random_state makes the sample
# (and everything derived from it) reproducible across kernel restarts
new_df = sentences_df.sample(n=2000, random_state=42)

In [None]:
# Hold out every row that was not sampled into new_df.
# Dropping by index label is correct for any index, whereas positional .iloc
# with labels only works when the index is the default 0..n-1 range.
test_df = sentences_df.drop(new_df.index)

In [None]:
# Inspect the index of the held-out rows
test_df.index

In [None]:
# Preview the first rows of the 2000-row sample
new_df.head()

In [None]:
# Count how many sampled rows carry each label value
new_df['label'].value_counts()

In [None]:
# Select the pretrained architecture: DistilBERT by default
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer and the model from the named pretrained weights
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [None]:
# Encode every sentence into a list of token ids, with the tokenizer's special tokens included
tokenized = new_df['sentence'].apply(tokenizer.encode, add_special_tokens=True)
test_tokenized = test_df['sentence'].apply(tokenizer.encode, add_special_tokens=True)

In [None]:
# Preview the token-id lists of the first sampled sentences
tokenized.head()

In [None]:
# Length, in tokens, of the longest tokenized sentence in the sample (0 if there are none)
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

In [None]:
# Length, in tokens, of the longest tokenized held-out sentence (0 if there are none)
test_max_len = max((len(token_ids) for token_ids in test_tokenized.values), default=0)

In [None]:
# Right-pad each token-id list with zeros up to max_len so the rows stack into one rectangular array
padded = np.array([token_ids + [0] * (max_len - len(token_ids))
                   for token_ids in tokenized.values])

In [None]:
# Right-pad each held-out token-id list with zeros up to test_max_len so the rows stack into one rectangular array
test_padded = np.array([token_ids + [0] * (test_max_len - len(token_ids))
                        for token_ids in test_tokenized.values])

In [None]:
# A cell only displays its last expression, so print the shape explicitly
# and leave the first padded row as the displayed value
print(padded.shape)
padded[0]

In [None]:
# 1 where a real token id is present, 0 where the row was zero-padded
attention_mask = (padded != 0).astype(int)

In [None]:
# 1 where a real token id is present, 0 where the held-out row was zero-padded
test_attention_mask = (test_padded != 0).astype(int)

In [None]:
# The mask has the same shape as the padded id matrix it was derived from
attention_mask.shape

In [None]:
# Mask row for the first sentence
attention_mask[0]

In [None]:
# Convert the padded ids and the mask to tensors.
# NOTE: attention_mask is rebound from a NumPy array to a tensor here.
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

# Run the pretrained model on the whole sample in one batch; no gradients are
# tracked because the model is only used to produce features
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
test_input_ids = torch.tensor(test_padded)
test_attention_mask = torch.tensor(test_attention_mask)

# The held-out set is everything outside the 2000-row sample, so push it through
# the pretrained model in fixed-size chunks instead of one giant batch; this
# bounds the peak memory used by the model's intermediate activations.
test_batch_size = 256
test_hidden_chunks = []
with torch.no_grad():
    for start in range(0, len(test_input_ids), test_batch_size):
        stop = start + test_batch_size
        chunk_output = model(test_input_ids[start:stop],
                             attention_mask=test_attention_mask[start:stop])
        # Element 0 of the model output is the last hidden state
        test_hidden_chunks.append(chunk_output[0])

# Keep the "element 0 is the last hidden state" layout that later cells index into
test_last_hidden_states = (torch.cat(test_hidden_chunks, dim=0),)

In [None]:
# Size of the first element of the model output
last_hidden_states[0].size()

In [None]:
# Keep only the vector at sequence position 0 for every sentence and convert it to NumPy
# (presumably the embedding of the [CLS] token added by add_special_tokens=True — TODO confirm)
features = last_hidden_states[0][:,0,:].numpy()

In [None]:
# Same position-0 slice for the held-out sentences, converted to NumPy
test_features = test_last_hidden_states[0][:,0,:].numpy()

In [None]:
# Target labels for the 2000-row sample
labels = new_df['label']

In [None]:
# Target labels for the held-out rows
test_labels = test_df['label']

In [None]:
# Split the sampled features/labels into training and validation parts;
# the fixed random_state keeps the split identical between runs
train_features, val_features, train_labels, val_labels = train_test_split(
    features, labels, random_state=42)

In [None]:
# Number of rows in the training split
len(train_features)

In [None]:
# Baseline: logistic regression with default settings, fitted on the training split
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [None]:
# Score the baseline classifier on the validation split
lr_clf.score(val_features, val_labels)

In [None]:
# Let's feed it some of our own data (tricky!)
# Each entry is [sentence, hand-assigned label]
s = [['I hate people who hate this movie',1],
     ['I hate people who hate this movie, because I love it',1],
     ['I love people who do not love this movie',0],
     ['This movie is great',1]]

In [None]:
# No column names are given, so the sentence is column 0 and the label is column 1
df2 = pd.DataFrame(data=s)

In [None]:
# Show the hand-written examples
df2

In [None]:
# Encode the hand-written sentences (column 0) into token-id lists, special tokens included
tokens2 = df2[0].apply(tokenizer.encode, add_special_tokens=True)

In [None]:
# Show the token-id lists of the hand-written sentences
tokens2

In [None]:
# Pad to the longest sentence of THIS batch rather than the training max_len:
# a hand-written sentence longer than max_len would otherwise get no padding
# and leave the rows with unequal lengths.
max_len2 = max((len(token_ids) for token_ids in tokens2.values), default=0)
padded2 = np.array([token_ids + [0]*(max_len2-len(token_ids)) for token_ids in tokens2.values])

In [None]:
# Show the padded id matrix for the hand-written sentences
padded2

In [None]:
# Build the padding mask (1 = non-zero id, 0 = padding) and convert ids and mask to tensors.
# NOTE: attention_mask2 is rebound from a NumPy array to a tensor.
attention_mask2 = np.where(padded2 != 0, 1, 0)
input_ids2 = torch.tensor(padded2)
attention_mask2 = torch.tensor(attention_mask2)

# Run the pretrained model on the hand-written sentences without tracking gradients
with torch.no_grad():
    last_hidden_states2 = model(input_ids2, attention_mask=attention_mask2)

In [None]:
# Position-0 vector of each hand-written sentence, converted to NumPy
features2 = last_hidden_states2[0][:,0,:].numpy()

In [None]:
# Hand-assigned labels (column 1 of df2)
labels2 = df2[1]

In [None]:
# Show the hand-assigned labels
labels2

In [None]:
# This is our output sentiment for the new reviews
lr_clf.predict(features2)
# In the author's run the predictions matched 2 of the 4 hand-assigned labels (50%).
# NOTE: the exact result depends on which rows ended up in the training split.

In [None]:
# Shape of the feature matrix for the hand-written sentences
features2.shape

### Let's create a neural net that gets better results!

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

In [None]:
def _as_device_tensor(array_like):
    """Copy `array_like` into a new tensor and move it to `device`.

    A single torch.tensor(...) call is enough; wrapping the resulting tensor
    in torch.tensor(...) a second time only makes a redundant copy and
    triggers PyTorch's copy-construct warning.
    """
    return torch.tensor(np.asarray(array_like)).to(device)


# Features of the training / validation / held-out test splits
train_features_tensor = _as_device_tensor(train_features)
val_features_tensor = _as_device_tensor(val_features)
test_features_tensor = _as_device_tensor(test_features)

# Matching labels
train_labels_tensor = _as_device_tensor(train_labels)
val_labels_tensor = _as_device_tensor(val_labels)
test_labels_tensor = _as_device_tensor(test_labels)

In [None]:
# Shape of the training feature tensor
train_features_tensor.shape

In [None]:
# Put our input data onto device
# Guarded so the cell can be re-run safely: the position-0 slice is taken only
# while last_hidden_states is still the raw model output. Once it has been
# replaced by the sliced tensor, slicing it again would fail.
if not torch.is_tensor(last_hidden_states):
    last_hidden_states = last_hidden_states[0][:,0,:]
last_hidden_states = last_hidden_states.to(device)

In [None]:
# Size of the sliced tensor now stored in last_hidden_states
last_hidden_states.size()

In [None]:
# Define neural network class to be trained
# Structure:
# input -> fc1 -> sigmoid -> out -> log_softmax
import torch.nn as nn
import torch.nn.functional as F
class Shallow_Network(nn.Module):
    """One-hidden-layer classifier that returns log-probabilities.

    Args:
        in_features: size of each input vector (default 768).
        hidden_features: width of the hidden layer (default 1000).
        n_classes: number of output classes (default 2).

    The defaults reproduce the original fixed 768 -> 1000 -> 2 layout, so
    ``Shallow_Network()`` and checkpoints saved from it keep working.
    """
    def __init__(self, in_features=768, hidden_features=1000, n_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.out = nn.Linear(hidden_features, n_classes)
    def forward(self,input):
        """Return log-probabilities over the classes for `input`.

        `input` has `in_features` values along its last dimension; the
        log-softmax is taken over that last dimension, so both batched
        (N, in_features) and single (in_features,) inputs are accepted.
        """
        # Hidden layer followed by a sigmoid activation
        x = torch.sigmoid(self.fc1(input))
        # Output layer followed by log_softmax over the class dimension
        return F.log_softmax(self.out(x), dim=-1)

In [None]:
# Instantiate the classifier and move its parameters to the selected device
net = Shallow_Network().to(device)

In [None]:
# Export labels
# Tensor of the labels of the whole 2000-row sample (training and validation rows together)
labels_tensor = torch.tensor(labels.values)

In [None]:
# Put on device
# Move the label tensor to the same device as the network
labels_tensor = labels_tensor.to(device)

In [None]:
# Show the label tensor for the whole sample
labels_tensor

In [None]:
import torch.optim as optim
# Create an Adam optimizer over the network's parameters (learning rate 0.001)
adam = optim.Adam(net.parameters(), lr=0.001)
# Negative log-likelihood loss
loss_func = nn.NLLLoss()
loss_func.to(device)

In [None]:
# Train network
# Full-batch training: every epoch is one Adam step on the whole training split.
# Checkpoint selection uses the VALIDATION split, not the test split, so the
# test set stays untouched until the final evaluation.
n_epochs = 1000
cnt = 0
average_losses = []      # training loss of every epoch
average_val_losses = []  # validation loss of every epoch
acc = []
cur_loss = []
min_validation = 10000.0
best_epoch = None
for epoch in range(n_epochs):
    net.train()
    #zero the gradient
    adam.zero_grad()
    #Get output of network (log-probabilities)
    probs = net(train_features_tensor)
    #compute loss
    loss = loss_func(probs,train_labels_tensor)
    #compute the backward gradient and move network in that direction
    loss.backward()
    adam.step()
    #gather loss
    train_loss = loss.item()
    cur_loss.append(train_loss)
    average_losses.append(train_loss)
    print("epoch ",epoch)
    # Report this epoch's loss rather than the mean over all epochs so far
    print("training loss: ", train_loss)
    net.eval()
    # Evaluation needs no gradients; skipping them avoids building a graph every epoch
    with torch.no_grad():
        probs_val = net(val_features_tensor)
        loss_val = loss_func(probs_val,val_labels_tensor)
    val_loss = loss_val.item()
    average_val_losses.append(val_loss)
    print("validation loss: ", val_loss)
    #Save model if validation is min
    if val_loss < min_validation:
        min_validation = val_loss
        best_epoch = epoch
        torch.save(net.state_dict(), './net_parameters_%d.pth' % epoch)
print("best epoch: ", best_epoch)
    

In [None]:
# Restore the best checkpoint written by the training loop.
# The loop only writes './net_parameters_<epoch>.pth' when the validation loss
# improves, so the most recently written checkpoint is the best one. A
# hard-coded epoch number only exists for one particular run.
# NOTE: this rebinds `model`, which earlier held the pretrained transformer.
checkpoint_paths = [p for p in Path('.').glob('net_parameters_*.pth')
                    if p.stem.rsplit('_', 1)[-1].isdigit()]
if not checkpoint_paths:
    raise FileNotFoundError(
        "No './net_parameters_<epoch>.pth' checkpoint found - run the training cell first")
# Newest file wins; the epoch number breaks ties on coarse file timestamps
best_checkpoint = max(checkpoint_paths,
                      key=lambda p: (p.stat().st_mtime, int(p.stem.rsplit('_', 1)[-1])))
print("loading checkpoint: ", best_checkpoint)
model = Shallow_Network()
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine
checkpoint = torch.load(best_checkpoint, map_location=device)
model.load_state_dict(checkpoint)
model = model.to(device)
model.eval()

In [None]:
# Loss of the restored model on the validation split; evaluation needs no gradients
with torch.no_grad():
    probs_val = model(val_features_tensor)
    loss_val = loss_func(probs_val,val_labels_tensor)
print("validation loss: ", np.mean(loss_val.detach().cpu().numpy()))

In [None]:
# Get classification probabilities for the validation split.
# Softmax is applied over the class dimension (dim=1); the dimension is passed
# explicitly because calling softmax without it is deprecated.
with torch.no_grad():
    probs = model(val_features_tensor)
    softprobs = F.softmax(probs, dim=1)

In [None]:
# Number of rows for which probabilities were computed
len(softprobs)

In [None]:
# For every row, take the largest class probability and the index of that class
values, indices = softprobs.max(dim=1)

In [None]:
# Predicted labels (index of the most probable class for each row)
indices

In [None]:
# True labels of the validation split, i.e. of the same rows the predictions were made for
# (labels_tensor would show the whole 2000-row sample instead)
val_labels_tensor

In [None]:
# Calculate number of sample points where prediction failed
# (sum of |label - prediction|; this equals the error count assuming both are 0/1 — TODO confirm)
nums = torch.sum(torch.abs(val_labels_tensor-indices)).detach().cpu().numpy()

In [None]:
# Number of correct predictions
numcorrect = 2000-(nums+0)

In [None]:
# Accuracy of prediction
accuracy = numcorrect/2000

In [None]:
# Show the computed accuracy
accuracy

In [None]:
idx = np.asarray(indices.detach().cpu().numpy())
lbls = np.asarray(labels_tensor.detach().cpu().numpy())

In [None]:
new_df.iloc[np.where(idx-lbls)]

In [None]:
# Using test set
# Loss of the restored model on the held-out test split; evaluation needs no gradients
with torch.no_grad():
    probs_test = model(test_features_tensor)
    loss_test = loss_func(probs_test,test_labels_tensor)
print("test loss: ", np.mean(loss_test.detach().cpu().numpy()))

In [None]:
# Class probabilities for the test split; the softmax dimension is passed
# explicitly because calling softmax without it is deprecated
with torch.no_grad():
    probs_test = model(test_features_tensor)
    softprobs_test = F.softmax(probs_test, dim=1)
# Get most likely class and its index for each sample point
test_values, test_indices = torch.max(softprobs_test,1)
# Calculate number of sample points where prediction failed
test_nums = torch.sum(torch.abs(test_labels_tensor-test_indices)).detach().cpu().numpy()
# Number of correct predictions
numcorrect = test_df.shape[0]-(test_nums+0)
# Accuracy of prediction
accuracy = numcorrect/test_df.shape[0]
print(accuracy)

In [None]:
# Rows and columns of the held-out frame
test_df.shape