In [1]:
from loader import getLoader, path_to_train, path_to_test,loader_batch_size_train,loader_batch_size_test, path_to_slack,loader_batch_size_slack

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import spacy
import re


device = torch.device('cpu')

In [2]:
skip_training = True

## The model

In [3]:
class Classifier(nn.Module):
    def __init__(self, dictionary_size, hidden_size, output_size=2):
        super(Classifier, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(dictionary_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.linear = nn.Linear(hidden_size,output_size)

    def forward(self, pad_seqs, seq_lengths, hidden):
        
        batch_size = pad_seqs.shape[1]
        embedded = self.embedding(pad_seqs).view(pad_seqs.shape[0], pad_seqs.shape[1], -1)
        packed = pack_padded_sequence(embedded, seq_lengths, batch_first = False)
        self.lstm.flatten_parameters()
        _,hidden = self.lstm(packed)
        fc = self.linear(hidden[0])

        return fc

    def init_hidden(self, batch_size=1, device=device):
        return torch.zeros(1, batch_size, self.hidden_size, device=device)

## Train the model

In [4]:
hidden_size = 128
dictionary_size = 20000
classifier = Classifier(dictionary_size, hidden_size).to(device)

In [5]:
trainloader = getLoader(train=True,mini=False)

In [6]:
n_epochs = 10

classifier_optimizer = optim.Adam(classifier.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [7]:
for epoch in range(n_epochs):
    running_loss = 0.0
    print_every = 200 
    print("Epoch", epoch+1)
    
    for i, batch in enumerate(trainloader):
        classifier_optimizer.zero_grad()
        pad_input_seqs, input_seq_lengths, target_seqs = batch
        batch_size = pad_input_seqs.size(1)
        pad_input_seqs, target_seqs = pad_input_seqs.to(device), target_seqs.to(device)
        classifier_hidden = classifier.init_hidden(batch_size, device)
        classifier_hidden = classifier(pad_input_seqs, input_seq_lengths, classifier_hidden)
        loss = criterion(classifier_hidden.view(batch_size,2), target_seqs)
        loss.backward()
        
        classifier_optimizer.step()
        running_loss += loss.item()
        if (i % print_every) == (print_every-1) or i == (len(trainloader) // trainloader.batch_size):
            print('[%d, %5d] loss: %.4f' % (epoch+1, i+1, running_loss/print_every))
            running_loss = 0.0

        if skip_training:
            break
    if skip_training:
        break

print('Finished Training')

Epoch 1
Finished Training


## Save/load the model

In [8]:
classifier_filename = 'classifier_model.pth'
if not skip_training:
    try:
        torch.save(classifier.state_dict(), classifier_filename)
        print('Model saved to %s' % (classifier_filename))

    except:
        pass
else:
    classifier = Classifier(dictionary_size, hidden_size)
    classifier.load_state_dict(torch.load(classifier_filename, map_location=lambda storage, loc: storage))
    print('Classifier loaded from %s.' % classifier_filename)
    classifier = classifier.to(device)
    classifier.eval()

Classifier loaded from classifier_model.pth.


## Calculate the accuracy of the model

In [9]:
testloader = getLoader(train=False,mini=False)


In [10]:
def compute_accuracy(classifier, testloader, print_every):
    classifier.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for i,(pad_input_seqs, input_seq_lengths, targets, _) in enumerate(testloader):
            batch_size = pad_input_seqs.size(1)
            
            pad_input_seqs, targets = pad_input_seqs.to(device), targets.to(device)
            
            init_hidden = classifier.init_hidden(batch_size, device)
            output = classifier(pad_input_seqs, input_seq_lengths, init_hidden)
            
            out_flat = output.detach().numpy().argmax(axis=2)
            predicted = torch.tensor(out_flat)
            
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

            if (i % print_every == 0):
                print("Counted:",total,"accuracy",correct / total)
    return correct / total

In [15]:
print("Final train accuracy:",compute_accuracy(classifier,trainloader,500),"\n")
print("Final test accuracy:",compute_accuracy(classifier,testloader,10000))

Train:Counted: 512 accuracy 0.908203125
Counted: 512512 accuracy 0.9233754526723277
Counted: 1024512 accuracy 0.9225338502623688
Final train accuracy: 0.92265078125

Test:Counted: 100005 accuracy 0.8283485825708714
Counted: 200005 accuracy 0.8281042973925652
Counted: 300005 accuracy 0.827819536341061
Final test accuracy: 0.827578125


## Get the distribution of positive and negative tweets

In [9]:
slackloader = getLoader(train=False,mini=False, slack=True)

In [10]:
word_df = pd.read_csv("data/words.csv",squeeze=True)
index_word = {x:y for x,y in enumerate(word_df["0"])}
word_index = {y:x for x,y in enumerate(word_df["0"])}

In [17]:
def evaluate_slack(classifier, testloader, print_every):
    classifier.eval()
    negative = 0
    positive = 0
    total = 0
    pos_scores = {}
    neg_scores = {}
    
    with torch.no_grad():
        for i, (pad_input_seqs, input_seq_lengths, targets) in enumerate(testloader):
            batch_size = pad_input_seqs.size(1)
            
            pad_input_seqs = pad_input_seqs.to(device)
            
            init_hidden = classifier.init_hidden(batch_size, device)
            output = classifier(pad_input_seqs, input_seq_lengths, init_hidden)
            
            out_flat = output.detach().numpy().argmax(axis=2)
            predicted = torch.tensor(out_flat)
            
            
            total += targets.size(0)
            positive += (predicted == 1).sum().item()
            negative += (predicted == 0).sum().item()

            pos_scores.update({i:output.numpy().flatten()[1]})
            neg_scores.update({i:output.numpy().flatten()[0]})
            
            if (total % print_every == 0):
                print("Counted:",total,"positive",positive / total,"negative",negative / total)
    return positive, negative, total, pos_scores, neg_scores

In [18]:
pos_count,neg_count,total, pos_scores, neg_scores = evaluate_slack(classifier,slackloader,200)

Counted: 200 positive 0.545 negative 0.455
Counted: 400 positive 0.5275 negative 0.4725
Counted: 600 positive 0.505 negative 0.495
Counted: 800 positive 0.50125 negative 0.49875
Counted: 1000 positive 0.504 negative 0.496


In [19]:
pos_scores = sorted(pos_scores.items(), key=lambda s: -s[1])
neg_scores = sorted(neg_scores.items(), key=lambda s: -s[1])
listed = list(slackloader)

## List the top positive tweets

In [20]:
for i,top in enumerate(pos_scores[:5]):
    print("Top Positive sentence",i+1)
    sentence = listed[top[0]]
    for word in sentence[0].numpy().flatten():
        if (word != 2 and word != 3 and word != 0):
            print(index_word[word],end=" ")
    print("\n")       

Top Positive sentence 1
set the channel purpose for finding project partners 

Top Positive sentence 2
yes the exercise sessions . thanks ! 

Top Positive sentence 3
in CTX the type torch CTX is actually float doing your CTX CTX will convert torch CTX to torch CTX 

Top Positive sentence 4
and you can avoid having to CTX the CTX every time you pass a data sample by CTX the data into CTX and avoid extra CTX this way . that s why i said that a simple matrix CTX will do this trick for you if you think how matrix CTX works you can see that it exactly does this combination by adding the individual CTX for every data example . 

Top Positive sentence 5
in . CTX in the return value description is returns CTX CTX CTX of the CTX shape CTX length hidden size . hidden CTX new state of the CTX shape batch size hidden size with batch size . 



## List the top negative tweets

In [21]:
for i,top in enumerate(neg_scores[:5]):
    print("Top Negative sentence",i+1)
    sentence = listed[top[0]]
    for word in sentence[0].numpy().flatten():
        if (word != 2 and word != 3 and word != 0):
            print(index_word[word],end=" ")
    print("\n")        

Top Negative sentence 1
are there any tutorial sessions this week ? the booking option is not available on CTX edited 

Top Negative sentence 2
when i fetch data and click on it it says empty . i can t seem to download the data . 

Top Negative sentence 3
class CTX nn CTX def init self dictionary size hidden size super CTX self . init self CTX size hidden size self CTX nn CTX dictionary size hidden size self CTX nn CTX hidden size hidden size def forward self pad CTX CTX CTX hidden CTX pad CTX CTX max CTX length batch size CTX CTX list of sequence CTX hidden CTX batch size hidden size returns CTX CTX max CTX length batch size hidden size hidden CTX batch size hidden size your code here pad CTX self CTX pad CTX packed sequence pack CTX sequence pad CTX CTX CTX batch first false input CTX packed sequence CTX in length input CTX CTX CTX collect CTX CTX at different processing steps in this list el for batch size in packed sequence CTX sizes for i in range batch size CTX input CTX el CTX C

## Test your own sentences

In [22]:
word_df = pd.read_csv("data/words.csv",squeeze=True)
index_word = {x:y for x,y in enumerate(word_df["0"])}
word_index = {y:x for x,y in enumerate(word_df["0"])}
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])

def indices_func(sentence):
    indices = [[[2]]]
    for word in nlp(cleanString(sentence)):
        try:
            indices.append([[word_index[word.text.lower()]]])
        except:
            indices.append([[1]])
            
    indices.append([[3]])
    return indices

def cleanString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def replace(x):
    if (x==4): 
        return 1 
    else: 
        return x

In [36]:
sentence = ""
while (sentence != "exit"):
    sentence = input('What is your sentence? (type "exit" to quit) \n').lower()
    if (sentence == "exit"):
        break
    indices = indices_func(cleanString(sentence))
    indices_tensor = torch.tensor(indices)
    lens = torch.tensor([len(indices)])
    init_hidden = classifier.init_hidden(1, device)
    output = classifier(indices_tensor, lens, init_hidden)
    final = output.detach().numpy().flatten()
    print("Sentiment:")
    if (final[0]>final[1]):
        print("Negative \n")
    else:
        print("Positive \n")

What is your sentence? (type "exit" to quit) 
it was a good day
Sentiment:
Positive 

What is your sentence? (type "exit" to quit) 
today could have been better
Sentiment:
Negative 

What is your sentence? (type "exit" to quit) 
today could not have been more perfect
Sentiment:
Negative 

What is your sentence? (type "exit" to quit) 
the above result is surprising but not in a good way
Sentiment:
Negative 

What is your sentence? (type "exit" to quit) 
i am fairly positive we can improve the accuracy
Sentiment:
Positive 

What is your sentence? (type "exit" to quit) 
perfect
Sentiment:
Positive 

What is your sentence? (type "exit" to quit) 
exit
