In [1]:
def load_data():
    reviewsFile = open('reviews.txt','r')
    reviews = list(map(lambda x:x[:-1],reviewsFile.readlines()))
    reviewsFile.close()

    labelsFile = open('labels.txt','r')
    labels = list(map(lambda x:x[:-1],labelsFile.readlines()))
    labelsFile.close()
    
    return reviews,labels

In [2]:
reviews,labels = load_data()

In [3]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("\w+\'?\w+|\w+")

In [4]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [5]:
from spacy.lang.en.stop_words import STOP_WORDS

In [6]:
exceptionStopWords = {
    'again',
    'against',
    'ain',
    'almost',
    'among',
    'amongst',
    'amount',
    'anyhow',
    'anyway',
    'aren',
    "aren't",
    'below',
    'bottom',
    'but',
    'cannot',
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'done',
    'down',
    'except',
    'few',
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'however',
    'isn',
    "isn't",
    'least',
    'mightn',
    "mightn't",
    'move',
    'much',
    'must',
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'neither',
    'never',
    'nevertheless',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'should',
    "should've",
    'shouldn',
    "shouldn't",
    'too',
    'top',
    'up',
    'wasn',
    "wasn't",
    'well',
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't",
}

In [7]:
stop_words = set(stop_words).union(STOP_WORDS)

In [8]:
final_stop_words = stop_words-exceptionStopWords

In [9]:
import spacy
nlp = spacy.load("en",disable=['parser', 'tagger', 'ner'])

In [10]:
def make_token(review):
    return tokenizer.tokenize(str(review))

In [11]:
def remove_stopwords(review):
    return [token for token in review if token not in final_stop_words]

In [12]:
def lemmatization(review):
    lemma_result = []
    
    for words in review:
        doc = nlp(words)
        for token in doc:
            lemma_result.append(token.lemma_)
    return lemma_result

In [13]:
def pipeline(review):
    review = make_token(review)
    review = remove_stopwords(review)
    return lemmatization(review)

In [14]:
%%time
reviews = list(map(lambda review: pipeline(review),reviews))

CPU times: user 35.9 s, sys: 62.8 ms, total: 36 s
Wall time: 36.1 s


In [15]:
from gensim.models import Word2Vec

In [16]:
emb_model = Word2Vec(reviews,size=100, min_count=3, window=5, workers=6)

In [17]:
reviews[:5]

[['bromwell',
  'high',
  'cartoon',
  'comedy',
  'run',
  'time',
  'program',
  'school',
  'life',
  'teacher',
  'year',
  'teach',
  'profession',
  'lead',
  'believe',
  'bromwell',
  'high',
  'satire',
  'much',
  'close',
  'reality',
  'teacher',
  'scramble',
  'survive',
  'financially',
  'insightful',
  'student',
  'right',
  'pathetic',
  'teacher',
  'pomp',
  'pettiness',
  'situation',
  'remind',
  'school',
  'know',
  'student',
  'see',
  'episode',
  'student',
  'repeatedly',
  'try',
  'burn',
  'down',
  'school',
  'immediately',
  'recall',
  'high',
  'classic',
  'line',
  'inspector',
  'sack',
  'teacher',
  'student',
  'welcome',
  'bromwell',
  'high',
  'expect',
  'adult',
  'age',
  'think',
  'bromwell',
  'high',
  'far',
  'fetch',
  'pity',
  'isn'],
 ['story',
  'man',
  'unnatural',
  'feeling',
  'pig',
  'start',
  'open',
  'scene',
  'terrific',
  'example',
  'absurd',
  'comedy',
  'formal',
  'orchestra',
  'audience',
  'turn',
  '

In [18]:
emb_model.wv.similar_by_word(word="good", topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('decent', 0.7010826468467712),
 ('alright', 0.6637446284294128),
 ('great', 0.654833197593689),
 ('nice', 0.6515093445777893),
 ('excellent', 0.6245449781417847)]

In [19]:
emb_model.wv.similar_by_word(word="bad", topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('terrible', 0.7174395322799683),
 ('horrible', 0.7109150290489197),
 ('suck', 0.7039785385131836),
 ('awful', 0.6831482648849487),
 ('lousy', 0.6812159419059753)]

In [20]:
emb_model.similar_by_word(word="be", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('would', 0.904498815536499),
 ('eventy', 0.8563498854637146),
 ('have', 0.840903639793396),
 ('-PRON-', 0.7821750640869141),
 ('s', 0.7486315965652466)]

In [21]:
emb_model.most_similar(positive=  "bad",topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('terrible', 0.7174395322799683),
 ('horrible', 0.7109150290489197),
 ('suck', 0.7039785385131836),
 ('awful', 0.6831482648849487),
 ('lousy', 0.6812159419059753)]

In [22]:
emb_model.similarity("good","bad")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.5900896

In [23]:
emb_model.similarity("good","be")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.27340782

In [24]:
emb_model.accuracy

<bound method Word2Vec.accuracy of <gensim.models.word2vec.Word2Vec object at 0x7f5683f8e208>>

In [25]:
emb_model.similar_by_word(word="school", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('college', 0.7996352910995483),
 ('class', 0.7899004220962524),
 ('student', 0.7800016403198242),
 ('schooler', 0.7695136070251465),
 ('teacher', 0.7596186995506287)]

In [26]:
emb_model.similar_by_word(word="comedy", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('slapstick', 0.7126612663269043),
 ('satire', 0.7032158374786377),
 ('farce', 0.6686745882034302),
 ('parody', 0.6662254333496094),
 ('humor', 0.659766435623169)]

In [27]:
emb_model.similar_by_word(word="action", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('suspense', 0.6356149911880493),
 ('thrill', 0.6329518556594849),
 ('pace', 0.5751595497131348),
 ('choreograph', 0.5640361309051514),
 ('tense', 0.5584642291069031)]

In [28]:
emb_model.similar_by_word(word="sad", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('depress', 0.7577664852142334),
 ('cry', 0.7323997616767883),
 ('honest', 0.665052056312561),
 ('happy', 0.6612464785575867),
 ('heartwarming', 0.6591799855232239)]

In [29]:
def word2idx(review):
    index_review = []
    for word in review:
        try:
            index_review.append(emb_model.wv.vocab[word].index)
        except: 
             pass
    return torch.tensor(index_review)

In [30]:
max([len(review) for review in reviews])

1423

In [31]:
emb_model.wv.index2word[0]

'br'

In [74]:
padding_value = len(emb_model.wv.index2word)

In [33]:
review_lengths = [len(sentence) for sentence in reviews]

In [34]:
longest_review = max(review_lengths)

In [35]:
BATCH_SIZE = 128

In [36]:
import torch

In [37]:
import torch.nn as nn

In [38]:
torch.nn.utils.rnn.pad_sequence([torch.tensor([1,2,3]), torch.tensor([3,4])], batch_first=True, padding_value=6)

tensor([[1, 2, 3],
        [3, 4, 6]])

In [39]:
emb_model.wv.vectors.shape

(28166, 100)

In [40]:
weights = torch.FloatTensor(emb_model.wv.vectors)

In [41]:
weights.shape

torch.Size([28166, 100])

In [42]:
embedding = nn.Embedding.from_pretrained(weights)

In [43]:
embedding.weight.requires_grad

False

In [44]:
index_review = list(map(lambda review: word2idx(review),reviews))

In [45]:
longest_review = max([len(review) for review in index_review])

In [46]:
longest_review

1400

In [47]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(x))
        
        #embedded = [sent len, batch size, emb dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim * num directions]
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
                
        #hidden = [batch size, hid dim * num directions]
            
        return self.fc(hidden.squeeze(0))

In [48]:
review_lengths = [len(sentence) for sentence in index_review]

In [49]:
# index_review = torch.nn.utils.rnn.pad_sequence(index_review,batch_first=True)

In [50]:
review_lengths[:5]

[66, 58, 197, 414, 74]

In [51]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [52]:
device

device(type='cuda')

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
labels = [0 if label == 'negative' else 1 for label in labels ]

In [55]:
X_train, X_test, y_train, y_test = train_test_split(index_review, labels, test_size=0.2)

In [56]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [57]:
len(X_train)

16000

In [58]:
y_train = torch.Tensor(y_train)
y_val = torch.Tensor(y_val)
y_test = torch.Tensor(y_test)

In [59]:
labels[:5]

[1, 0, 1, 0, 1]

In [60]:
X_train = torch.nn.utils.rnn.pad_sequence(X_train,batch_first=True)
X_val = torch.nn.utils.rnn.pad_sequence(X_val,batch_first=True)
X_test = torch.nn.utils.rnn.pad_sequence(X_test,batch_first=True)

In [61]:
#torch.stack(X_train)

In [62]:
len(X_train[0]),len(X_train[1])

(893, 893)

In [63]:
INPUT_DIM = longest_review
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

In [64]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [65]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [66]:
criterion = nn.BCEWithLogitsLoss()

In [67]:
model = model.to(device)
criterion = criterion.to(device)

In [68]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [79]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        
        optimizer.zero_grad()
                
        predictions = model(batch["text"]).squeeze(1)
        
        
        loss = criterion(predictions, batch["label"])
        
        acc = binary_accuracy(predictions, batch["label"].cuda())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# def train(model, iterator, optimizer, criterion):
    
#     epoch_loss = 0
#     epoch_acc = 0
    
#     model.train()
    
#     for batch in iterator:
        
#         batch1 = {}
        
#         batch1["text"] = batch["text"].t().cuda()
        
#         optimizer.zero_grad()
                
#         predictions = model(batch1["text"]).squeeze(1)
        
# #         print(predictions.shape)
# #         print(batch["label"].shape)
        
#         loss = criterion(predictions, batch["label"].cuda())
        
#         acc = binary_accuracy(predictions, batch["label"].cuda())
        
#         loss.backward()
        
#         optimizer.step()
        
#         epoch_loss += loss.item()
#         epoch_acc += acc.item()
        
#     return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [80]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            
            predictions = model(batch["text"]).squeeze(1)
          
            loss = criterion(predictions, batch["label"])
            
            acc = binary_accuracy(predictions, batch["label"])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# def evaluate(model, iterator, criterion):
    
#     epoch_loss = 0
#     epoch_acc = 0
    
#     model.eval()
    
#     with torch.no_grad():
    
#         for batch in iterator:
#             batch1 = {}
#             batch1["text"] = batch["text"].t().cuda()

#             predictions = model(batch1["text"]).squeeze(1)
          
#             loss = criterion(predictions, batch["label"].cuda())
            
#             acc = binary_accuracy(predictions, batch["label"].cuda())

#             epoch_loss += loss.item()
#             epoch_acc += acc.item()
        
#     return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [71]:
# N_EPOCHS = 5

# for epoch in range(N_EPOCHS):

#     train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
#     print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

In [72]:
X_train

tensor([[  55,  250,   18,  ...,    0,    0,    0],
        [   2,  524,  856,  ...,    0,    0,    0],
        [ 592,    2,   51,  ...,    0,    0,    0],
        ...,
        [  28,  136,  264,  ...,    0,    0,    0],
        [ 467,  467,  549,  ...,    0,    0,    0],
        [ 501, 2136,  598,  ...,    0,    0,    0]])

In [75]:
batch_size = 128 
import numpy as np

def iterator_func(X,y):
    
    size = len(X)
    
    permutation = np.random.permutation(size)
    iterator = []
    
    for i in range(0,size, batch_size):
        indices = permutation[i:i+batch_size]
        
        batch = {}
        batch['text'] = [X[i] for i in indices]
        batch["lengths"] = [len(review) for review in batch["text"]]
        batch['label'] = [y[i] for i in indices]
        
        #sorted
        batch["text"],batch["label"] = zip(*sorted(zip(batch["text"],batch["label"]),key=lambda x: len(x[0]),reverse=True))

        
        batch["text"] = torch.nn.utils.rnn.pad_sequence(batch["text"],batch_first=True,padding_value=padding_value).t().cuda()
        batch["label"] = torch.Tensor(batch["label"]).cuda()
        
        
        
        iterator.append(batch)
    return iterator
        
train_iterator = iterator_func(X_train,y_train)
valid_iterator = iterator_func(X_val,y_val)
test_iterator = iterator_func(X_test,y_test)



# batch_size = 128 

# def iterator_func(X,y):
#     permutation = torch.randperm(X.size()[0])
#     iterator = []
#     for i in range(0,X.size()[0], batch_size):
#         indices = permutation[i:i+batch_size]
#         batch = {}
#         batch['text'] = X[indices]
#         batch['label'] = y[indices]
#         iterator.append(batch)
#     return iterator
        
# train_iterator = iterator_func(X_train,y_train)
# valid_iterator = iterator_func(X_val,y_val)
# test_iterator = iterator_func(X_test,y_test)


In [76]:
len(valid_iterator)

32

In [77]:
model

RNN(
  (embedding): Embedding(28166, 100)
  (rnn): LSTM(100, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
)

In [81]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

| Epoch: 01 | Train Loss: 0.631 | Train Acc: 63.48% | Val. Loss: 0.679 | Val. Acc: 60.28% |
| Epoch: 02 | Train Loss: 0.473 | Train Acc: 78.33% | Val. Loss: 0.373 | Val. Acc: 83.86% |
| Epoch: 03 | Train Loss: 0.398 | Train Acc: 82.40% | Val. Loss: 0.351 | Val. Acc: 85.52% |
| Epoch: 04 | Train Loss: 0.369 | Train Acc: 84.11% | Val. Loss: 0.339 | Val. Acc: 85.79% |
| Epoch: 05 | Train Loss: 0.350 | Train Acc: 85.09% | Val. Loss: 0.324 | Val. Acc: 86.74% |


In [82]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

| Test Loss: 0.316 | Test Acc: 86.64% |
