In [1]:
def load_data():
    reviewsFile = open('reviews.txt','r')
    reviews = list(map(lambda x:x[:-1],reviewsFile.readlines()))
    reviewsFile.close()

    labelsFile = open('labels.txt','r')
    labels = list(map(lambda x:x[:-1],labelsFile.readlines()))
    labelsFile.close()
    
    return reviews,labels

In [2]:
reviews,labels = load_data()

In [3]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("\w+\'?\w+|\w+")

In [4]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [5]:
from spacy.lang.en.stop_words import STOP_WORDS

In [6]:
exceptionStopWords = {
    'again',
    'against',
    'ain',
    'almost',
    'among',
    'amongst',
    'amount',
    'anyhow',
    'anyway',
    'aren',
    "aren't",
    'below',
    'bottom',
    'but',
    'cannot',
    'couldn',
    "couldn't",
    'didn',
    "didn't",
    'doesn',
    "doesn't",
    'don',
    "don't",
    'done',
    'down',
    'except',
    'few',
    'hadn',
    "hadn't",
    'hasn',
    "hasn't",
    'haven',
    "haven't",
    'however',
    'isn',
    "isn't",
    'least',
    'mightn',
    "mightn't",
    'move',
    'much',
    'must',
    'mustn',
    "mustn't",
    'needn',
    "needn't",
    'neither',
    'never',
    'nevertheless',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'should',
    "should've",
    'shouldn',
    "shouldn't",
    'too',
    'top',
    'up',
    'wasn',
    "wasn't",
    'well',
    'weren',
    "weren't",
    'won',
    "won't",
    'wouldn',
    "wouldn't",
}

In [7]:
stop_words = set(stop_words).union(STOP_WORDS)

In [8]:
final_stop_words = stop_words-exceptionStopWords

In [9]:
import spacy
nlp = spacy.load("en",disable=['parser', 'tagger', 'ner'])

In [10]:
def make_token(review):
    return tokenizer.tokenize(str(review))

In [11]:
def remove_stopwords(review):
    return [token for token in review if token not in final_stop_words]

In [12]:
def lemmatization(review):
    lemma_result = []
    
    for words in review:
        doc = nlp(words)
        for token in doc:
            lemma_result.append(token.lemma_)
    return lemma_result

In [13]:
def pipeline(review):
    review = make_token(review)
    review = remove_stopwords(review)
    return lemmatization(review)

In [14]:
%%time
reviews = list(map(lambda review: pipeline(review),reviews))

CPU times: user 34.5 s, sys: 102 ms, total: 34.6 s
Wall time: 34.6 s


In [15]:
from gensim.models import Word2Vec

In [16]:
emb_model = Word2Vec(reviews,size=100, min_count=3, window=5, workers=6)

In [17]:
reviews[:5]

[['bromwell',
  'high',
  'cartoon',
  'comedy',
  'run',
  'time',
  'program',
  'school',
  'life',
  'teacher',
  'year',
  'teach',
  'profession',
  'lead',
  'believe',
  'bromwell',
  'high',
  'satire',
  'much',
  'close',
  'reality',
  'teacher',
  'scramble',
  'survive',
  'financially',
  'insightful',
  'student',
  'right',
  'pathetic',
  'teacher',
  'pomp',
  'pettiness',
  'situation',
  'remind',
  'school',
  'know',
  'student',
  'see',
  'episode',
  'student',
  'repeatedly',
  'try',
  'burn',
  'down',
  'school',
  'immediately',
  'recall',
  'high',
  'classic',
  'line',
  'inspector',
  'sack',
  'teacher',
  'student',
  'welcome',
  'bromwell',
  'high',
  'expect',
  'adult',
  'age',
  'think',
  'bromwell',
  'high',
  'far',
  'fetch',
  'pity',
  'isn'],
 ['story',
  'man',
  'unnatural',
  'feeling',
  'pig',
  'start',
  'open',
  'scene',
  'terrific',
  'example',
  'absurd',
  'comedy',
  'formal',
  'orchestra',
  'audience',
  'turn',
  '

In [18]:
emb_model.wv.similar_by_word(word="good", topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('decent', 0.7074456810951233),
 ('darn', 0.6548200845718384),
 ('alright', 0.6462702751159668),
 ('great', 0.6457749605178833),
 ('nice', 0.6239773035049438)]

In [19]:
emb_model.wv.similar_by_word(word="bad", topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('suck', 0.7208997011184692),
 ('horrible', 0.7207365036010742),
 ('terrible', 0.714970588684082),
 ('awful', 0.6812218427658081),
 ('lame', 0.6796892881393433)]

In [20]:
emb_model.similar_by_word(word="be", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('would', 0.9116214513778687),
 ('have', 0.8634248971939087),
 ('-PRON-', 0.8267257213592529),
 ('eventy', 0.7624976634979248),
 ('s', 0.7046369314193726)]

In [21]:
emb_model.most_similar(positive=  "bad",topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('suck', 0.7208997011184692),
 ('horrible', 0.7207365036010742),
 ('terrible', 0.714970588684082),
 ('awful', 0.6812218427658081),
 ('lame', 0.6796892881393433)]

In [22]:
emb_model.similarity("good","bad")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.59436196

In [23]:
emb_model.similarity("good","be")

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


0.27517337

In [24]:
emb_model.accuracy

<bound method Word2Vec.accuracy of <gensim.models.word2vec.Word2Vec object at 0x7f2bda621be0>>

In [25]:
emb_model.similar_by_word(word="school", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('student', 0.7639281749725342),
 ('college', 0.7610238790512085),
 ('class', 0.7575020790100098),
 ('teacher', 0.7121667861938477),
 ('schoolers', 0.7093414664268494)]

In [26]:
emb_model.similar_by_word(word="comedy", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('satire', 0.6905634999275208),
 ('slapstick', 0.6813418865203857),
 ('humor', 0.666489839553833),
 ('parody', 0.6656045913696289),
 ('farce', 0.6495898962020874)]

In [27]:
emb_model.similar_by_word(word="action", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('suspense', 0.638068675994873),
 ('thrill', 0.6378527879714966),
 ('overabundance', 0.57851642370224),
 ('excitement', 0.5689365863800049),
 ('gory', 0.565312385559082)]

In [28]:
emb_model.similar_by_word(word="sad", topn=5)

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('depress', 0.746796727180481),
 ('cry', 0.7335566282272339),
 ('happy', 0.6977354884147644),
 ('touch', 0.664000391960144),
 ('honestly', 0.6391709446907043)]

In [29]:
def word2idx(review):
    index_review = []
    for word in review:
        try:
            index_review.append(emb_model.wv.vocab[word].index)
        except: 
             pass
    return torch.tensor(index_review)

In [30]:
max([len(review) for review in reviews])

1423

In [31]:
emb_model.wv.index2word[0]

'br'

In [126]:
len(emb_model.wv.index2word)

28166

In [127]:
padding_value = len(emb_model.wv.index2word)

In [33]:
review_lengths = [len(sentence) for sentence in reviews]

In [34]:
longest_review = max(review_lengths)

In [35]:
BATCH_SIZE = 128

In [36]:
import torch

In [37]:
import torch.nn as nn

In [38]:
torch.nn.utils.rnn.pad_sequence([torch.tensor([1,2,3]), torch.tensor([3,4])], batch_first=True, padding_value=6)

tensor([[1, 2, 3],
        [3, 4, 6]])

In [39]:
emb_model.wv.vectors.shape

(28166, 100)

In [40]:
weights = torch.FloatTensor(emb_model.wv.vectors)

In [41]:
weights.shape

torch.Size([28166, 100])

In [42]:
embedding = nn.Embedding.from_pretrained(weights)

In [43]:
embedding.weight.requires_grad

False

In [60]:
index_review = list(map(lambda review: word2idx(review),reviews))

In [61]:
type(index_review)

list

In [62]:
longest_review = max([len(review) for review in index_review])

In [63]:
longest_review

1400

In [64]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
#         self.embedding = nn.Embedding(input_dim,embedding_dim).from_pretrained(weights)
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        
    def forward(self, x):

        embedded = self.embedding(x)
        
        output, hidden = self.rnn(embedded)
                
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))

In [65]:
review_lengths = [len(sentence) for sentence in index_review]

In [66]:
# index_review = torch.nn.utils.rnn.pad_sequence(index_review,batch_first=True)

In [67]:
review_lengths[:5]

[66, 58, 197, 414, 74]

In [68]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [69]:
device

device(type='cuda')

In [70]:
from sklearn.model_selection import train_test_split

In [71]:
labels = [0 if label == 'negative' else 1 for label in labels ]

In [72]:
X_train, X_test, y_train, y_test = train_test_split(index_review, labels, test_size=0.2)

In [73]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [74]:
len(X_train)

16000

In [75]:
# y_train = torch.Tensor(y_train)
# y_val = torch.Tensor(y_val)
# y_test = torch.Tensor(y_test)

In [59]:
labels[:5]

[1, 0, 1, 0, 1]

In [76]:
# X_train = torch.nn.utils.rnn.pad_sequence(X_train,batch_first=True)
# X_val = torch.nn.utils.rnn.pad_sequence(X_val,batch_first=True)
# X_test = torch.nn.utils.rnn.pad_sequence(X_test,batch_first=True)

In [77]:
#torch.stack(X_train)

In [78]:
len(X_train[0]),len(X_train[1])

(92, 71)

In [79]:
INPUT_DIM = longest_review
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

In [80]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [81]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [82]:
criterion = nn.BCEWithLogitsLoss()

In [83]:
model = model.to(device)
criterion = criterion.to(device)

In [84]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [85]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        batch1 = {}
        
        
        
        batch1["text"] = batch["text"].t().cuda()
        
        optimizer.zero_grad()
                
        predictions = model(batch1["text"]).squeeze(1)
        
        
        
        loss = criterion(predictions, batch["label"].cuda())
        
        acc = binary_accuracy(predictions, batch["label"].cuda())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


In [86]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            batch1 = {}
            batch1["text"] = batch["text"].t().cuda()

            predictions = model(batch1["text"]).squeeze(1)
          
            loss = criterion(predictions, batch["label"].cuda())
            
            acc = binary_accuracy(predictions, batch["label"].cuda())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [87]:
# N_EPOCHS = 5

# for epoch in range(N_EPOCHS):

#     train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
#     print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

In [88]:
X_train

[tensor([  139,    36,    19,   534,    39,     1,   914,   508,    91,    19,
          1135,     0,     0,    42,   107,     2,    19,   552,  3013,    57,
           122,     9,     2,     0,     0,   245,  1806,   979,   835,     1,
          3013,   526,  5748, 14461,  5644,   147,    84,    92,     2,     0,
             0,     5,    72,   531,  1357,   140,   158,   742,   432,   275,
           344,     5,    30,   288,     2,    45,   100,     0,     0,     2,
           400,  1062,   697,  1054,     1,     6,    60,  3849,  3440,  2076,
             0,     0,    18,  3013,    71,   119,    35,    68,    82,    37,
           487,   608,   111,   242,    51,   178,    98,    61,  3013,  1135,
           295,     2]),
 tensor([    5,     1,  1247,    53,  1956,  1746,   341,    96,     1,   551,
             4,   292,    80,  1582,  1450,   402,   950,  3302,    28,    14,
            69,   101,     1,   322,   160,   608,    22,  6754,    41,     5,
            28,    22,    6

In [92]:
batch_size = 128 

def iterator_func(X,y):
    
    permutation = torch.randperm(X.size()[0])
    iterator = []
    for i in range(0,X.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch = {}
        batch['text'] = X_train[indices]
        batch["lengths"] = [len(review) for review in batch["text"]]
        batch['label'] = y_train[indices]
        
        #sorted
        batch["text"],batch["label"] = zip(*sorted(zip(batch["text"],batch["label"]),key = len(x[0]),reverse=True))
        
        batch["text"] = torch.Tensor(batch["text"])
        batch["label"] = torch.Tensor(batch["label"])
        batch["text"] = torch.nn.utils.rnn.pad_sequence(batch["text"],batch_first=True,padding_value=padding_value)
        
        
        
        iterator.append(batch)
    return iterator
        
train_iterator = iterator_func(X_train,y_train)
valid_iterator = iterator_func(X_val,y_val)
test_iterator = iterator_func(X_test,y_test)

In [93]:
len(valid_iterator)

32

In [94]:
model

RNN(
  (embedding): Embedding(28166, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [None]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')

orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of predicti

orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of predicti

| Epoch: 01 | Train Loss: 0.278 | Train Acc: 99.20% | Val. Loss: 0.115 | Val. Acc: 100.00% |
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of label:
torch.Size([128])
orignal shape of batch text:
torch.Size([128, 862])
shape of prediction:
torch.Size([128])
orignal shape of l

In [None]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')

In [109]:
a = [["a","b"],["f","s","s","g","s"],["a","b","c","d"],["a","s","d"]]
b = [0,1,0,0]

In [123]:
f,g = zip(*sorted(zip(a,b), key=lambda x: len(x[0]),reverse=True))

In [125]:
f

(['f', 's', 's', 'g', 's'], ['a', 'b', 'c', 'd'], ['a', 's', 'd'], ['a', 'b'])