# PyTorch for Natural Language Processing

**Dataset**: Movie Reviews (sentiment analysis).

http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

In [1]:
from data import load

positive_reviews = load('pos')
negative_reviews = load('neg')

In [2]:
print(negative_reviews[0])


 " america's sweethearts " has an intriguing premise and a great cast , but it isn't nearly as edgy or funny as it should be . 
almost all the problems with the project can be traced back to co-script writer billy crystal , who shows the same lack of discipline with the screenplay that he typically displays while co-hosting " comic relief " charity shows with robin williams and whoopi goldberg ( two other paragons of self-indulgence ) . 
crystal ignores a simple , but crucial , rule : for a screwball comedy to work , the characters must be placed into a rigid social setting , because only in that context will their unorthodox antics be humorous . 
 " america's sweethearts " takes place at a press junket , where decorum must be maintained in front of the reporters . 
it's a promising set-up , but the screenplay quickly blows off the rules , thus dissipating the tension of the situation . 
by the end of the film , all the lead performers participate in a huge fight with a room full of jo

## Tokenization

we perform a simple tokenization step using a regex tokenizer

In [3]:
import re

def lowercased(reviews):
    pattern = re.compile('[\W]+', re.UNICODE)
    for doc in reviews:
        yield pattern.sub(r' ', str(doc).replace('\n', ' ').lower())
    

In [4]:
parsed_pos = lowercased(positive_reviews)
parsed_neg = lowercased(negative_reviews)

The extracted information will be added to a pandas dataframe

In [5]:
def get_dataframe(pos, neg):
    import pandas as pd
    docs = []
    for doc in pos:
        docs.append([doc, 1])
    for doc in neg:
        docs.append([doc, 0])
    return pd.DataFrame(docs, columns=['text', 'label'])

careful that the function lowercased is a generator and not a list!

In [6]:
dataset = get_dataframe(parsed_pos, parsed_neg)

In [7]:
dataset.head(5)

Unnamed: 0,text,label
0,sometimes a movie comes along that falls somew...,1
1,i swear i have seen the edge before in fact it...,1
2,with the abundance of trite recycled movies in...,1
3,contact is a film that tries to do several dif...,1
4,expectation rating a bit worse than expected m...,1


## Vocabulary

To initialize the network we to get a vocabulary in advance:

In [8]:
def get_vocab(df):
    vocab = set()
    for doc in df['text']:
        vocab |= set(doc.split())
    return vocab

In [9]:
vocab = get_vocab(dataset)
len(vocab)

39696

To perform a more efficient indexing of the embeddings we map each word to a unique id

In [10]:
word2id = {}
for i, word in enumerate(vocab):
    word2id[word] = i
word2id['the']

10037

Additionally we need to add two special tokens UNK and PAD

In [11]:
word2id = {'PAD':0, 'UNK':1}
for i, word in enumerate(vocab, 2):
    word2id[word] = i
word2id['the']

10039

In [12]:
def create_w2id(vocab):
    w2id = {'PAD':0, 'UNK':1}
    for i, word in enumerate(vocab, 2):
        w2id[word] = i
    return w2id

In [13]:
vocab =  get_vocab(dataset)
word2id = create_w2id(vocab)

len(word2id)

39698

Finally we create a function to convert a document to a vector of unique ids

In [14]:
def doc2ids(doc, word2id):
    return list(map(lambda x: word2id.get(x, word2id['UNK']), doc.split()))

And convert our dataset

In [15]:
dataset['X'] = dataset['text'].apply(lambda x :doc2ids(x, word2id))

In [16]:
dataset.head(5)

Unnamed: 0,text,label,X
0,sometimes a movie comes along that falls somew...,1,"[18709, 6119, 4976, 19592, 13979, 21782, 15443..."
1,i swear i have seen the edge before in fact it...,1,"[1943, 16863, 1943, 14122, 8610, 10039, 37531,..."
2,with the abundance of trite recycled movies in...,1,"[4362, 10039, 29077, 28356, 20400, 19701, 6340..."
3,contact is a film that tries to do several dif...,1,"[38731, 21804, 6119, 6444, 21782, 22489, 3285,..."
4,expectation rating a bit worse than expected m...,1,"[14067, 37634, 6119, 22692, 29894, 35667, 4642..."


## The network

First of all we create train and test splits:

In [17]:
from sklearn.model_selection import train_test_split
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(dataset['X'], dataset['label'], test_size=0.2, random_state=42, stratify=dataset['label'])
df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(df_X_train, df_y_train, test_size=0.2, random_state=42, stratify=df_y_train)

we import the required libraries

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F


The first network that we are going to define perform the average of word embeddings and learns a logistic regression:

- **Embedding layer** (mapping the list of ids to an embedding matrix)
- **Global Average Pooling** (we perform the mean of word embedding over the rows)
- A **logistic function** at the output (A linear layer followed by a sigmoid non linearity)

In [36]:
class AVGNet(nn.Module):
    
    def __init__(self, emb_dim, nb_emb):
        super(AVGNet, self).__init__()
        self.emb = nn.Embedding(nb_emb, emb_dim)
        self.out = nn.Linear(emb_dim, 1)
        
    def forward(self, x):
        embs = self.emb(x)
        pool = embs.mean(dim=1)
        out  = self.out(pool)
        out  = F.sigmoid(out)
        return out

In [37]:
mdl = AVGNet(5, len(word2id))
print(mdl)

AVGNet(
  (emb): Embedding(39698, 5)
  (out): Linear(in_features=5, out_features=1, bias=True)
)


In [33]:
sent = 'the film is horrible'

In [34]:
fake_sent = torch.LongTensor([doc2ids(sent, word2id)])
fake_sent.size()

torch.Size([1, 4])

In [35]:
mdl(fake_sent)

tensor([[[-0.8750,  0.0296,  0.2891, -0.3777,  0.3439],
         [-0.0120, -0.9149,  0.2452,  1.5830, -0.7774],
         [-0.0546,  0.0819, -0.8250,  1.1426,  0.8698],
         [-0.0499, -0.1935, -0.5581, -0.8661,  0.8754]]])
tensor([[-0.2479, -0.2492, -0.2122,  0.3705,  0.3279]])
tensor([[-0.3732]])


tensor([[ 0.4078]])

In [28]:
def pad_batch(batch):
    def pad(sent, max_len):
        diff = max_len - len(sent)
        return ([0]*diff)+sent
    max_len = max(len(sent) for sent in batch)
    return list(map(lambda x: pad(x, max_len), batch))

In [29]:
def batched_dataset(X, Y, batch_size=32):
    b_x, b_y = [], []
    for i, (x, y) in enumerate(zip(X, Y)):
        if i%batch_size == batch_size-1:
            yield torch.LongTensor(pad_batch(b_x)), torch.FloatTensor(b_y)
            b_x, b_y = [], []
        b_x.append(x)
        b_y.append([y])
    yield torch.LongTensor(pad_batch(b_x)), torch.FloatTensor(b_y)

In [30]:
mdl = AVGNet(300, len(word2id))
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.01)
print(mdl)


import time 
start = time.time()
for epoch in range(10):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x)
        loss = criterion(out, y)
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    print('{:.3f}'.format(running_loss/i))
print('{:.2f} sec'.format(time.time()-start))

AVGNet(
  (emb): Embedding(39698, 300)
  (out): Linear(in_features=300, out_features=1, bias=True)
)
0.761
0.720
0.605
0.462
0.325
0.214
0.134
0.110
0.062
0.050
81.49 sec


In [39]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
mdl = AVGNet(300, len(word2id)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.01)
print(mdl)


import time 
start = time.time()
for epoch in range(10):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    print('{:.3f}'.format(running_loss/i))
print('{:.2f} sec'.format(time.time()-start))

AVGNet(
  (emb): Embedding(39698, 300)
  (out): Linear(in_features=300, out_features=1, bias=True)
)
0.769
0.719
0.600
0.472
0.339
0.225
0.140
0.111
0.066
0.051
10.16 sec


In [41]:
mdl(fake_sent.to(device))

tensor(1.00000e-04 *
       [[ 3.4061]], device='cuda:0')

In [42]:
test_X = torch.LongTensor(pad_batch(df_X_test)).to(device)

In [43]:
pred = mdl(test_X)

In [46]:
out = pred.to('cpu').data.numpy()
out

array([[2.45975126e-02],
       [2.01893374e-02],
       [9.89087462e-01],
       [8.32840621e-01],
       [7.72233009e-01],
       [9.94890809e-01],
       [7.89847136e-01],
       [3.60090025e-02],
       [2.04081565e-01],
       [9.88839447e-01],
       [2.20005840e-01],
       [8.73324990e-01],
       [4.56796378e-01],
       [2.97514498e-01],
       [3.45307216e-02],
       [1.05342567e-01],
       [2.09574178e-01],
       [9.10596550e-02],
       [9.94313598e-01],
       [8.43070924e-01],
       [9.10653412e-01],
       [3.88665423e-02],
       [7.71574676e-01],
       [9.06597972e-01],
       [4.43765432e-01],
       [2.90626466e-01],
       [9.29496288e-01],
       [9.05572116e-01],
       [2.24347413e-01],
       [9.89174962e-01],
       [2.31569320e-01],
       [8.40184152e-01],
       [3.78746510e-01],
       [1.37483940e-01],
       [2.35652570e-02],
       [9.73054886e-01],
       [9.76708889e-01],
       [9.95333850e-01],
       [7.86099769e-03],
       [7.91313887e-01],


In [47]:
from sklearn.metrics import accuracy_score

In [48]:
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.86

In [57]:
class CNNNet(nn.Module):
    
    def __init__(self, emb_dim, vocab_len):
        super(CNNNet, self).__init__()
        self.emb = nn.Embedding(embedding_dim=emb_dim, num_embeddings=vocab_len)
        self.conv = nn.Conv1d(kernel_size=5, out_channels=100, in_channels=emb_dim)
        self.out = nn.Linear(100, 1)
        
    def forward(self, x):
        emb = self.emb(x)
        conv_out = self.conv(emb.transpose(1, 2))
        h1 = F.relu(F.max_pool1d(conv_out, conv_out.size(2))).squeeze(2)
        return F.sigmoid(self.out(h1))

In [58]:
mdl = CNNNet(5, len(word2id))
mdl(fake_sent)

RuntimeError: Calculated padded input size per channel: (1 x 4). Kernel size: (1 x 5). Kernel size can't greater than actual input size at /pytorch/aten/src/THNN/generic/SpatialConvolutionMM.c:48

In [59]:
mdl = CNNNet(300, len(word2id)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.005)
print(mdl)


import time 
start = time.time()
for epoch in range(10):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    print('{:.3f}'.format(running_loss/i))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(39698, 300)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (out): Linear(in_features=100, out_features=1, bias=True)
)
0.748
0.416
0.100
0.031
0.061
0.062
0.024
0.017
0.001
0.000
16.03 sec


In [60]:
pred = mdl(test_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.78

In [43]:
df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(df_X_train, df_y_train, test_size=0.2, random_state=42, stratify=df_y_train)

In [44]:
mdl = CNNNet(300, len(word2id)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.001)
print(mdl)


import time 
start = time.time()
for epoch in range(10):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    print('{:.3f}'.format(running_loss/(i%30+1)))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(39698, 300)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (out): Linear(in_features=100, out_features=1, bias=True)
)
7.968
5.193
3.413
2.249
1.725
1.478
1.265
1.192
0.332
0.190
46.80 sec


In [45]:
pred = mdl(test_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.7625

In [62]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, len(word2id)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.001)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
val_y = torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1)).to(device)


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/i, best_val_loss, val_loss.item()))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(39698, 300)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (out): Linear(in_features=100, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


0.698  0.627 0.627
0.454  0.627 0.662
0.308  0.627 0.825
0.263  0.554 0.554
0.175  0.554 0.578
0.087  0.547 0.547
0.048  0.547 0.578
0.036  0.529 0.529
0.023  0.529 0.543
0.018  0.529 0.537
0.015  0.529 0.540
20.61 sec


In [63]:
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.cpu().data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.77

In [64]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = AVGNet(300, len(word2id)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.01)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
val_y = torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1)).to(device)


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/i, best_val_loss, val_loss.item()))
print('{:.2f} sec'.format(time.time()-start))

AVGNet(
  (emb): Embedding(39698, 300)
  (out): Linear(in_features=300, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


0.757  0.813 0.813
0.705  0.665 0.665
0.566  0.665 0.665
0.433  0.604 0.604
0.304  0.473 0.473
0.191  0.424 0.424
0.125  0.402 0.402
0.096  0.402 0.408
0.057  0.377 0.377
0.046  0.377 0.379
0.037  0.367 0.367
0.035  0.367 0.370
0.033  0.367 0.410
0.037  0.367 0.492
15.91 sec


In [65]:
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.cpu().data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.86

In [66]:
from gensim.models import KeyedVectors
w2v = KeyedVectors.load_word2vec_format('/media/dbonadiman/Data linux/embs/glove.txt', binary=False)

In [67]:
set(w2v.vocab)

{'2009Covidien',
 '11981',
 'Hindsboro',
 '700m2',
 'Lindsanity',
 'Streetmate',
 'small-diced',
 '90,000-square-foot',
 'Buchannan',
 'fair-lending',
 'surfjac',
 'IZAR',
 'KRS1',
 'aagain',
 'Moriwaki',
 'perfromed',
 'IIINew',
 'Bittenbender',
 'out-of-step',
 'NataliePace.com',
 'NYDailyNews',
 'Gsm',
 'grabbag',
 'ients',
 'pic_only',
 'Signifyin',
 'D7000',
 'non-Hindi',
 'Centercutting',
 '16740',
 'Buñol',
 'Qto',
 'Prehosp',
 'Creamware',
 'Muljat',
 'HunterThe',
 'Cyniquian',
 'Gerbeaud',
 'meloncholy',
 '45oz',
 'dfi',
 'Shotshell',
 '18:47:32',
 '05:07:49',
 '1,2009',
 '14-room',
 'Henok',
 'SpecialsParts',
 'member?Register',
 'reporter/editor',
 'Sopar',
 'APOA',
 'Lumpkin',
 'mannan',
 'Habenero',
 'CENTRALE',
 'schmitty',
 'POOLED',
 'caved-in',
 'LANforge',
 '12/19/2002',
 '14:36:09',
 'annexing',
 'hisses',
 'Candaras',
 'SAFM',
 '1994,1995',
 'Lintu',
 'Montanelli',
 'prodige',
 'cold-season',
 '-2266',
 'feet-first',
 'somethingPoliticslearn',
 'Erysimum',
 'Z-SLCT'

In [68]:
def get_vocab(df, w2vec=None):
    vocab = set()
    for doc in df['text']:
        vocab |= set(doc.split())
    if w2vec:
        vocab &= set(w2vec)
    return vocab

r_vocab =  get_vocab(dataset, w2v.vocab)
r_word2id = create_w2id(r_vocab)

In [69]:
len(r_word2id)

36521

In [70]:
dataset['X_r'] = dataset['text'].apply(lambda x :doc2ids(x, r_word2id))

In [71]:
dataset.head(5)

Unnamed: 0,text,label,X,X_r
0,sometimes a movie comes along that falls somew...,1,"[18709, 6119, 4976, 19592, 13979, 21782, 15443...","[26927, 2808, 20503, 27313, 24671, 9965, 7061,..."
1,i swear i have seen the edge before in fact it...,1,"[1943, 16863, 1943, 14122, 8610, 10039, 37531,...","[871, 26057, 871, 6489, 3966, 4611, 35554, 194..."
2,with the abundance of trite recycled movies in...,1,"[4362, 10039, 29077, 28356, 20400, 19701, 6340...","[20238, 4611, 31679, 31335, 27663, 27362, 2110..."
3,contact is a film that tries to do several dif...,1,"[38731, 21804, 6119, 6444, 21782, 22489, 3285,...","[36085, 9976, 2808, 21147, 9965, 28641, 19723,..."
4,expectation rating a bit worse than expected m...,1,"[14067, 37634, 6119, 22692, 29894, 35667, 4642...","[24711, 35601, 2808, 28736, 32077, 16306, 2035..."


In [72]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(dataset['X_r'], dataset['label'], test_size=0.2, random_state=42, stratify=dataset['label'])

In [73]:
df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(df_X_train, df_y_train, test_size=0.2, random_state=42, stratify=df_y_train)

In [74]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, len(word2id)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.001)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
val_y = torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1)).to(device)


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/i, best_val_loss, val_loss.item()))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(39698, 300)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (out): Linear(in_features=100, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


0.708  0.644 0.644
0.472  0.644 0.712
0.329  0.644 0.744
0.243  0.588 0.588
0.147  0.561 0.561
0.077  0.554 0.554
0.049  0.554 0.580
0.038  0.554 0.558
0.027  0.554 0.578
17.04 sec


In [75]:
test_X = torch.LongTensor(pad_batch(df_X_test)).to(device)
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.7875

In [76]:
valid_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
mdl = torch.load('best.pt')
pred = mdl(valid_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_valid)

0.696875

In [77]:
def initialize_embs(embs, w2v, dictionary):
    e = np.array([w2v[w] for w in w2v.vocab])
    embeddings_mean = float(np.mean(e))
    embeddings_std = float(np.std(e))
    embs.weight.data.normal_(embeddings_mean, embeddings_std)
    for w in dictionary:
        embs.weight.data[dictionary[w]].copy_(
            torch.from_numpy(w2v[w]))

In [78]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, len(word2id)).to(device)
initialize_embs(mdl.emb, w2v, r_word2id)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.001)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
val_y = torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1)).to(device)


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/i, best_val_loss, val_loss.item()))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(39698, 300)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (out): Linear(in_features=100, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


0.698  0.643 0.643
0.538  0.567 0.567
0.364  0.485 0.485
0.199  0.424 0.424
0.095  0.393 0.393
0.050  0.393 0.408
0.032  0.393 0.405
0.021  0.393 0.396
0.011  0.380 0.380
0.008  0.379 0.379
0.006  0.378 0.378
0.005  0.378 0.379
0.004  0.378 0.382
0.003  0.378 0.386
25.79 sec


In [79]:
test_X = torch.LongTensor(pad_batch(df_X_test)).to(device)
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.8475

In [80]:
valid_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
mdl = torch.load('best.pt')
pred = mdl(valid_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_valid)

0.80625

In [81]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = AVGNet(300, len(word2id)).to(device)
initialize_embs(mdl.emb, w2v, r_word2id)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(mdl.parameters(), lr=0.01)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
val_y = torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1)).to(device)


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/(i%30+1), best_val_loss, val_loss.item()))
print('{:.2f} sec'.format(time.time()-start))

AVGNet(
  (emb): Embedding(39698, 300)
  (out): Linear(in_features=300, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


2.617  0.670 0.670
2.292  0.644 0.644
1.658  0.575 0.575
1.036  0.495 0.495
0.620  0.397 0.397
0.350  0.397 0.406
0.307  0.397 0.411
0.146  0.366 0.366
0.129  0.364 0.364
0.108  0.364 0.389
0.119  0.364 0.453
0.123  0.364 0.592
13.84 sec


In [82]:
test_X = torch.LongTensor(pad_batch(df_X_test)).to(device)
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.8575

In [83]:
valid_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
mdl = torch.load('best.pt')
pred = mdl(valid_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_valid)

0.859375

In [85]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = AVGNet(300, len(word2id)).to(device)
initialize_embs(mdl.emb, w2v, r_word2id)
mdl.emb.weight.requires_grad = False
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, mdl.parameters()), lr=0.1)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
val_y = torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1)).to(device)


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/(i%30+1), best_val_loss, val_loss.item()))
print('{:.2f} sec'.format(time.time()-start))

AVGNet(
  (emb): Embedding(39698, 300)
  (out): Linear(in_features=300, out_features=1, bias=True)
)
3.878  0.920 0.920


  "type " + obj.__name__ + ". It won't be checked "


3.053  0.674 0.674
2.647  0.658 0.658
2.566  0.646 0.646
2.493  0.636 0.636
2.424  0.632 0.632
2.374  0.627 0.627
2.333  0.622 0.622
2.297  0.617 0.617
2.265  0.611 0.611
2.235  0.606 0.606
2.208  0.601 0.601
2.182  0.596 0.596
2.159  0.592 0.592
2.137  0.588 0.588
2.117  0.584 0.584
2.097  0.580 0.580
2.079  0.576 0.576
2.062  0.573 0.573
2.047  0.569 0.569
2.032  0.566 0.566
2.017  0.563 0.563
2.004  0.560 0.560
1.991  0.557 0.557
1.979  0.555 0.555
1.967  0.552 0.552
1.956  0.549 0.549
1.946  0.547 0.547
1.935  0.545 0.545
1.926  0.542 0.542
1.916  0.540 0.540
1.907  0.538 0.538
1.899  0.536 0.536
1.890  0.534 0.534
1.882  0.532 0.532
1.875  0.530 0.530
1.867  0.528 0.528
1.860  0.526 0.526
1.853  0.524 0.524
1.846  0.522 0.522
1.839  0.521 0.521
1.833  0.519 0.519
1.827  0.517 0.517
1.820  0.516 0.516
1.815  0.514 0.514
1.809  0.512 0.512
1.803  0.511 0.511
1.798  0.509 0.509
1.792  0.508 0.508
1.787  0.507 0.507
13.26 sec


In [86]:
test_X = torch.LongTensor(pad_batch(df_X_test)).to(device)
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.7375

In [87]:
valid_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
mdl = torch.load('best.pt')
pred = mdl(valid_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_valid)

0.734375

In [88]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, len(word2id)).to(device)
initialize_embs(mdl.emb, w2v, r_word2id)
mdl.emb.weight.requires_grad = False
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, mdl.parameters()), lr=0.001)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
val_y = torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1)).to(device)


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.to(device))
        loss = criterion(out, y.to(device))
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.item() < best_val_loss:
        best_val_loss = val_loss.item()
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/(i%30+1), best_val_loss, val_loss.item()))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(39698, 300)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (out): Linear(in_features=100, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


2.542  0.645 0.645
2.043  0.581 0.581
1.562  0.517 0.517
1.087  0.474 0.474
0.714  0.432 0.432
0.455  0.390 0.390
0.297  0.374 0.374
0.211  0.366 0.366
0.154  0.364 0.364
0.123  0.364 0.418
0.113  0.364 0.471
0.107  0.364 0.452
0.100  0.359 0.359
0.086  0.359 0.448
0.054  0.359 0.483
0.030  0.359 0.362
10.97 sec


In [89]:
test_X = torch.LongTensor(pad_batch(df_X_test)).to(device)
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.84

In [90]:
valid_X = torch.LongTensor(pad_batch(df_X_valid)).to(device)
mdl = torch.load('best.pt')
pred = mdl(valid_X)
out = pred.to('cpu').data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_valid)

0.84375

In [66]:
class CNNNet(nn.Module):
    
    def __init__(self, emb_dim, vocab):
        super(CNNNet, self).__init__()
        self.emb = nn.Embedding(embedding_dim=emb_dim, num_embeddings=len(vocab),padding_idx=0)
        self.conv = nn.Conv1d(kernel_size=5, out_channels=100, in_channels=300)
        self.out = nn.Linear(100, 1)
        
    def forward(self, x):
        emb = self.emb(x)
        conv_out = self.conv(emb.transpose(1, 2))
        h1 = F.relu(F.max_pool1d(conv_out, conv_out.size(2))).squeeze(2)
        return F.sigmoid(self.out(h1))

In [67]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, w2idx).cuda()
initialize_embs(mdl.emb, w2v, restrict_w2id)
mdl.emb.weight.requires_grad = False
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, mdl.parameters()), lr=0.001)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = Variable(torch.LongTensor(pad_batch(df_X_valid))).cuda()
val_y = Variable(torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1))).cuda()


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.cuda())
        loss = criterion(out, y.cuda())
        #if i%30 == 29:
        #    print('{:.3f}'.format(running_loss/30))
        #    running_loss = 0
        running_loss += loss.data[0]
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.data[0] < best_val_loss:
        best_val_loss = val_loss.data[0]
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/(i%30+1), best_val_loss, val_loss.data[0]))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(40259, 300, padding_idx=0)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (out): Linear(in_features=100, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


2.566  0.658 0.658
2.122  0.604 0.604
1.673  0.540 0.540
1.202  0.486 0.486
0.810  0.436 0.436
0.531  0.393 0.393
0.350  0.372 0.372
0.245  0.365 0.365
0.174  0.354 0.354
0.132  0.354 0.366
0.112  0.354 0.400
0.099  0.354 0.407
10.44 sec


In [68]:
test_X = Variable(torch.LongTensor(pad_batch(df_X_test))).cuda()
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.cpu().data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.8125

In [69]:
valid_X = Variable(torch.LongTensor(pad_batch(df_X_valid))).cuda()
mdl = torch.load('best.pt')
pred = mdl(valid_X)
out = pred.cpu().data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_valid)

0.834375

In [70]:
class CNNNet(nn.Module):
    
    def __init__(self, emb_dim, vocab):
        super(CNNNet, self).__init__()
        self.emb = nn.Embedding(embedding_dim=emb_dim, num_embeddings=len(vocab),padding_idx=0)
        self.conv = nn.Conv1d(kernel_size=5, out_channels=100, in_channels=300)
        self.hidd = nn.Linear(100, 100)
        self.out = nn.Linear(100, 1)
        
    def forward(self, x):
        emb = self.emb(x)
        conv_out = self.conv(emb.transpose(1, 2))
        h1 = F.relu(F.max_pool1d(conv_out, conv_out.size(2))).squeeze(2)
        return F.sigmoid(self.out(F.relu(self.hidd(h1))))

In [71]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, w2idx).cuda()
initialize_embs(mdl.emb, w2v, restrict_w2id)
# mdl.emb.weight.requires_grad = False
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, mdl.parameters()), lr=0.0001)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = Variable(torch.LongTensor(pad_batch(df_X_valid))).cuda()
val_y = Variable(torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1))).cuda()


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.cuda())
        loss = criterion(out, y.cuda())
        #if i%30 == 29:
        #    print('{:.3f}'.format(running_loss/30))
        #    running_loss = 0
        running_loss += loss.data[0]
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.data[0] < best_val_loss:
        best_val_loss = val_loss.data[0]
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/(i%30+1), best_val_loss, val_loss.data[0]))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(40259, 300, padding_idx=0)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (hidd): Linear(in_features=100, out_features=100, bias=True)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


2.587  0.692 0.692
2.544  0.691 0.691
2.506  0.689 0.689
2.461  0.687 0.687
2.403  0.684 0.684
2.330  0.680 0.680
2.236  0.675 0.675
2.109  0.668 0.668
1.945  0.658 0.658
1.740  0.646 0.646
1.503  0.630 0.630
1.253  0.614 0.614
1.012  0.598 0.598
0.797  0.583 0.583
0.617  0.569 0.569
0.475  0.558 0.558
0.367  0.548 0.548
0.286  0.541 0.541
0.226  0.534 0.534
0.181  0.529 0.529
0.148  0.524 0.524
0.122  0.519 0.519
0.102  0.515 0.515
0.087  0.512 0.512
0.075  0.509 0.509
0.065  0.507 0.507
0.056  0.506 0.506
0.050  0.506 0.506
0.044  0.505 0.505
0.039  0.505 0.505
0.035  0.504 0.504
0.031  0.503 0.503
0.028  0.502 0.502
0.025  0.501 0.501
0.023  0.500 0.500
0.020  0.500 0.500
0.019  0.499 0.499
0.017  0.499 0.499
0.015  0.498 0.498
0.014  0.498 0.498
0.013  0.498 0.498
0.012  0.498 0.498
0.011  0.497 0.497
0.010  0.497 0.497
0.010  0.497 0.497
0.009  0.497 0.497
0.008  0.497 0.498
62.06 sec


In [72]:
test_X = Variable(torch.LongTensor(pad_batch(df_X_test))).cuda()
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.cpu().data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.75

In [73]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, w2idx)
initialize_embs(mdl.emb, w2v, restrict_w2id)
mdl = mdl.cuda()
# mdl.emb.weight.requires_grad = False
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, mdl.parameters()), lr=0.0001, weight_decay=0.0005)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = Variable(torch.LongTensor(pad_batch(df_X_valid))).cuda()
val_y = Variable(torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1))).cuda()


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.cuda())
        loss = criterion(out, y.cuda())
        #if i%30 == 29:
        #    print('{:.3f}'.format(running_loss/30))
        #    running_loss = 0
        running_loss += loss.data[0]
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.data[0] < best_val_loss:
        best_val_loss = val_loss.data[0]
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/(i%30+1), best_val_loss, val_loss.data[0]))
print('{:.2f} sec'.format(time.time()-start))

CNNNet(
  (emb): Embedding(40259, 300, padding_idx=0)
  (conv): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  (hidd): Linear(in_features=100, out_features=100, bias=True)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


  "type " + obj.__name__ + ". It won't be checked "


2.587  0.692 0.692
2.550  0.691 0.691
2.521  0.689 0.689
2.487  0.688 0.688
2.448  0.685 0.685
2.400  0.682 0.682
2.342  0.679 0.679
2.270  0.673 0.673
2.178  0.667 0.667
2.061  0.658 0.658
1.915  0.646 0.646
1.742  0.631 0.631
1.550  0.615 0.615
1.347  0.598 0.598
1.146  0.581 0.581
0.959  0.565 0.565
0.794  0.551 0.551
0.654  0.539 0.539
0.539  0.529 0.529
0.445  0.521 0.521
0.370  0.515 0.515
0.310  0.509 0.509
0.263  0.505 0.505
0.226  0.501 0.501
0.195  0.499 0.499
0.171  0.496 0.496
0.151  0.494 0.494
0.135  0.492 0.492
0.122  0.491 0.491
0.111  0.489 0.489
0.101  0.488 0.488
0.093  0.486 0.486
0.086  0.485 0.485
0.080  0.484 0.484
0.075  0.483 0.483
0.070  0.483 0.483
0.066  0.482 0.482
0.062  0.482 0.482
0.059  0.481 0.481
0.056  0.481 0.481
0.054  0.481 0.481
0.051  0.481 0.481
0.048  0.481 0.481
0.046  0.481 0.481
0.044  0.481 0.482
61.07 sec


In [74]:
test_X = Variable(torch.LongTensor(pad_batch(df_X_test))).cuda()
mdl = torch.load('best.pt')
pred = mdl(test_X)
out = pred.cpu().data.numpy()
accuracy_score(y_pred=(out >= 0.5), y_true=df_y_test)

0.7725

In [75]:
del mdl
torch.cuda.empty_cache()

In [76]:
import sys

# These are the usual ipython objects, including this one you are creating
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# Get a sorted list of the objects and their sizes
sorted([(x, sys.getsizeof(globals().get(x))) for x in dir() if not x.startswith('_') and x not in sys.modules and x not in ipython_vars], key=lambda x: x[1], reverse=True)

[('dataset', 34740190),
 ('df_X_train', 8636936),
 ('df_X_test', 2634072),
 ('df_X_valid', 2096896),
 ('restrict_w2id', 1310816),
 ('w2idx', 1310816),
 ('df_y_test', 26904),
 ('df_y_train', 20504),
 ('df_y_valid', 15384),
 ('negative_reviews', 9024),
 ('positive_reviews', 9024),
 ('parsed_neg', 8544),
 ('parsed_pos', 8544),
 ('AVGNet', 1184),
 ('CNNNet', 1184),
 ('KeyedVectors', 1056),
 ('Variable', 1056),
 ('accuracy_score', 136),
 ('batched_dataset', 136),
 ('get_dataframe', 136),
 ('initialize_embs', 136),
 ('load_all', 136),
 ('lowercased', 136),
 ('pad_batch', 136),
 ('r_word2id', 136),
 ('train_test_split', 136),
 ('vocabulary', 136),
 ('word2id', 136),
 ('out', 112),
 ('F', 80),
 ('fake_sent', 80),
 ('loss', 80),
 ('nn', 80),
 ('np', 80),
 ('pred', 80),
 ('test_X', 80),
 ('val_X', 80),
 ('val_loss', 80),
 ('val_y', 80),
 ('valid_X', 80),
 ('x', 80),
 ('y', 80),
 ('sent', 69),
 ('criterion', 56),
 ('nlp', 56),
 ('optimizer', 56),
 ('w2v', 56),
 ('epoch', 28),
 ('i', 28),
 ('patie

In [None]:
import numpy as np
torch.manual_seed(42)
torch.cuda.manual_seed(42)


mdl = CNNNet(300, w2idx)
initialize_embs(mdl.emb, w2v, restrict_w2id)
mdl = mdl.cuda()
mdl.emb.weight.requires_grad = False
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(filter(lambda x: x.requires_grad, mdl.parameters()), lr=0.0001, weight_decay=0.0005)
print(mdl)

best_val_loss = float('inf')
patience = 2
wait = 0
val_X = Variable(torch.LongTensor(pad_batch(df_X_valid))).cuda()
val_y = Variable(torch.FloatTensor(np.expand_dims(df_y_valid.as_matrix(),1))).cuda()


import time 
start = time.time()
for epoch in range(50):
    running_loss = 0
    for i,(x, y) in enumerate(batched_dataset(df_X_train, df_y_train)):
        optimizer.zero_grad()
        out = mdl(x.cuda())
        loss = criterion(out, y.cuda())
        #if i%30 == 29:
        #    print('{:.3f}'.format(running_loss/30))
        #    running_loss = 0
        running_loss += loss.data[0]
        loss.backward()
        optimizer.step()
    val_loss = criterion(mdl(val_X), val_y)
    if val_loss.data[0] < best_val_loss:
        best_val_loss = val_loss.data[0]
        torch.save(mdl, 'best.pt')
        wait  = 0
    elif wait > patience:
        break
    else:
        wait+=1
        
    print('{:.3f}  {:.3f} {:.3f}'.format(running_loss/(i%30+1), best_val_loss, val_loss.data[0]))
print('{:.2f} sec'.format(time.time()-start))