In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-glove-pickle3/imdb_glove.pickle3
/kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/sampleSubmission.csv
/kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
/kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip


In [2]:
import logging
import os
import sys
import pickle
import time

import pandas as pd
import torch
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.autograd import Variable
from tqdm import tqdm


from sklearn.metrics import accuracy_score

In [3]:
! unzip /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
! unzip /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
! unzip /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip

Archive:  /kaggle/input/word2vec-nlp-tutorial/labeledTrainData.tsv.zip
  inflating: labeledTrainData.tsv    
Archive:  /kaggle/input/word2vec-nlp-tutorial/testData.tsv.zip
  inflating: testData.tsv            
Archive:  /kaggle/input/word2vec-nlp-tutorial/unlabeledTrainData.tsv.zip
  inflating: unlabeledTrainData.tsv  


In [4]:
# Read the extracted test TSV: first row is the header, fields are tab-separated, and
# quoting=3 (the value of csv.QUOTE_NONE) leaves quote characters in the text untouched.
test = pd.read_csv(
    "/kaggle/working/testData.tsv",
    sep="\t",
    header=0,
    quoting=3,
)

In [5]:
# --- Training hyper-parameters -------------------------------------------------
num_epochs = 10
batch_size = 64
lr = 0.1  # learning rate passed to the SGD optimizer

# --- Model dimensions ----------------------------------------------------------
embed_size = 300  # input channels of the Conv1d layer, i.e. the embedding width
num_filter = 128  # number of convolution filters (Conv1d output channels)
filter_size = 3   # convolution kernel width
labels = 2        # number of output classes
bidirectional = True  # not read by any cell shown in this notebook; kept for compatibility

# --- Device selection ----------------------------------------------------------
# Pick the GPU only when one is actually available so the notebook does not fail
# on a CPU-only session; on a GPU session this resolves to the same 'cuda:0' as before.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda:0' if use_gpu else 'cpu')

In [6]:
class SentimentNet(nn.Module):
    """One-layer 1-D convolutional text classifier on top of frozen pre-trained embeddings.

    Pipeline: token indices -> embedding lookup -> Conv1d + ReLU -> max over the
    whole sequence axis -> linear layer producing one score per label.

    Args:
        embed_size: Number of input channels of the convolution (the embedding width).
        num_filter: Number of convolution filters.
        filter_size: Convolution kernel width.
        weight: Pre-trained embedding matrix handed to ``nn.Embedding.from_pretrained``.
        labels: Number of output classes.
        use_gpu: Stored on the instance as ``self.use_gpu``; not used inside this class.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, embed_size, num_filter, filter_size, weight, labels, use_gpu, **kwargs):
        super().__init__(**kwargs)
        self.use_gpu = use_gpu

        # Embedding table initialised from `weight`; its weights are excluded from training.
        self.embedding = nn.Embedding.from_pretrained(weight)
        self.embedding.weight.requires_grad = False

        self.conv1d = nn.Conv1d(
            in_channels=embed_size,
            out_channels=num_filter,
            kernel_size=filter_size,
            padding=1,
        )
        self.activate = F.relu
        self.decoder = nn.Linear(in_features=num_filter, out_features=labels)

    def forward(self, inputs):
        """Return unnormalised class scores of shape (batch, labels).

        Args:
            inputs: 2-D tensor of token indices, laid out as (batch, sequence length).
        """
        token_vectors = self.embedding(inputs)

        # Conv1d expects channels before the sequence axis: (batch, embed, seq).
        channels_first = token_vectors.permute(0, 2, 1)
        feature_maps = self.activate(self.conv1d(channels_first))

        # A pooling window as wide as the sequence keeps one maximum per filter.
        seq_len = feature_maps.shape[2]
        pooled = F.max_pool1d(feature_maps, kernel_size=seq_len).squeeze(dim=2)

        return self.decoder(pooled)



In [7]:
    # Name the logger after the basename of the running program.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    # basicConfig only installs a handler/format when the root logger has none yet, so the
    # level is set on the root logger separately to make sure INFO records are not filtered.
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # Join argv with spaces (not the empty string) so individual arguments stay readable,
    # and let the logger do the %-interpolation lazily.
    logger.info("running %s", ' '.join(sys.argv))

In [8]:
    logging.info('loading data...')
    pickle_file = '/kaggle/input/imdb-glove-pickle3/imdb_glove.pickle3'

    # NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
    # The context manager closes the file handle as soon as the objects are loaded.
    with open(pickle_file, 'rb') as pickle_fh:
        (train_features, train_labels, val_features, val_labels, test_features,
         weight, word_to_idx, idx_to_word, vocab) = pickle.load(pickle_fh)
    logging.info('data loaded!')

  return torch.load(io.BytesIO(b))


In [9]:
    # Instantiate the classifier from the configured sizes and the loaded embedding matrix,
    # move it onto the chosen device, then create the SGD optimizer and cross-entropy loss.
    net = SentimentNet(
        weight=weight,
        embed_size=embed_size,
        num_filter=num_filter,
        filter_size=filter_size,
        labels=labels,
        use_gpu=use_gpu,
    ).to(device)
    optimizer = optim.SGD(params=net.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss()

In [10]:
    # Wrap each split's tensors in a TensorDataset; the test split carries features only.
    train_set, val_set, test_set = (
        torch.utils.data.TensorDataset(*split_tensors)
        for split_tensors in (
            (train_features, train_labels),
            (val_features, val_labels),
            (test_features,),
        )
    )

    # Only the training loader shuffles; validation and test keep the stored order.
    train_iter = torch.utils.data.DataLoader(train_set, shuffle=True, batch_size=batch_size)
    val_iter, test_iter = (
        torch.utils.data.DataLoader(split_set, shuffle=False, batch_size=batch_size)
        for split_set in (val_set, test_set)
    )

In [11]:
    # Train for `num_epochs` epochs; after each epoch evaluate on the validation loader and
    # show the epoch's mean loss/accuracy (averaged over batches) in the progress bar.
    for epoch in range(num_epochs):
        start = time.time()
        # Running sums are plain Python floats: adding the loss *tensor* itself would keep
        # autograd history of every batch reachable for the whole epoch.
        train_loss, val_losses = 0.0, 0.0
        train_acc, val_acc = 0.0, 0.0
        n, m = 0, 0  # batches seen so far: n = training, m = validation
        with tqdm(total=len(train_iter), desc='Epoch %d' % epoch) as pbar:
            net.train()
            for feature, label in train_iter:
                n += 1
                # Move the batch to the same device the model was placed on.
                feature = feature.to(device)
                label = label.to(device)

                optimizer.zero_grad()
                score = net(feature)
                loss = loss_function(score, label)
                loss.backward()
                optimizer.step()

                train_acc += accuracy_score(score.detach().argmax(dim=1).cpu(), label.cpu())
                train_loss += loss.item()

                pbar.set_postfix({'epoch': '%d' % (epoch),
                                  'train loss': '%.4f' % (train_loss / n),
                                  'train acc': '%.2f' % (train_acc / n)
                                  })
                pbar.update(1)

            net.eval()
            with torch.no_grad():
                for val_feature, val_label in val_iter:
                    m += 1
                    val_feature = val_feature.to(device)
                    val_label = val_label.to(device)
                    val_score = net(val_feature)
                    val_loss = loss_function(val_score, val_label)
                    val_acc += accuracy_score(val_score.argmax(dim=1).cpu(), val_label.cpu())
                    val_losses += val_loss.item()
            end = time.time()
            runtime = end - start

            # Report nan instead of crashing if a loader produced no batches.
            nan = float('nan')
            pbar.set_postfix({'epoch': '%d' % (epoch),
                              'train loss': '%.4f' % (train_loss / n if n else nan),
                              'train acc': '%.2f' % (train_acc / n if n else nan),
                              'val loss': '%.4f' % (val_losses / m if m else nan),
                              'val acc': '%.2f' % (val_acc / m if m else nan),
                              'time': '%.2f' % (runtime)})



Epoch 0: 100%|██████████| 313/313 [00:03<00:00, 88.90it/s, epoch=0, train loss=0.5208, train acc=0.73, val loss=0.3818, val acc=0.83, time=3.52] 
Epoch 1: 100%|██████████| 313/313 [00:02<00:00, 118.11it/s, epoch=1, train loss=0.3565, train acc=0.85, val loss=0.3158, val acc=0.87, time=2.65]
Epoch 2: 100%|██████████| 313/313 [00:02<00:00, 121.72it/s, epoch=2, train loss=0.3193, train acc=0.86, val loss=0.3729, val acc=0.84, time=2.57]
Epoch 3: 100%|██████████| 313/313 [00:02<00:00, 120.15it/s, epoch=3, train loss=0.2871, train acc=0.88, val loss=0.4225, val acc=0.82, time=2.61]
Epoch 4: 100%|██████████| 313/313 [00:02<00:00, 120.76it/s, epoch=4, train loss=0.2552, train acc=0.89, val loss=0.3454, val acc=0.86, time=2.60]
Epoch 5: 100%|██████████| 313/313 [00:02<00:00, 119.72it/s, epoch=5, train loss=0.2297, train acc=0.91, val loss=0.3147, val acc=0.87, time=2.62]
Epoch 6: 100%|██████████| 313/313 [00:02<00:00, 121.22it/s, epoch=6, train loss=0.2447, train acc=0.91, val loss=0.6414, val

In [12]:
    # Predict a class index for every test example, batch by batch, without tracking gradients.
    test_pred = []
    net.eval()  # switch the model to evaluation mode before inference
    with torch.no_grad():
        # tqdm reads the number of batches from the loader and advances once per batch.
        for test_feature, in tqdm(test_iter, desc='Prediction'):
            # Use the configured device so the batch lands where the model lives.
            test_score = net(test_feature.to(device))
            test_pred.extend(test_score.argmax(dim=1).cpu().tolist())

Prediction: 100%|██████████| 391/391 [00:01<00:00, 335.83it/s]


In [13]:
    # Pair every test id with its predicted label and write the two-column submission CSV
    # (no index column; quoting=3 disables quoting of the written fields).
    result_output = test[["id"]].assign(sentiment=test_pred)
    result_output.to_csv("cnn.csv", quoting=3, index=False)
    logging.info('result saved!')