In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchtext.data as data

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [0]:
class RNN_Text(nn.Module):
    def __init__(self, embed_num, class_num):
        super(RNN_Text, self).__init__()
        V = embed_num
        C = class_num
        H = 256

        self.embed = nn.Embedding(V, 100)
        self.rnn = nn.LSTM(100, H, bidirectional=True)
        self.out = nn.Linear(H * 2, C)

    def forward(self, x):
        x = self.embed(x)  # (N, W, D)
        ##x = x.unsqueeze(1)  # (N, Ci, W, D)
        x, (_, __) = self.rnn(x, (self.h, self.c))

        logit = self.out(x[-1])
        return logit

    def init_hidden(self, b):
        self.h = Variable(torch.randn(2, b, 256))
        self.c = Variable(torch.randn(2, b, 256))

In [0]:
class mydataset(data.Dataset):
    @staticmethod
    def sort_key(ex):
        return len(ex.text)
    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            for i,line in enumerate(open(path,'r',encoding='utf-8')):
                if i==0:
                    continue
                line = line.strip().split(',')
                if len(line) < 6: continue
                txt = line[5].split(' ')
                #txt= [ d.split('/')[0] for d in line[1].split(' ') ]
                examples += [ data.Example.fromlist( [txt, line[2]],fields ) ]
        super(mydataset, self).__init__(examples, fields, **kwargs)

In [5]:
text_field = data.Field(fix_length=20)
#text_field = data.Field()
label_field = data.Field(sequential=False, batch_first = True, unk_token = None)

train_data = mydataset(text_field,label_field,path='/content/gdrive/My Drive/Colab Notebooks/review_data/data_sentiment_train.csv')

# test_data = mydataset(text_field,label_field,path='/content/gdrive/My Drive/Colab Notebooks/review_data/data_sentiment_test.csv')
test_data = mydataset(text_field,label_field,path='/content/gdrive/My Drive/Colab Notebooks/review_data/balanced_test.csv')

text_field.build_vocab(train_data)
label_field.build_vocab(train_data)

train_iter, test_iter = data.Iterator.splits(
                            (train_data, test_data),
                            batch_sizes=(100, 1), repeat=False)# device=device)
print(len(text_field.vocab))

795


In [6]:
rnn = RNN_Text(len(text_field.vocab),2)#.to(device)
optimizer = torch.optim.Adam(rnn.parameters())
print(rnn.train())

RNN_Text(
  (embed): Embedding(795, 100)
  (rnn): LSTM(100, 256, bidirectional=True)
  (out): Linear(in_features=512, out_features=2, bias=True)
)


In [7]:
%%time
for epoch in range(15):
    z = 0
    totalloss = 0
    for batch in train_iter:
        optimizer.zero_grad()

        txt = batch.text
        label = batch.label
        # print (txt.size())
        rnn.init_hidden(txt.size(1))

        pred = rnn(txt)
        # print(pred.size(), label.size())
        # print(label)
        loss = F.cross_entropy(pred, label)
        totalloss += loss.data

        loss.backward()
        optimizer.step()
        # print(data,label)

    print(epoch, 'epoch')
    print(totalloss)

torch.save(rnn, '/content/gdrive/My Drive/Colab Notebooks/model/rnn_model.pt')

0 epoch
tensor(21.4718)
1 epoch
tensor(6.4643)
2 epoch
tensor(0.0706)
3 epoch
tensor(0.0265)
4 epoch
tensor(0.0158)
5 epoch
tensor(0.0107)
6 epoch
tensor(0.0073)
7 epoch
tensor(0.0058)
8 epoch
tensor(0.0045)
9 epoch
tensor(0.3254)
10 epoch
tensor(0.1176)
11 epoch
tensor(0.0099)
12 epoch
tensor(0.0064)
13 epoch
tensor(0.0047)
14 epoch
tensor(0.0037)
CPU times: user 5min 59s, sys: 5.97 s, total: 6min 5s
Wall time: 6min 7s


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [9]:
%%time
from sklearn.metrics import classification_report
with torch.no_grad():
  correct = 0
  incorrect = 0
  rnn.eval()
  y_test = []
  prediction = []

  for batch in test_iter:
    txt = batch.text
    label = batch.label
    y_test.append(label.data[0])

    rnn.init_hidden(txt.size(1))

    pred = rnn(txt)
    _, ans = torch.max(pred, dim=1)
    prediction.append(ans.data[0])

    if ans.data[0] == label.data[0]:
        correct += 1
    else:
        incorrect += 1

  print('correct : ', correct)
  print('incorrect : ', incorrect)
  print(classification_report(torch.tensor(y_test), 
                              torch.tensor(prediction), 
                              digits=4, 
                              target_names=['negative', 'positive']))

correct :  92
incorrect :  82
              precision    recall  f1-score   support

    negative     0.5148    1.0000    0.6797        87
    positive     1.0000    0.0575    0.1087        87

    accuracy                         0.5287       174
   macro avg     0.7574    0.5287    0.3942       174
weighted avg     0.7574    0.5287    0.3942       174

CPU times: user 803 ms, sys: 1.01 ms, total: 804 ms
Wall time: 818 ms
