In [4]:
# Mount Google Drive at /content/gdrive; the dataset CSVs and the saved model
# used by later cells are addressed through paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchtext.data as data
import torchtext.datasets as datasets
import pickle

In [6]:
# Prefer the GPU whenever CUDA is usable; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [0]:
class CNN_Text(nn.Module):
    """Convolutional text classifier: embedding -> parallel convolutions ->
    max-over-time pooling -> dropout -> linear layer.

    Args:
        embed_num: Number of rows in the embedding table (vocabulary size).
        class_num: Number of output classes (width of the returned logits).
        embed_dim: Size of each token embedding. Defaults to 100.
        kernel_num: Output channels produced by each convolution. Defaults to 50.
        kernel_sizes: Window heights, in tokens, of the parallel convolutions.
            Defaults to (2, 3, 4).
        dropout: Dropout probability applied to the pooled features.
            Defaults to 0.2.

    Raises:
        ValueError: If ``kernel_sizes`` is empty.
    """

    def __init__(self, embed_num, class_num, embed_dim=100, kernel_num=50,
                 kernel_sizes=(2, 3, 4), dropout=0.2):
        super(CNN_Text, self).__init__()
        kernel_sizes = list(kernel_sizes)
        if not kernel_sizes:
            raise ValueError('kernel_sizes must contain at least one window size')

        self.embed = nn.Embedding(embed_num, embed_dim)
        # Each kernel spans the full embedding width, so a convolution slides
        # only along the token axis.
        self.convs1 = nn.ModuleList(
            [nn.Conv2d(1, kernel_num, (k, embed_dim)) for k in kernel_sizes]
        )
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(len(kernel_sizes) * kernel_num, class_num)

    def forward(self, x):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: Integer tensor of shape (N, W) holding token indices. The
                convolutions are unpadded, so W must be at least the largest
                kernel size.

        Returns:
            Tensor of shape (N, class_num) with unnormalised class scores.
        """
        x = self.embed(x)  # (N, W, D)
        x = x.unsqueeze(1)  # (N, 1, W, D): add the single input channel
        # The kernel consumes the whole embedding axis, leaving a size-1 last dim.
        feature_maps = [F.relu(conv(x)).squeeze(3) for conv in self.convs1]  # [(N, Co, W'), ...]
        # Max-over-time pooling: keep the strongest activation of every channel.
        pooled = [F.max_pool1d(fm, fm.size(2)).squeeze(2) for fm in feature_maps]  # [(N, Co), ...]
        features = self.dropout(torch.cat(pooled, 1))  # (N, len(kernel_sizes) * Co)
        logit = self.fc1(features)  # (N, C)
        return logit

In [0]:
class mydataset(data.Dataset):
    """Dataset of (text, label) examples read from a comma-separated file.

    The first line of the file is treated as a header and skipped. Every
    remaining line is split on commas; the label is taken from column
    ``LABEL_COLUMN`` and the text from column ``TEXT_COLUMN``, which is then
    split on single spaces into tokens. Lines with too few columns are skipped.

    Args:
        text_field: Field used to process the token list of each example.
        label_field: Field used to process the label of each example.
        path: Path of the file to read. Only used when ``examples`` is None.
        examples: Pre-built examples; when given, no file is read.
        **kwargs: Forwarded unchanged to ``data.Dataset.__init__``.
    """

    # Zero-based positions of the columns read from every row.
    LABEL_COLUMN = 2
    TEXT_COLUMN = 5

    @staticmethod
    def sort_key(ex):
        """Sort examples by the number of tokens in their text."""
        return len(ex.text)

    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        if examples is None:
            # NOTE(review): `dirname` is not defined in this class; presumably a
            # subclass or the base class is expected to supply it -- confirm.
            path = self.dirname if path is None else path
            min_columns = max(self.TEXT_COLUMN, self.LABEL_COLUMN) + 1
            examples = []
            # Context manager guarantees the file is closed even if parsing fails.
            with open(path, 'r', encoding='utf-8') as csv_file:
                for i, line in enumerate(csv_file):
                    if i == 0:
                        continue  # header row
                    # NOTE(review): every comma is a separator here, so a quoted
                    # field containing a comma would shift the columns -- confirm
                    # the data never contains one.
                    columns = line.strip().split(',')
                    if len(columns) < min_columns:
                        continue
                    tokens = columns[self.TEXT_COLUMN].split(' ')
                    examples.append(
                        data.Example.fromlist([tokens, columns[self.LABEL_COLUMN]], fields)
                    )
        super(mydataset, self).__init__(examples, fields, **kwargs)

In [9]:
# Locations of the training and test CSV files on the mounted Drive.
TRAIN_CSV = '/content/gdrive/My Drive/Colab Notebooks/review_data/data_sentiment_train.csv'
TEST_CSV = '/content/gdrive/My Drive/Colab Notebooks/review_data/balanced_test.csv'

# fix_length: maximum number of tokens kept per sentence.
text_field = data.Field(batch_first=True, fix_length=20)
# Labels are single values (not token sequences) and get no unknown-token entry.
label_field = data.Field(sequential=False, batch_first=True, unk_token=None)

# Alternative local training file:
# train_data = mydataset(text_field, label_field, path='small_ratings_train_tok.txt')
train_data = mydataset(text_field, label_field, path=TRAIN_CSV)
test_data = mydataset(text_field, label_field, path=TEST_CSV)

# Both vocabularies are built from the training split only.
text_field.build_vocab(train_data)
label_field.build_vocab(train_data)

# Batches of 100 for training, single examples for testing.
train_iter, test_iter = data.Iterator.splits(
    (train_data, test_data),
    batch_sizes=(100, 1),
    repeat=False,
    device=device,
)
print(len(text_field.vocab))

795


In [10]:
# Size the output layer from the label vocabulary instead of a hard-coded 2,
# so the model always matches the label indices the iterator produces.
cnn = CNN_Text(len(text_field.vocab), len(label_field.vocab)).to(device)
optimizer = torch.optim.Adam(cnn.parameters())

# Switch to training mode (enables dropout), then show the architecture.
cnn.train()
print(cnn)

CNN_Text(
  (embed): Embedding(795, 100)
  (convs1): ModuleList(
    (0): Conv2d(1, 50, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 50, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 50, kernel_size=(4, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (fc1): Linear(in_features=150, out_features=2, bias=True)
)


In [11]:
%%time
for epoch in range(10):
    z = 0
    totalloss = 0
    for batch in train_iter:
        optimizer.zero_grad()

        txt = batch.text
        label = batch.label
        # print (txt.size())
        pred = cnn(txt)
        # print(pred.size(), label.size())
        # print(label)
        loss = F.cross_entropy(pred, label).to(device)
        totalloss += loss.data

        loss.backward()
        optimizer.step()
        # print(data,label)
    print(epoch, 'epoch')
    print(totalloss)

torch.save(cnn, '/content/gdrive/My Drive/Colab Notebooks/model/cnn_model.pt')

0 epoch
tensor(4.2365, device='cuda:0')
1 epoch
tensor(0.1582, device='cuda:0')
2 epoch
tensor(0.0597, device='cuda:0')
3 epoch
tensor(0.0330, device='cuda:0')
4 epoch
tensor(0.0207, device='cuda:0')
5 epoch
tensor(0.0149, device='cuda:0')
6 epoch
tensor(0.0106, device='cuda:0')
7 epoch
tensor(0.0080, device='cuda:0')
8 epoch
tensor(0.0066, device='cuda:0')
9 epoch
tensor(0.0052, device='cuda:0')
CPU times: user 16 s, sys: 7.78 s, total: 23.8 s
Wall time: 24.7 s


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [12]:
%%time
from sklearn.metrics import classification_report
with torch.no_grad():
  correct = 0
  incorrect = 0
  cnn.eval()
  y_test = []
  prediction = []

  for batch in test_iter:
    txt = batch.text
    label = batch.label
    y_test.append(label.data[0])

    pred = cnn(txt)
    _, ans = torch.max(pred, dim=1)
    prediction.append(ans.data[0])

    if ans.data[0] == label.data[0]:
      correct += 1
    else:
      incorrect += 1

  print('correct : ', correct)
  print('incorrect : ', incorrect)
  print(classification_report(torch.tensor(y_test), 
                              torch.tensor(prediction), 
                              digits=4, 
                              target_names=['negative', 'positive']))

correct :  92
incorrect :  82
              precision    recall  f1-score   support

    negative     0.5148    1.0000    0.6797        87
    positive     1.0000    0.0575    0.1087        87

    accuracy                         0.5287       174
   macro avg     0.7574    0.5287    0.3942       174
weighted avg     0.7574    0.5287    0.3942       174

CPU times: user 463 ms, sys: 27.8 ms, total: 491 ms
Wall time: 581 ms
