In [1]:
import os
import random
import re
import string

import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext.data as data
import torchtext.datasets as datasets
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from torchtext.data import TabularDataset
from torchtext.vocab import Vectors


## PyTorch

In [2]:
# Load the pre-trained word vectors from a local text file via torchtext's `Vectors`.
# These vectors are attached to the TEXT vocabulary when it is built further down.
vectors = Vectors(name="./enwiki_20180420_win10_300d.txt")

In [3]:
def tokenizer(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [4]:
# Field for the text column: a token sequence produced by `tokenizer`,
# batched with the batch dimension first.
TEXT = data.Field(
    sequential=True,
    tokenize=tokenizer,
    batch_first=True,
)
# Field for the label column: a single (non-sequential) value marked as the target.
LABEL = data.Field(
    sequential=False,
    batch_first=True,
    is_target=True,
)

In [5]:
# Read the train and test CSV files (header row skipped), binding the
# `text` and `label` fields declared above.
train_data_torchtext, test_data_torchtext = TabularDataset.splits(
    path=".",
    train="./train_test_data/train_data_fin.csv",
    test="./train_test_data/test_data_fin.csv",
    format="csv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=True,
)

In [6]:
# Build the text vocabulary from the training data (this runs before the
# validation split made later): min_freq=5 sets the minimum token frequency
# and the pre-trained vectors are attached to the vocabulary.
TEXT.build_vocab(train_data_torchtext, min_freq=5, vectors=vectors)
# Build the label vocabulary from the same data.
LABEL.build_vocab(train_data_torchtext)

In [7]:
# Report the vocabulary sizes: text tokens first, then labels.
for _field in (TEXT, LABEL):
    print(len(_field.vocab.stoi))

2802
3


In [8]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("cpu와 cuda 중 다음 기기로 학습함:", DEVICE)

cpu와 cuda 중 다음 기기로 학습함: cuda


In [9]:
# Hold out 20% of the training data for validation.
# The shuffle is driven by an explicitly seeded RNG state so the split is the
# same on every Restart & Run All. A private `random.Random` supplies the
# state, so the global `random` generator is not reseeded.
# NOTE: this cell rebinds `train_data_torchtext`; re-running it without first
# re-running the loading cell splits the already-reduced training set again.
SPLIT_SEED = 42
train_data_torchtext, val_data_torchtext = train_data_torchtext.split(
    split_ratio=0.8,
    random_state=random.Random(SPLIT_SEED).getstate(),
)

In [10]:
# Mini-batch iterators for training and validation: 64 examples per batch,
# shuffled, with sorting disabled.
train_iterator, val_iterator = data.BucketIterator.splits(
    (train_data_torchtext, val_data_torchtext),
    batch_size=64,
    shuffle=True,
    sort=False,
)
# The whole test set is served as a single, un-shuffled batch. Downstream
# cells pair predictions with test rows by position.
# NOTE(review): confirm the iterator keeps dataset order with these settings.
test_iterator = data.BucketIterator(
    test_data_torchtext,
    batch_size=len(test_data_torchtext),
    shuffle=False,
)

In [11]:
# Print how many mini-batches each iterator yields (train, validation, test).
for _caption, _iterator in (
    ("훈련 데이터의 미니 배치의 개수 :", train_iterator),
    ("검증 데이터의 미니 배치의 개수 :", val_iterator),
    ("테스트 데이터의 미니 배치의 개수 :", test_iterator),
):
    print(_caption, len(_iterator))

훈련 데이터의 미니 배치의 개수 : 96
검증 데이터의 미니 배치의 개수 : 24
테스트 데이터의 미니 배치의 개수 : 1


In [12]:
# Number of entries in the text vocabulary; passed to the model as the
# embedding-table size.
vocab_size = len(TEXT.vocab)
# Number of output classes (size of the classifier's final linear layer).
n_classes = 2

In [13]:
class LSTM(nn.Module):
    """Stacked bidirectional LSTM text classifier.

    Token ids are embedded, fed through a multi-layer bidirectional LSTM, and
    the LSTM output at the last time step is passed through dropout and a
    linear layer to produce class logits.

    Args:
        n_layers: Number of stacked LSTM layers.
        hidden_dim: Hidden size of each LSTM direction.
        n_vocab: Number of rows in the embedding table.
        embed_dim: Embedding dimensionality (the LSTM input size).
        n_classes: Number of output logits.
        dropout_p: Dropout probability applied before the output layer.
    """

    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p = 0.2):
        super().__init__()
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, bias=True, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence 2 * hidden_dim input features.
        self.out = nn.Linear(hidden_dim*2, n_classes, bias=True)

    def forward(self, x):
        """Return logits of shape (batch, n_classes) for token ids ``x`` of shape (batch, seq_len)."""
        embedded = self.embed(x)
        outputs, _ = self.lstm(embedded)
        # NOTE(review): the last time step is used as the sequence summary. For
        # padded batches this position may be a padding token, and the backward
        # direction has only processed one step there - consider using the
        # final hidden states instead. Behaviour deliberately left unchanged.
        last_step = outputs[:, -1, :]
        # nn.Dropout is not in-place: its return value must be used, otherwise
        # no dropout is applied at all (the original discarded the result).
        last_step = self.dropout(last_step)
        return self.out(last_step)

In [14]:
# 3-layer bidirectional LSTM with 300-dim embeddings and 300 hidden units per
# direction, moved to the selected device.
model = LSTM(
    n_layers=3,
    hidden_dim=300,
    n_vocab=vocab_size,
    embed_dim=300,
    n_classes=n_classes,
).to(DEVICE)
# Adam with its default hyper-parameters over all model parameters.
optimizer = torch.optim.Adam(model.parameters())

In [15]:
# Smoke test: push a random batch of 5 sequences x 4 token ids (values 0-9)
# through the model and check the shape of the logits.
sample = torch.LongTensor(5, 4)
sample.random_(0, 10)
sample = sample.to(DEVICE)
sample_out = model(sample)
print(sample_out.shape)

torch.Size([5, 2])


In [16]:
def train(model, optimizer, train_iter):
    """Run one training epoch over ``train_iter``.

    Args:
        model: Module mapping a batch of token ids to class logits.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_iter: Iterable of batches exposing ``.text`` (token ids) and
            ``.label`` (class indices offset by one, as produced by the label
            vocabulary, whose printed size is one larger than ``n_classes``).

    Returns:
        Mean cross-entropy loss over the processed batches as a float
        (0.0 when ``train_iter`` yields no batches).
    """
    model.train()
    # Send inputs to wherever the model lives so the function cannot hit a
    # device mismatch; fall back to the notebook-level DEVICE for a model
    # without parameters.
    first_param = next(model.parameters(), None)
    device = DEVICE if first_param is None else first_param.device

    running_loss = 0.0
    n_batches = 0
    for batch in train_iter:
        x = batch.text.to(device)
        # Shift labels to 0-based out of place: `.to()` can return the very
        # same tensor on CPU, so an in-place subtraction would silently
        # modify `batch.label`.
        y = batch.label.to(device) - 1
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1
    return running_loss / n_batches if n_batches else 0.0

In [17]:
def evaluate(model, val_iterator):
    """Compute the average loss and accuracy of ``model`` over ``val_iterator``.

    Args:
        model: Module mapping a batch of token ids to class logits.
        val_iterator: Iterable of batches exposing ``.text`` and ``.label``
            (class indices offset by one); must also expose ``.dataset`` whose
            length is the number of evaluated examples.

    Returns:
        ``(avg_loss, avg_accuracy)``: summed cross-entropy divided by the
        dataset size, and the percentage of correct predictions (a 0-dim
        tensor once at least one batch has been processed).
    """
    model.eval()
    # Send inputs to wherever the model lives; fall back to the notebook-level
    # DEVICE for a model without parameters.
    first_param = next(model.parameters(), None)
    device = DEVICE if first_param is None else first_param.device

    corrects, total_loss = 0, 0
    # Inference only: without no_grad() every forward pass would build an
    # autograd graph and waste memory during validation.
    with torch.no_grad():
        for batch in val_iterator:
            x = batch.text.to(device)
            # Out-of-place shift to 0-based labels; leaves `batch.label` intact.
            y = batch.label.to(device) - 1
            logit = model(x)
            total_loss += F.cross_entropy(logit, y, reduction="sum").item()
            predicted = logit.max(1)[1].view(y.size())
            corrects += (predicted == y).sum()

    size = len(val_iterator.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [18]:
# Vectors attached to the TEXT vocabulary when it was built with `vectors=...`;
# the printed shape shows one row per vocabulary entry.
pretrained_embeddings = TEXT.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([2802, 300])


In [19]:
# Initialise the embedding layer by copying the pre-trained vectors into its
# weight tensor in place. As the last expression of the cell, the returned
# tensor is displayed below.
# NOTE(review): nothing here freezes the layer, so the optimizer presumably
# keeps fine-tuning these weights - confirm that is intended.
model.embed.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0744, -0.1637,  0.0042,  ...,  0.1437, -0.0184, -0.0637],
        ...,
        [-0.2492,  0.1043,  0.0766,  ..., -0.2167, -0.1619, -0.2181],
        [-0.3468, -0.4618,  0.0253,  ...,  0.0488, -0.1717,  0.0123],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

In [21]:
# Train for `n_epochs` epochs and checkpoint the weights whenever the
# validation loss improves on the best value seen so far.
CHECKPOINT_PATH = "./BiLSTM_wiki300/textclassificator.pt"
best_val_loss = None
n_epochs = 100

# Create the checkpoint directory once, up front (no-op if it already exists).
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)

# range(n_epochs), not n_epochs + 1: epochs are numbered 0 .. n_epochs - 1, so
# exactly `n_epochs` epochs are run.
for epoch in range(n_epochs):
    train(model, optimizer, train_iterator)
    val_loss, val_accuracy = evaluate(model, val_iterator)

    print("[Epoch: %d] val loss : %5.2f | val accuracy : %5.2f" % (epoch, val_loss, val_accuracy))

    # Compare against None explicitly: a best loss of exactly 0.0 is falsy and
    # would otherwise make every later epoch overwrite the checkpoint.
    if best_val_loss is None or val_loss < best_val_loss:
        torch.save(model.state_dict(), CHECKPOINT_PATH)
        best_val_loss = val_loss

[Epoch: 0] val loss :  0.40 | val accuracy : 83.78
[Epoch: 1] val loss :  0.39 | val accuracy : 85.36
[Epoch: 2] val loss :  0.41 | val accuracy : 84.44
[Epoch: 3] val loss :  0.45 | val accuracy : 83.59
[Epoch: 4] val loss :  0.48 | val accuracy : 81.22
[Epoch: 5] val loss :  0.52 | val accuracy : 82.40
[Epoch: 6] val loss :  0.65 | val accuracy : 81.16
[Epoch: 7] val loss :  0.58 | val accuracy : 81.48
[Epoch: 8] val loss :  0.67 | val accuracy : 80.63
[Epoch: 9] val loss :  0.63 | val accuracy : 80.70
[Epoch: 10] val loss :  0.71 | val accuracy : 78.07
[Epoch: 11] val loss :  0.66 | val accuracy : 79.05
[Epoch: 12] val loss :  0.57 | val accuracy : 77.48
[Epoch: 13] val loss :  0.82 | val accuracy : 77.81
[Epoch: 14] val loss :  0.83 | val accuracy : 78.46
[Epoch: 15] val loss :  0.93 | val accuracy : 79.05
[Epoch: 16] val loss :  1.04 | val accuracy : 76.95
[Epoch: 17] val loss :  0.92 | val accuracy : 78.07
[Epoch: 18] val loss :  0.96 | val accuracy : 78.53
[Epoch: 19] val loss :

In [22]:
# Restore the weights of the best checkpoint saved by the training loop.
# map_location remaps the stored tensors onto the current DEVICE, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load("./BiLSTM_wiki300/textclassificator.pt", map_location=DEVICE))

<All keys matched successfully>

In [23]:
def test_result(model, test_iterator):
    model.eval()
    total_result = []
    for batch in test_iterator:
        x, y = batch.text.to(DEVICE), batch.label.to(DEVICE)
        y.data.sub_(1)
        logit = model(x)
        result = logit.max(1)[1]
        total_result += result
        
        
    return total_result

In [24]:
# Predict on the test set, then show how many predictions were produced and
# the first ten of them.
fin_result = test_result(model, test_iterator)
print(len(fin_result), fin_result[:10], sep="\n")

3263
[tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0')]


In [25]:
# Show the first ten predictions again (the same slice the previous cell prints).
print(fin_result[:10])

[tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0')]


In [26]:
# One row per prediction in a single `target` column; each 0-dim tensor is
# unwrapped to a plain Python number with `.item()`.
final_result_csv = pd.DataFrame(fin_result, columns=["target"]).assign(
    target=lambda frame: frame["target"].apply(lambda pred: pred.item())
)
print(final_result_csv[:10])

   target
0       1
1       1
2       1
3       1
4       1
5       1
6       0
7       0
8       0
9       0


In [29]:
# Load the raw test file and discard the keyword/location/text columns,
# keeping whatever columns remain.
test = pd.read_csv("./train_test_data/test.csv").drop(
    columns=["keyword", "location", "text"]
)
test.head()

Unnamed: 0,id
0,0
1,2
2,3
3,9
4,11


In [30]:
# Put the predicted `target` column next to the remaining test columns.
# The two frames are aligned on their row index.
final = pd.concat((test, final_result_csv), axis="columns")
final.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [31]:
# Write the predictions to disk without the index column. The output
# directory is created first, because `to_csv` does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs("./result", exist_ok=True)
final.to_csv("./result/result_BiLSTM_wiki300", index=False)