In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pandas as pd

from konlpy.tag import Mecab

import numpy as np



In [2]:
df_train = pd.read_csv("./train_for_korean.csv", encoding="utf-8-sig")
df_test = pd.read_csv("./test_for_korean.csv", encoding="utf-8-sig")

In [3]:
df_train = df_train.dropna(axis=0)
space_idx = []
for i in range(len(df_train)):
    if str.isspace(df_train.iloc[i, 1]) == True:
        space_idx.append(i)
df_train = df_train.drop(df_train.index[[space_idx]])

  result = getitem(key)


In [4]:
df_test = df_test.dropna(axis=0)
space_idx = []
for i in range(len(df_test)):
    if str.isspace(df_test.iloc[i, 1]) == True:
        space_idx.append(i)
df_test = df_test.drop(df_test.index[[space_idx]])

  result = getitem(key)


In [5]:
trainset = np.array(df_train.drop(["id"], axis = 1))
testset = np.array(df_test.drop(["id"], axis = 1))

In [6]:
trainset, valset= train_test_split(trainset, test_size=0.1)

In [7]:
X_train = trainset[:, 0]
y_train = trainset[:, 1]
X_val = valset[:, 0]
y_val = valset[:, 1]
X_test = testset[:, 0]
y_test = testset[:, 1]

In [8]:
y_train = y_train.astype(np.long)
y_val = y_val.astype(np.long)
y_test = y_test.astype(np.long)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until


In [9]:
m = Mecab("C:\mecab\mecab-ko-dic")

def tokenizer(text):
    return m.morphs(text)

In [10]:
word2idx = {}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

count = 2

for i in range(len(X_train)):
    X_train[i] = tokenizer(X_train[i])
    for token in X_train[i]:
        if token not in word2idx.keys():
            word2idx[token] = count
            count += 1
    
for i in range(len(X_val)):
    X_val[i] = tokenizer(X_val[i])

for i in range(len(X_test)):
    X_test[i] = tokenizer(X_test[i])

In [11]:
idx2word = {y:x for x,y in word2idx.items()}

In [12]:
def sent2idx(data, word2idx):
    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] in word2idx.keys():
                data[i][j] = word2idx[data[i][j]]
            else:
                data[i][j] = word2idx["UNK"]
    return data

In [13]:
X_train = sent2idx(X_train, word2idx)
X_val = sent2idx(X_val, word2idx)
X_test = sent2idx(X_test, word2idx)

In [14]:
input_file = "glove.txt"
output_file = "tmp.txt"

glove2word2vec(input_file, output_file)

glove = KeyedVectors.load_word2vec_format(output_file, binary=False)

  after removing the cwd from sys.path.


In [15]:
vocab_size = len(word2idx.keys())
embedding_size = 100
weight = np.zeros((vocab_size, embedding_size))
for i in range(2, vocab_size):
    if idx2word[i] in glove.key_to_index.keys():
        weight[i] = glove[idx2word[i]]

In [34]:
weight = torch.tensor(weight)
print(weight.shape)

torch.Size([45747, 100])


In [16]:
def make_tensor(data, word2idx):
    max_length = 0
    length_list = []
    
    for i in data:
        length_list.append(len(i))
        if len(i) > max_length:
            max_length = len(i)
            
    for i in data:
        for _ in range(max_length-len(i)):
            i.append(word2idx["PAD"])
    
    data = torch.tensor(data.tolist())
    
    return torch.tensor(data), length_list

In [17]:
X_train_idx, X_train_length = make_tensor(X_train, word2idx)
X_val_idx, X_val_length = make_tensor(X_val, word2idx)
X_test_idx, X_test_length = make_tensor(X_test, word2idx)

  app.launch_new_instance()


In [18]:
y_train = torch.tensor(y_train)
y_val = torch.tensor(y_val)
y_test = torch.tensor(y_test)

In [19]:
y_train = y_train.unsqueeze(1)
y_val = y_val.unsqueeze(1)
y_test = y_test.unsqueeze(1)

In [20]:
class CustomDataset(Dataset):
    def __init__(self, x_tensor, x_length, y_tensor):
        self.x = x_tensor
        self.l = x_length
        self.y = y_tensor

    def __getitem__(self, index):
        return (self.x[index], self.l[index], self.y[index])

    def __len__(self):
        return len(self.x)

In [21]:
trainset = CustomDataset(X_train_idx, X_train_length, y_train)
valset = CustomDataset(X_val_idx, X_val_length, y_val)
testset = CustomDataset(X_test_idx, X_test_length, y_test)

In [22]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=True)

In [23]:
n_classes = 2

In [24]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu 와 cuda 중 다음 기기로 학슴함: ", DEVICE)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [25]:
class LSTM(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p = 0.2):
        super(LSTM, self).__init__()
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, batch_first= True,  bidirectional=True)
        self.out = nn.Linear(hidden_dim*2, n_classes, bias=True)

    def forward(self, x, length):
        embeded = self.embed(x)
        packed_input = pack_padded_sequence(embeded, length.tolist(), batch_first=True, enforce_sorted=False)
        packed_output,(hidden, cell) = self.lstm(packed_input)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        logit = self.out(hidden)
        return logit

In [36]:
model = LSTM(3, 256, vocab_size, 100, n_classes).to(DEVICE)
lr = 0.0001
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [37]:
def train(model, optimizer, train_iter):
    model.train()
    corrects, total_loss = 0, 0
    size = 0
    for b, batch in enumerate(train_iter):
        x , l, y = batch
        x = x.to(DEVICE)
        y = y.long().to(DEVICE)
        y = y.reshape(-1)
        optimizer.zero_grad()
        logit = model(x, l)
        loss = F.cross_entropy(logit, y, reduction="sum")
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy 

In [38]:
def evaluate(model, val_iter):
    model.eval()
    corrects, total_loss = 0, 0
    size = 0
    with torch.no_grad():
        for batch in val_iter:
            x , l, y = batch
            x = x.to(DEVICE)
            y = y.long().to(DEVICE)
            y = y.reshape(-1)
            logit = model(x, l)
            loss = F.cross_entropy(logit, y, reduction="sum")
            total_loss += loss.item()
            corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()    
            size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [39]:
model.embed.weight.data.copy_(weight)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0527, -0.1771,  0.5961,  ..., -0.1474,  1.0186, -0.8482],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0792,  0.1054, -0.2989,  ...,  0.0477,  0.2381,  0.1649],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       device='cuda:0')

In [40]:
best_val_loss = None
n_epochs = 15
for epoch in range(n_epochs+1):
    train_loss, train_accuracy = train(model, optimizer, trainloader)
    val_loss, val_accuracy = evaluate(model, valloader)
    
    print("[Epoch: %d] val loss : %5.2f | val acuuracy : %5.2f" % (epoch, val_loss, val_accuracy))
    print("[Epoch: %d] train loss : %5.2f | train acuuracy : %5.2f" % (epoch, train_loss, train_accuracy))
    
    if not best_val_loss or val_loss < best_val_loss:
        torch.save(model.state_dict(), "./textclassificatior.pt")
        best_val_loss = val_loss

[Epoch: 0] val loss :  0.38 | val acuuracy : 82.59
[Epoch: 0] train loss :  0.44 | train acuuracy : 79.58
[Epoch: 1] val loss :  0.35 | val acuuracy : 84.72
[Epoch: 1] train loss :  0.36 | train acuuracy : 83.77
[Epoch: 2] val loss :  0.33 | val acuuracy : 85.60
[Epoch: 2] train loss :  0.33 | train acuuracy : 85.43
[Epoch: 3] val loss :  0.32 | val acuuracy : 85.88
[Epoch: 3] train loss :  0.31 | train acuuracy : 86.67
[Epoch: 4] val loss :  0.31 | val acuuracy : 86.37
[Epoch: 4] train loss :  0.29 | train acuuracy : 87.83
[Epoch: 5] val loss :  0.32 | val acuuracy : 86.26
[Epoch: 5] train loss :  0.27 | train acuuracy : 88.77
[Epoch: 6] val loss :  0.32 | val acuuracy : 86.36
[Epoch: 6] train loss :  0.25 | train acuuracy : 89.67
[Epoch: 7] val loss :  0.33 | val acuuracy : 86.10
[Epoch: 7] train loss :  0.24 | train acuuracy : 90.45
[Epoch: 8] val loss :  0.32 | val acuuracy : 86.73
[Epoch: 8] train loss :  0.22 | train acuuracy : 91.13
[Epoch: 9] val loss :  0.34 | val acuuracy : 8

In [41]:
model.load_state_dict(torch.load("./textclassificatior.pt"))

<All keys matched successfully>

In [42]:
test_loss, test_accuracy = evaluate(model, testloader)
print(test_accuracy)

tensor(86.1101, device='cuda:0')
