In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

import pandas as pd

from collections import Counter
from konlpy.tag import Mecab

import time

import numpy as np

In [2]:
import warnings 
warnings.simplefilter('ignore')

In [3]:
df_train = pd.read_csv("./naver_train.csv", encoding="utf-8-sig")
df_test = pd.read_csv("./naver_test.csv", encoding="utf-8-sig")

In [4]:
df_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙 진짜 짜증나네요 목소리,0
1,3819312,흠포스터 보고 초딩 영화 줄오버 연기조차 가볍지 않구나,1
2,10265843,너무 재밓었다 그래서 보는 것을 추천한다,0
3,9045019,교도소 이야기구먼 솔직히 재미는 없다 평점 조정,0
4,6483659,사이몬 페그의 익살스런 연기가 돋보였던 영화 스파이더맨에서 늙어 보이기만 했던 커스...,1


In [5]:
def delete_null(data):
    data = data.dropna(axis=0).reset_index(drop=True)
    space_idx = []
    for i in range(len(data)):
        if str.isspace(data.iloc[i, 1]) == True:
            space_idx.append(i)
    data = data.drop(space_idx)
    
    return data

In [6]:
df_train, df_test = map(delete_null, [df_train, df_test])

In [7]:
trainset = np.array(df_train.drop(["id"], axis = 1))
testset = np.array(df_test.drop(["id"], axis = 1))

In [8]:
trainset, valset= train_test_split(trainset, test_size=0.1)

In [9]:
X_train = trainset[:, 0]
y_train = trainset[:, 1]
X_val = valset[:, 0]
y_val = valset[:, 1]
X_test = testset[:, 0]
y_test = testset[:, 1]

In [10]:
y_train = y_train.astype(np.int64)
y_val = y_val.astype(np.int64)
y_test = y_test.astype(np.int64)

In [11]:
m = Mecab("C:\mecab\mecab-ko-dic")

def tokenizer(text):
    return m.morphs(text)

In [12]:
vectorizer = TfidfVectorizer(tokenizer=tokenizer, max_features=1000)
X_train = vectorizer.fit_transform(X_train).toarray()
X_val = vectorizer.transform(X_val).toarray()
X_test = vectorizer.transform(X_test).toarray()

In [13]:
print(X_train.shape)

(134681, 1000)


In [14]:
class CustomDataset(Dataset):
    def __init__(self, x, y,):
        self.x = x
        self.y = y
        
    def __getitem__(self, index):
        return (torch.FloatTensor(self.x[index]), self.y[index])
    
    def __len__(self):
        return len(self.x)

In [15]:
trainset = CustomDataset(X_train, y_train)
valset = CustomDataset(X_val, y_val)
testset = CustomDataset(X_test, y_test)

In [16]:
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
valloader = DataLoader(valset, batch_size=64, shuffle=True)
testloader = DataLoader(testset, batch_size=64, shuffle=True)

In [17]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu 와 cuda 중 다음 기기로 학슴함: ", DEVICE)

cpu 와 cuda 중 다음 기기로 학슴함:  cuda


In [18]:
class MLP(nn.Module):
    def __init__(self, input_size, n_classes):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 500)
        self.fc2 = nn.Linear(500, 250)
        self.fc3 = nn.Linear(250, 125)
        self.fc4 = nn.Linear(125, 2)

    def forward(self, x):
        logit = self.fc1(x)
        logit = self.fc2(logit)
        logit = self.fc3(logit)
        logit = self.fc4(logit)
        
        return logit

In [19]:
n_classes = 2
model = MLP(1000, n_classes).to(DEVICE)
lr = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [20]:
def train(model, optimizer, train_iter):
    model.train()
    corrects, total_loss = 0, 0
    size = 0
    for b, batch in enumerate(train_iter):
        x , y = batch
        x = x.to(DEVICE)
        y = y.long().to(DEVICE)
        y = y.reshape(-1)
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction="sum")
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
        size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy 

In [21]:
def evaluate(model, val_iter):
    model.eval()
    corrects, total_loss = 0, 0
    size = 0
    with torch.no_grad():
        for batch in val_iter:
            x , y = batch
            x = x.to(DEVICE)
            y = y.long().to(DEVICE)
            y = y.reshape(-1)
            logit = model(x)
            loss = F.cross_entropy(logit, y, reduction="sum")
            total_loss += loss.item()
            corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()    
            size += x.shape[0]
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [22]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [23]:
best_val_loss = None
n_epochs = 15
for epoch in range(n_epochs):
    
    start_time = time.time()
    
    train_loss, train_accuracy = train(model, optimizer, trainloader)
    val_loss, val_accuracy = evaluate(model, valloader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_accuracy:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_accuracy:.2f}%')
    
    if not best_val_loss or val_loss < best_val_loss:
        torch.save(model.state_dict(), "./textclassificatior.pt")
        best_val_loss = val_loss

Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.423 | Train Acc: 79.98%
	 Val. Loss: 0.406 |  Val. Acc: 81.26%
Epoch: 02 | Epoch Time: 0m 3s
	Train Loss: 0.410 | Train Acc: 80.94%
	 Val. Loss: 0.407 |  Val. Acc: 81.30%
Epoch: 03 | Epoch Time: 0m 3s
	Train Loss: 0.407 | Train Acc: 81.12%
	 Val. Loss: 0.405 |  Val. Acc: 81.31%
Epoch: 04 | Epoch Time: 0m 3s
	Train Loss: 0.406 | Train Acc: 81.15%
	 Val. Loss: 0.411 |  Val. Acc: 81.51%
Epoch: 05 | Epoch Time: 0m 3s
	Train Loss: 0.405 | Train Acc: 81.21%
	 Val. Loss: 0.403 |  Val. Acc: 81.68%
Epoch: 06 | Epoch Time: 0m 3s
	Train Loss: 0.404 | Train Acc: 81.32%
	 Val. Loss: 0.405 |  Val. Acc: 81.42%
Epoch: 07 | Epoch Time: 0m 3s
	Train Loss: 0.404 | Train Acc: 81.16%
	 Val. Loss: 0.403 |  Val. Acc: 81.42%
Epoch: 08 | Epoch Time: 0m 3s
	Train Loss: 0.403 | Train Acc: 81.24%
	 Val. Loss: 0.402 |  Val. Acc: 81.74%
Epoch: 09 | Epoch Time: 0m 3s
	Train Loss: 0.403 | Train Acc: 81.31%
	 Val. Loss: 0.403 |  Val. Acc: 81.52%
Epoch: 10 | Epoch Time: 0m 3

In [24]:
model.load_state_dict(torch.load("./textclassificatior.pt"))

<All keys matched successfully>

In [25]:
test_loss, test_accuracy = evaluate(model, testloader)
print(test_accuracy)

tensor(81.2108, device='cuda:0')
