In [18]:
import os
import pandas as pd
import torch
import numpy as np
from torch import optim, nn
from pathlib import Path
from torchtext import data
from torchtext.vocab import Vectors
from konlpy.tag import Okt
from tqdm import tqdm
from torch.nn import functional as F
from sklearn.metrics import accuracy_score

# Trainer 

In [17]:
def train(model, train_loader, optimizer, criterion, device="cuda"):
    """Run one optimisation epoch over ``train_loader``.

    Args:
        model: Module mapping ``batch.text`` to one probability per example,
            shaped ``(batch, 1)`` (it is squeezed on dim 1 here).
        train_loader: Sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: Optimizer holding the parameters of ``model``.
        criterion: Loss called as ``criterion(probabilities, float_labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(train_correct, train_loss)``: the number of correctly classified
        examples and the sum of the per-batch loss values.
    """
    model.train()

    train_correct = 0
    train_loss = 0

    for batch in tqdm(train_loader, total=len(train_loader)):
        optimizer.zero_grad()

        data = batch.text.to(device)
        labels = batch.label.to(device)

        # Keep the prediction as a tensor attached to the graph: the loss must
        # be computed on it for backward() to reach the model parameters.
        pred = model(data).squeeze(1)

        loss = criterion(pred.float(), labels.float())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

        # Accuracy bookkeeping is done on a detached copy, on the same device
        # and dtype as the labels, so the comparison is tensor-to-tensor.
        predicted = (pred.detach() > 0.5).to(labels.dtype)
        train_correct += (predicted == labels).sum().item()

    return train_correct, train_loss


def valid(model, valid_loader, criterion, device="cuda"):
    """Evaluate ``model`` on ``valid_loader`` without updating parameters.

    Args:
        model: Module mapping ``batch.text`` to one probability per example,
            shaped ``(batch, 1)`` (it is squeezed on dim 1 here).
        valid_loader: Sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: Loss called as ``criterion(probabilities, float_labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(valid_correct, valid_loss)``: the number of correctly classified
        examples and the sum of the per-batch loss values.
    """
    model.eval()

    valid_correct = 0
    valid_loss = 0

    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for batch in tqdm(valid_loader, total=len(valid_loader)):
            data = batch.text.to(device)
            labels = batch.label.to(device)

            # The loss is computed on the tensor output (not a NumPy copy).
            pred = model(data).squeeze(1)
            loss = criterion(pred.float(), labels.float())
            valid_loss += loss.item()

            # Threshold at 0.5 and compare tensor-to-tensor on the same device.
            predicted = (pred > 0.5).to(labels.dtype)
            valid_correct += (predicted == labels).sum().item()

    return valid_correct, valid_loss


def test(model, test_loader, device="cuda"):
    model.eval()

    total = len(test_loader)

    total_pred = []

    for batch in tqdm(test_loader, total=total):
        data = batch.text.to(device)

        pred = model(data).squeeze(1)
        pred = pred.detach().cpu().numpy()
        predicted = np.round(pred > 0.5)[0]

        total_pred.append(predicted)

    return total_pred


# Load Data

In [2]:
# Directory holding the competition CSV files.
DATA_PATH = Path("data")
# Text file with the pretrained word vectors handed to torchtext's Vectors loader.
w2v_file = "./data/glove.txt"

# Read the training set, the test set and the submission template, in that order.
train_df, test_df, submission = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ("news_train.csv", "news_test.csv", "sample_submission.csv")
)

# Preprocessing

In [3]:
# Okt's morphs method is used as the tokenizer.
tokenizer = Okt().morphs

# Token-sequence field for the article text.
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)

# Single-value label field; use_vocab=False means no vocabulary is built for it
# (assumes the label column is already numeric - TODO confirm against the CSV).
LABEL = data.Field(sequential=False, use_vocab=False)

# Column order of one CSV row; columns mapped to None are ignored.
_columns = ('id', 'date', 'title', 'text', 'ord', 'label')
train_datafields = [(name, {'text': TEXT, 'label': LABEL}.get(name)) for name in _columns]
test_datafields = [(name, TEXT if name == 'text' else None) for name in _columns]

# Turn every DataFrame row into a torchtext Example and wrap them in Datasets.
train_examples = [
    data.Example.fromlist(row, train_datafields)
    for row in train_df.values.tolist()
]
train_dataset = data.Dataset(train_examples, train_datafields)

test_examples = [
    data.Example.fromlist(row, test_datafields)
    for row in test_df.values.tolist()
]
test_dataset = data.Dataset(test_examples, test_datafields)

# NOTE(review): the vocabulary is built on the full training set before the
# train/validation split below, so validation tokens contribute to the vocab.
TEXT.build_vocab(train_dataset, vectors=Vectors(w2v_file))
vocab = TEXT.vocab
word_embeddings = vocab.vectors

# 80/20 train/validation split.
train_dataset, valid_dataset = train_dataset.split(split_ratio=0.8)

In [6]:
# Hyperparameters and run configuration.
batch_size = 128
lr = 0.01
epochs = 10
# Must equal the width of the pretrained vectors, since the model sizes its
# LSTM input from this value.
embedding_dim = 100
vocab_size = len(vocab)
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# machine without CUDA instead of failing at the first .to(device) call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
save_path = 'model.pth'

In [7]:
# Training batches: bucketed by text length and shuffled.
train_iter = data.BucketIterator((train_dataset),
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.text),
                                repeat=False,
                                shuffle=True)

# Validation batches: bucketed by text length, not shuffled.
valid_iter = data.BucketIterator((valid_dataset),
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.text),
                                repeat=False,
                                shuffle=False)

# Test batches: predictions are written row-by-row into the submission frame,
# so the examples must come out in dataset order. A plain Iterator with
# sorting and shuffling explicitly disabled guarantees that; length bucketing
# brings nothing at batch_size=1 anyway.
test_iter = data.Iterator(test_dataset,
                          batch_size=1,
                          train=False,
                          sort=False,
                          sort_within_batch=False,
                          repeat=False,
                          shuffle=False)

print("Loaded {} training examples".format(len(train_dataset)))
print("Loaded {} validation examples".format(len(valid_dataset)))
print("Loaded {} testing examples".format(len(test_dataset)))

In [19]:
class RCNN(nn.Module):
    """Recurrent-convolutional binary text classifier.

    Token embeddings are fed through a bidirectional LSTM; each time step's
    LSTM output is concatenated with its embedding, projected to 128 features
    with a tanh layer, max-pooled over the sequence, and mapped to a single
    sigmoid probability.

    Args:
        embedding_dim: Width of the token embeddings.
        vocab_size: Number of rows of the embedding table.
        num_layers: Number of stacked LSTM layers.
        hidden_size: LSTM hidden size per direction.
        dropout: Dropout probability applied to the pooled features and,
            when ``num_layers > 1``, between LSTM layers.
        word_embeddings: Optional pretrained embedding matrix; when given it
            replaces the embedding weights and is kept frozen.

    Raises:
        ValueError: If ``word_embeddings`` does not have ``embedding_dim``
            columns.
    """

    def __init__(self,
                 embedding_dim,
                 vocab_size,
                 num_layers = 1,
                 hidden_size=64,
                 dropout=0.8,
                 word_embeddings=None):

        super(RCNN, self).__init__()

        # Fail early with a clear message: a width mismatch would otherwise
        # only surface as an LSTM input-size error during the first forward.
        if word_embeddings is not None and word_embeddings.shape[-1] != embedding_dim:
            raise ValueError(
                "word_embeddings has width {} but embedding_dim is {}".format(
                    word_embeddings.shape[-1], embedding_dim))

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        if word_embeddings is not None:
            # Pretrained vectors are used as-is and excluded from training.
            self.embeddings.weight = nn.Parameter(word_embeddings, requires_grad=False)

        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer has no effect and emits a warning.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size = embedding_dim,
                            hidden_size = hidden_size,
                            num_layers = num_layers,
                            dropout = lstm_dropout,
                            bidirectional = True)

        self.dropout = nn.Dropout(dropout)

        self.W = nn.Linear(embedding_dim + 2*hidden_size, 128)
        self.tanh = nn.Tanh()
        self.fc = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: Token-id tensor of shape ``(seq_len, batch_size)``.

        Returns:
            Tensor of shape ``(batch_size, 1)`` with values in ``[0, 1]``.
        """
        # x.shape = (seq_len, batch_size)
        embedded_sent = self.embeddings(x)
        # embedded_sent.shape = (seq_len, batch_size, embed_size)

        lstm_out, _ = self.lstm(embedded_sent)
        # lstm_out.shape = (seq_len, batch_size, 2 * hidden_size)

        input_features = torch.cat([lstm_out, embedded_sent], 2).permute(1, 0, 2)
        # input_features.shape = (batch_size, seq_len, embed_size + 2*hidden_size)

        linear_output = self.tanh(self.W(input_features))
        # linear_output.shape = (batch_size, seq_len, 128)

        # max_pool1d pools over the last dimension, so move seq_len there.
        linear_output = linear_output.permute(0, 2, 1)

        max_out_features = F.max_pool1d(linear_output, linear_output.shape[2]).squeeze(2)
        # max_out_features.shape = (batch_size, 128)

        max_out_features = self.dropout(max_out_features)
        final_out = self.fc(max_out_features)

        return self.sigmoid(final_out)

# Build the classifier on top of the pretrained vectors, then move it to the
# configured device.
model = RCNN(
    embedding_dim=embedding_dim,
    vocab_size=vocab_size,
    word_embeddings=word_embeddings,
)
model = model.to(device)

In [9]:
# Binary cross-entropy on probabilities, placed on the training device.
criterion = nn.BCELoss()
criterion = criterion.to(device)

# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=lr)

# Multiply the learning rate by 0.1 at each milestone epoch.
# NOTE(review): the milestones (30/60/90) lie beyond the `epochs` value set in
# this notebook (10), so the schedule never fires in a default run - confirm intent.
scheduler = optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[30, 60, 90],
    gamma=0.1,
)

In [10]:
# Best validation accuracy seen so far; the training loop saves a checkpoint
# whenever this value is exceeded.
best_acc = 0
# Number of batches per epoch in the training and validation iterators.
train_total, valid_total = len(train_iter), len(valid_iter)

In [None]:
# Train for `epochs` epochs; after each one, evaluate on the validation split
# and checkpoint the model whenever validation accuracy improves.
for e in range(0, epochs):
    train_correct, train_loss = train(model, train_iter, optimizer, criterion, device=device)
    # Accuracy is a per-example quantity. The last batch can be smaller than
    # batch_size, so divide by the real number of examples.
    train_acc = train_correct / len(train_dataset)
    # train() returns the sum of per-batch losses and the criterion already
    # averages within a batch, so the epoch mean is the sum over the batch count.
    train_loss = train_loss / len(train_iter)

    valid_correct, valid_loss = valid(model, valid_iter, criterion, device=device)
    valid_acc = valid_correct / len(valid_dataset)
    valid_loss = valid_loss / len(valid_iter)

    scheduler.step()

    # Report the 1-based current epoch out of the total.
    print(f"[EPOCH : {e + 1} / {epochs}] || [TRAIN ACC : {train_acc}] || [TRAIN LOSS : {train_loss}]"
            f"|| [VALID ACC : {valid_acc}] || [VALID LOSS : {valid_loss}]")

    if best_acc < valid_acc:
        # Keep only the best-performing weights (plus optimizer state) on disk.
        torch.save({'epoch': e,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()},
                    save_path)
        best_acc = valid_acc

In [15]:
# Restore the best checkpoint written by the training loop. map_location
# remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU can also be loaded on a CPU-only machine.
checkpoint = torch.load(save_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Predict a label for every test example.
pred = test(model, test_iter, device=device)

100%|█████████████████████████████████████████████████████████████████████████| 142565/142565 [05:42<00:00, 416.54it/s]


In [16]:
# Fill the submission template's 'info' column with the predictions and write
# it out without the DataFrame index.
submission['info'] = pred
submission.to_csv(path_or_buf='starter.csv', index=False)