# 2019320016 차주한


### Load dataset and split into train & validation

In [49]:
import torch
from torchtext.legacy import data
from torchtext.legacy.data import TabularDataset

mail_field = data.Field(sequential=True, use_vocab=True, tokenize=str.split, lower=True, batch_first=True, fix_length=300)
label_field = data.Field(sequential=False, use_vocab=False, batch_first=False, is_target=True)

loaded_data = TabularDataset(path="./data/train.csv",
                                 format="csv",
                                 fields=[(None , None), ("label", label_field), ("mail", mail_field)],
                                 skip_header=True)

train_data, val_data = loaded_data.split(split_ratio=0.8, stratified=True, strata_field="label")

print("train data: ", len(train_data))
print(train_data.fields)
print("validation data: ", len(val_data))
print(val_data.fields)

train data:  2896
{None: None, 'label': <torchtext.legacy.data.field.Field object at 0x000001EAA6779AC0>, 'mail': <torchtext.legacy.data.field.Field object at 0x000001EAA6779520>}
validation data:  724
{None: None, 'label': <torchtext.legacy.data.field.Field object at 0x000001EAA6779AC0>, 'mail': <torchtext.legacy.data.field.Field object at 0x000001EAA6779520>}


### Build vocabulary dictionary

In [50]:
mail_field.build_vocab(train_data)
print("size of vocab: ", len(mail_field.vocab))
print(mail_field.vocab.stoi)

size of vocab:  36539


### Make data loader

In [52]:
from torchtext.legacy.data import Iterator

batch_size = 32
train_loader = Iterator(dataset=train_data, batch_size=batch_size, device="cuda")
val_loader = Iterator(dataset=val_data, batch_size=batch_size, device="cuda")

### Make LSTM model

In [87]:
import torch.nn as nn
type = torch.cuda.FloatTensor

class LSTM(nn.Module):
    def __init__(self, n_layers, hidden_size, embedding_dim):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=len(mail_field.vocab), embedding_dim=32, padding_idx=1)
        self.lstm = nn.LSTM(input_size=32, hidden_size=256, batch_first=True)
        self.linear = nn.Linear(256, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        h, _ = self.lstm(embedded)
        h_t = h[:, -1, :]
        output = self.linear(h_t)
        return output.squeeze()


### Train & evaluate function

In [94]:
def train(model, loss_fn, optimizer, loader):
    model.train()
    for batch in loader:
        x, y = batch.mail, batch.label
        optimizer.zero_grad()

        scores = model(x)
        loss = loss_fn(scores, y.float())
        loss.backward()

        optimizer.step()

def evaluate(model, loss_fn, loader):
    model.eval()
    corrects, total_loss = 0, 0

    for batch in loader:
        x,y = batch.mail, batch.label

        scores = model(x)
        loss = loss_fn(scores, y.float())
        total_loss += loss.item()

        predicted = (nn.Sigmoid()(scores) > 0.5) * 1.0

        corrects += (predicted == y.float()).sum()

    size = len(loader.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size

    return avg_loss, avg_accuracy

### Train model

In [99]:
import copy
best_val_acc = 0
best_model = None

model = LSTM(1, 256, 64).type(type)

lr = 1e-2
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum").type(type)

for e in range(10):
    train(model, loss_fn, optimizer, train_loader)
    train_loss, train_accuracy = evaluate(model, loss_fn, train_loader)
    val_loss, val_accuracy = evaluate(model, loss_fn, val_loader)

    print("[Epoch: %d] train loss: %5.4f | train accuracy: %5.2f | validation loss: %5.4f | validation accuracy: %5.2f" % (e + 1, train_loss, train_accuracy, val_loss, val_accuracy))

    if  val_accuracy > best_val_acc:
        best_val_acc_model = val_accuracy
        best_model = copy.deepcopy(model)

[Epoch: 1] train loss: 0.5733 | train accuracy: 71.65 | validation loss: 0.5952 | validation accuracy: 71.55
[Epoch: 2] train loss: 0.5777 | train accuracy: 71.51 | validation loss: 0.5944 | validation accuracy: 71.55
[Epoch: 3] train loss: 0.5328 | train accuracy: 73.48 | validation loss: 0.5993 | validation accuracy: 71.69
[Epoch: 4] train loss: 0.3320 | train accuracy: 81.77 | validation loss: 0.4109 | validation accuracy: 78.31
[Epoch: 5] train loss: 0.2210 | train accuracy: 90.47 | validation loss: 0.3383 | validation accuracy: 85.77
[Epoch: 6] train loss: 0.0485 | train accuracy: 98.24 | validation loss: 0.2112 | validation accuracy: 92.82
[Epoch: 7] train loss: 0.0185 | train accuracy: 99.55 | validation loss: 0.1441 | validation accuracy: 95.86
[Epoch: 8] train loss: 0.0082 | train accuracy: 99.86 | validation loss: 0.1400 | validation accuracy: 95.99
[Epoch: 9] train loss: 0.0063 | train accuracy: 99.79 | validation loss: 0.1485 | validation accuracy: 95.86
[Epoch: 10] train l

### Predict on test data

In [120]:
test_data = TabularDataset(path="./data/test.csv",
                                 format="csv",
                                 fields=[(None , None), ("mail", mail_field)],
                                 skip_header=True)

test_loader = Iterator(dataset=test_data, batch_size=batch_size, device="cuda")

res = torch.tensor([]).to("cuda")

model.eval()

for batch in test_loader:
    x = batch.mail

    scores = model(x)
    predicted = (nn.Sigmoid()(scores) > 0.5) * 1

    res = torch.cat((res, predicted), 0)

res = res.int().cpu()

In [125]:
import csv

print(res)

f = open("result_rnn.csv", "w", newline="")
wr = csv.writer(f)
wr.writerow(["id", "label"])

id = 0
for i in res:
    wr.writerow([id, i.item()])
    id += 1

f.close()

tensor([0, 0, 0,  ..., 0, 0, 0], dtype=torch.int32)
