# 2019320016 차주한


### Load dataset and split into train & validation

In [1]:
import torch
from torchtext.legacy import data
from torchtext.legacy.data import TabularDataset

mail_field = data.Field(sequential=True, use_vocab=True, tokenize=str.split, lower=True, batch_first=True, fix_length=300)
label_field = data.Field(sequential=False, use_vocab=False, batch_first=False, is_target=True)
id_field = data.Field(sequential=False, use_vocab=False, batch_first=False, is_target=True)

loaded_data = TabularDataset(path="./data/train.csv",
                                 format="csv",
                                 fields=[("id" , id_field), ("label", label_field), ("mail", mail_field)],
                                 skip_header=True)

train_data, val_data = loaded_data.split(split_ratio=0.95, stratified=True, strata_field="label")

print("train data: ", len(train_data))
print(train_data.fields)
print("validation data: ", len(val_data))
print(val_data.fields)

train data:  3439
{'id': <torchtext.legacy.data.field.Field object at 0x000001689C0E1A00>, 'label': <torchtext.legacy.data.field.Field object at 0x000001689C0E28B0>, 'mail': <torchtext.legacy.data.field.Field object at 0x000001689C0E18E0>}
validation data:  181
{'id': <torchtext.legacy.data.field.Field object at 0x000001689C0E1A00>, 'label': <torchtext.legacy.data.field.Field object at 0x000001689C0E28B0>, 'mail': <torchtext.legacy.data.field.Field object at 0x000001689C0E18E0>}


### Build vocabulary dictionary

In [2]:
mail_field.build_vocab(train_data, min_freq=3)
print("size of vocab: ", len(mail_field.vocab))
print(mail_field.vocab.stoi)

size of vocab:  11819


### Make data loader

In [3]:
from torchtext.legacy.data import Iterator

batch_size = 32
train_loader = Iterator(dataset=train_data, batch_size=batch_size, device="cuda")
val_loader = Iterator(dataset=val_data, batch_size=len(val_data), device="cuda", shuffle=False)

### Make recurrent net model

In [4]:
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self, n_layers, hidden_size, embedding_dim, dropout):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=len(mail_field.vocab), embedding_dim=embedding_dim, padding_idx=1)
        self.rnn = nn.GRU(num_layers=n_layers,input_size=embedding_dim ,hidden_size=hidden_size, batch_first=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        h, _ = self.rnn(embedded)
        h_t = h[:, -1, :]
        activation = self.relu(h_t)
        dropped = self.dropout(activation)
        logit = self.linear(dropped)
        return logit.squeeze()

### Train & evaluate function

In [5]:
from sklearn.metrics import f1_score

def train(model, loss_fn, optimizer, loader):
    model.train()
    for batch in loader:
        x, y = batch.mail, batch.label
        optimizer.zero_grad()

        logit = model(x)
        loss = loss_fn(logit, y.float())
        loss.backward()

        optimizer.step()

def evaluate(model, loss_fn, loader):
    model.eval()
    total_loss, corrects = 0, 0

    y_predict = torch.tensor([]).to("cuda")
    y_true = torch.tensor(()).to("cuda")

    for batch in loader:
        x,y = batch.mail, batch.label

        logit = model(x)
        loss = loss_fn(logit, y.float())
        total_loss += loss.item()

        predicted = (logit > 0) * 1.0
        corrects += (predicted == y).sum()

        y_predict = torch.cat((y_predict, predicted), 0)
        y_true = torch.cat((y_true, y), 0)

    score = f1_score(y_true.cpu(), y_predict.cpu())

    size = len(loader.dataset)
    avg_accuracy = corrects / size
    avg_loss = total_loss / size

    return avg_loss, avg_accuracy, score

### Load previous best model if exists

In [6]:
import copy

best_val_acc = 0
best_model = None

### Train model

In [7]:
model = MyModel(n_layers=1, hidden_size=128, embedding_dim=32, dropout=0.7).to("cuda")

lr = 5e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)

loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum").to("cuda")

for e in range(30):
    train(model, loss_fn, optimizer, train_loader)
    train_loss, train_accuracy, train_f1 = evaluate(model, loss_fn, train_loader)
    val_loss, val_accuracy, val_f1 = evaluate(model, loss_fn, val_loader)

    print("[Epoch: %2d] train loss: %5.4f | train accuracy: %5.4f | train f1: %5.2f | val loss: %5.4f | val accuracy: %5.4f | val f1: %5.2f" % (e + 1, train_loss, train_accuracy, train_f1, val_loss, val_accuracy, val_f1))

    if  val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        best_model = copy.deepcopy(model)

print("Best model accuracy: ", best_val_acc)

[Epoch:  1] train loss: 0.2272 | train accuracy: 0.9128 | train f1:  0.83 | val loss: 0.1942 | val accuracy: 0.9448 | val f1:  0.90
[Epoch:  2] train loss: 0.3655 | train accuracy: 0.8561 | train f1:  0.80 | val loss: 0.5264 | val accuracy: 0.8177 | val f1:  0.76
[Epoch:  3] train loss: 0.0312 | train accuracy: 0.9936 | train f1:  0.99 | val loss: 0.0347 | val accuracy: 0.9834 | val f1:  0.97
[Epoch:  4] train loss: 0.0148 | train accuracy: 0.9962 | train f1:  0.99 | val loss: 0.0500 | val accuracy: 0.9779 | val f1:  0.96
[Epoch:  5] train loss: 0.0104 | train accuracy: 0.9980 | train f1:  1.00 | val loss: 0.0288 | val accuracy: 0.9890 | val f1:  0.98
[Epoch:  6] train loss: 0.0104 | train accuracy: 0.9980 | train f1:  1.00 | val loss: 0.0335 | val accuracy: 0.9834 | val f1:  0.97
[Epoch:  7] train loss: 0.0059 | train accuracy: 0.9988 | train f1:  1.00 | val loss: 0.0322 | val accuracy: 0.9834 | val f1:  0.97
[Epoch:  8] train loss: 0.0047 | train accuracy: 0.9991 | train f1:  1.00 | 

### Predict on test data

In [8]:
test_data = TabularDataset(path="./data/test.csv",
                                 format="csv",
                                 fields=[("id" , id_field), ("mail", mail_field)],
                                 skip_header=True)

test_loader = Iterator(dataset=test_data, batch_size=len(test_data), device="cuda", shuffle=False)

best_model.eval()

res = None

for batch in test_loader:
    x = batch.mail
    id = batch.id

    print(id)

    logit = best_model(x)
    res = (logit > 0) * 1

print(res)

tensor([   0,    1,    2,  ..., 1548, 1549, 1550], device='cuda:0')


  result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,


tensor([0, 0, 0,  ..., 1, 0, 0], device='cuda:0')


In [9]:
import csv

f = open("result_rnn.csv", "w", newline="")
wr = csv.writer(f)
wr.writerow(["id", "label"])

id = 0
for i in res:
    wr.writerow([id, i.item()])
    id += 1

f.close()