# 2019320016 차주한


### Load dataset and split into train & validation

In [1]:
import torch
from torchtext.legacy import data
from torchtext.legacy.data import TabularDataset

mail_field = data.Field(sequential=True, use_vocab=True, tokenize=str.split, lower=True, batch_first=True, fix_length=300)
label_field = data.Field(sequential=False, use_vocab=False, batch_first=False, is_target=True)
id_field = data.Field(sequential=False, use_vocab=False, batch_first=False, is_target=True)

loaded_data = TabularDataset(path="./data/train.csv",
                                 format="csv",
                                 fields=[("id" , id_field), ("label", label_field), ("mail", mail_field)],
                                 skip_header=True)

train_data, val_data = loaded_data.split(split_ratio=0.95, stratified=True, strata_field="label")

print("train data: ", len(train_data))
print(train_data.fields)
print("validation data: ", len(val_data))
print(val_data.fields)

train data:  3439
{'id': <torchtext.legacy.data.field.Field object at 0x00000254DFA0EA90>, 'label': <torchtext.legacy.data.field.Field object at 0x00000254DF9F5280>, 'mail': <torchtext.legacy.data.field.Field object at 0x00000254DFA33880>}
validation data:  181
{'id': <torchtext.legacy.data.field.Field object at 0x00000254DFA0EA90>, 'label': <torchtext.legacy.data.field.Field object at 0x00000254DF9F5280>, 'mail': <torchtext.legacy.data.field.Field object at 0x00000254DFA33880>}


### Build vocabulary dictionary

In [2]:
mail_field.build_vocab(train_data, min_freq=3)
print("size of vocab: ", len(mail_field.vocab))
print(mail_field.vocab.stoi)

size of vocab:  11746


### Make data loader

In [3]:
from torchtext.legacy.data import Iterator

batch_size = 32
train_loader = Iterator(dataset=train_data, batch_size=batch_size, device="cuda")
val_loader = Iterator(dataset=val_data, batch_size=len(val_data), device="cuda", shuffle=False)

### Make recurrent net model

In [4]:
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self, n_layers, hidden_size, embedding_dim, dropout):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=len(mail_field.vocab), embedding_dim=embedding_dim, padding_idx=1)
        self.rnn = nn.GRU(num_layers=n_layers,input_size=embedding_dim ,hidden_size=hidden_size, batch_first=True)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        h, _ = self.rnn(embedded)
        h_t = h[:, -1, :]
        activation = self.relu(h_t)
        dropped = self.dropout(activation)
        logit = self.linear(dropped)
        return logit.squeeze()

### Train & evaluate function

In [5]:
from sklearn.metrics import f1_score

def train(model, loss_fn, optimizer, loader):
    model.train()
    for batch in loader:
        x, y = batch.mail, batch.label
        optimizer.zero_grad()

        logit = model(x)
        loss = loss_fn(logit, y.float())
        loss.backward()

        optimizer.step()

def evaluate(model, loss_fn, loader):
    model.eval()
    total_loss, corrects = 0, 0

    y_predict = torch.tensor([]).to("cuda")
    y_true = torch.tensor(()).to("cuda")

    for batch in loader:
        x,y = batch.mail, batch.label

        logit = model(x)
        loss = loss_fn(logit, y.float())
        total_loss += loss.item()

        predicted = (logit > 0) * 1.0
        corrects += (predicted == y).sum()

        y_predict = torch.cat((y_predict, predicted), 0)
        y_true = torch.cat((y_true, y), 0)

    score = f1_score(y_true.cpu(), y_predict.cpu())

    size = len(loader.dataset)
    avg_accuracy = corrects / size
    avg_loss = total_loss / size

    return avg_loss, avg_accuracy, score

In [6]:
import copy

best_val_acc = 0
best_model = None

### Train model

In [8]:
model = MyModel(n_layers=1, hidden_size=128, embedding_dim=32, dropout=0.7).to("cuda")

lr = 5e-3
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)

loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum").to("cuda")

for e in range(30):
    train(model, loss_fn, optimizer, train_loader)
    train_loss, train_accuracy, train_f1 = evaluate(model, loss_fn, train_loader)
    val_loss, val_accuracy, val_f1 = evaluate(model, loss_fn, val_loader)

    print("[Epoch: %2d] train loss: %5.4f | train accuracy: %5.4f | train f1: %5.2f | val loss: %5.4f | val accuracy: %5.4f | val f1: %5.2f" % (e + 1, train_loss, train_accuracy, train_f1, val_loss, val_accuracy, val_f1))

    if  val_accuracy > best_val_acc:
        best_val_acc = val_accuracy
        best_model = copy.deepcopy(model)

print("Best model accuracy: ", best_val_acc)

[Epoch:  1] train loss: 0.4532 | train accuracy: 0.7502 | train f1:  0.22 | val loss: 0.4484 | val accuracy: 0.7514 | val f1:  0.26
[Epoch:  2] train loss: 0.3318 | train accuracy: 0.8779 | train f1:  0.76 | val loss: 0.3248 | val accuracy: 0.8785 | val f1:  0.80
[Epoch:  3] train loss: 0.1547 | train accuracy: 0.9471 | train f1:  0.91 | val loss: 0.2532 | val accuracy: 0.9116 | val f1:  0.87
[Epoch:  4] train loss: 0.0829 | train accuracy: 0.9782 | train f1:  0.96 | val loss: 0.1194 | val accuracy: 0.9669 | val f1:  0.95
[Epoch:  5] train loss: 0.0368 | train accuracy: 0.9919 | train f1:  0.99 | val loss: 0.0961 | val accuracy: 0.9558 | val f1:  0.93
[Epoch:  6] train loss: 0.0642 | train accuracy: 0.9802 | train f1:  0.96 | val loss: 0.0901 | val accuracy: 0.9779 | val f1:  0.96
[Epoch:  7] train loss: 0.0078 | train accuracy: 0.9985 | train f1:  1.00 | val loss: 0.0935 | val accuracy: 0.9724 | val f1:  0.95
[Epoch:  8] train loss: 0.0091 | train accuracy: 0.9985 | train f1:  1.00 | 

### Prepare datas for naive bayse

In [9]:
X_train = []
y_train = []
X_val = []
y_val = []

for i in train_data:
    X_train.append(" ".join(i.mail))
    y_train.append(i.label)

for i in val_data:
    X_val.append(" ".join(i.mail))
    y_val.append(i.label)

print(vars(train_data[0]))
print("label: ", y_train[0], ", mail: ", X_train[0])

print(vars(val_data[0]))
print("label: ", y_val[0], ", mail: ", X_val[0])

{'id': '2228', 'label': '1', 'mail': ['subject:', 're', ':', 'final', 'notice', '#', '7', 'v', '8477', 'hi', 'again', ',', 'i', 'sent', 'you', 'an', 'email', 'last', 'week', 'and', 'need', 'to', 'confirm', 'everything', 'now', '.', 'please', 'read', 'info', 'below', 'and', 'let', 'me', 'know', 'if', 'you', 'have', 'any', 'questions', '.', 'we', 'are', 'accepting', 'your', 'mo', 'rtgage', 'application', '.', 'if', 'you', 'have', 'poor', 'cr', '.', 'edit', ',', 'it', 'is', 'ok', '.', 'you', 'can', 'get', 'a', '$', '200', ',', '000', 'loa', 'n', 'for', '$', '650', '/', 'month', 'payment', '.', 'appr', 'oval', 'process', 'will', 'take', '1', 'minute', '.', 'just', 'visit', 'the', 'link', 'below', 'and', 'fill', 'short', 'form', '.', 'http', ':', '/', '/', 'gffefv', '.', 'net', '/', 'azwml', 'sincerely', ',', 'manager', ':', 'geoffrey', 'winston', 'heritage', 'financing']}
label:  1 , mail:  subject: re : final notice # 7 v 8477 hi again , i sent you an email last week and need to confirm e

### Preprocess data for naive bayse

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

def tokenizer(text):
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return tokens

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hanchaa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanchaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hanchaa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Make word of bag and transform to feature vector

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(tokenizer=tokenizer)
X_train_vectorized = vectorizer.fit_transform(X_train)
print("subject dtm shape: " + str(X_train_vectorized.shape))

subject dtm shape: (3439, 36408)


### Train Naive Bayse

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

NB_model = MultinomialNB(alpha=1)
NB_model.fit(X_train_vectorized, y_train)

predicted = NB_model.predict(X_train_vectorized)

print("Train accuracy: ", accuracy_score(y_train, predicted))

Train accuracy:  0.9851701075894155


### Test validation set on Naive Bayse

In [13]:
X_val_vectorized = vectorizer.transform(X_val)
predicted = NB_model.predict(X_val_vectorized)

print(accuracy_score(y_val, predicted))

0.9834254143646409


### Validation on ensemble of GRU and Naive Bayse

In [23]:
best_model.eval()

for batch in val_loader:
    x = batch.mail
    id = batch.id

    logit = best_model(x)
    gru_proba = nn.Sigmoid()(logit).detach().cpu().numpy()

NB_proba = NB_model.predict_proba(X_val_vectorized)[:, 1]

p = 0.5

ensembled_proba = p * NB_proba + (1 - p) * gru_proba
ensembled_predicted = ['0' if x == 0 else '1' for x in (ensembled_proba > 0.5) * 1]

print(y_val)
print(ensembled_predicted)

print(accuracy_score(y_val, ensembled_predicted))

['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1'

  result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,


### Predict on test data

In [24]:
test_data = TabularDataset(path="./data/test.csv",
                                 format="csv",
                                 fields=[("id" , id_field), ("mail", mail_field)],
                                 skip_header=True)

test_loader = Iterator(dataset=test_data, batch_size=len(test_data), device="cuda", shuffle=False)

best_model.eval()

for batch in test_loader:
    x = batch.mail
    id = batch.id

    print(id)

    logit = best_model(x)
    gru_proba = nn.Sigmoid()(logit).detach().cpu().numpy()

X_test = []

for i in test_data:
    X_test.append(" ".join(i.mail))

X_test_vectorized = vectorizer.transform(X_test)
NB_proba = NB_model.predict_proba(X_test_vectorized)[:, 1]

ensembled_proba = p * NB_proba + (1 - p) * gru_proba
ensembled_predicted = (ensembled_proba > 0.5) * 1

print(ensembled_predicted)

tensor([   0,    1,    2,  ..., 1548, 1549, 1550], device='cuda:0')


  result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers,


[0 0 0 ... 1 0 0]


In [25]:
import csv

f = open("temp.csv", "w", newline="")
wr = csv.writer(f)
wr.writerow(["id", "label"])

id = 0
for i in ensembled_predicted:
    wr.writerow([id, i])
    id += 1

f.close()