In [81]:
max_length = 256
import pandas as pd

df = pd.read_csv("sms.tsv", sep='\t')
print(df.columns)
print(df.shape)
df.head()

Index(['label', 'sms'], dtype='object')
(5572, 2)


Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [82]:
classes = sorted(set(df['label']))
class_to_idx = {c: i for i, c in enumerate(classes)}

nclass = len(classes)

print("# of classes: %d" % nclass)
print(classes)
print(class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


In [83]:
new_df = pd.DataFrame({"label": df["label"], "sms": df["sms"].str.slice(start=0, stop=max_length)})

print(len(new_df))
new_df = pd.DataFrame(new_df.drop_duplicates())

print(len(new_df))

df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

5572
5169


Unnamed: 0,label,sms
0,ham,So lets make it saturday or monday as per conv...
1,ham,Lol u still feeling sick?
2,ham,Looks like u wil b getting a headstart im leav...
3,ham,Which channel:-):-):):-).
4,ham,We don call like &lt;#&gt; times oh. No give...


In [84]:
train_ratio = 0.9

s, e = 0, int(df_shuffled.shape[0] * train_ratio)
df_train = pd.DataFrame({"label": df_shuffled["label"][s:e], "sms":df_shuffled["sms"][s:e]})
print("index for train: %d~%d" % (s, e))

s, e = e, e + int(df_shuffled.shape[0] * (1.0 - train_ratio))
print("index for test: %d ~ %d" % (s, e))
df_test = pd.DataFrame({"label": df_shuffled["label"][s:e], "sms":df_shuffled["sms"][s:e]})

index for train: 0~4652
index for test: 4652 ~ 5168


In [85]:
print(df_train.shape)
print(df_test.shape)

(4652, 2)
(516, 2)


In [86]:
df_train.to_csv("./sms.maxlen.uniq.shuf.train.tsv", header=False, index=False, sep='\t')
df_test.to_csv("./sms.maxlen.uniq.shuf.test.tsv", header=False, index=False, sep='\t')

In [87]:
import torch
print(torch.__version__)

1.4.0


In [88]:
!pip install torchtext==0.4.0

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [89]:
import torchtext
import numpy as np
from data_loader import DataLoader

import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

batch_size = 128
num_epochs = 10

word_vec_size = 256
dropout_p = 0.3

hidden_size = 512
num_layers = 4

learning_rate = 0.001

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [90]:
loaders = DataLoader(train_fn='./sms.maxlen.uniq.shuf.train.tsv', 
                    batch_size=batch_size,
                    valid_ratio=.2,
                    device=-1,
                    max_vocab=999999,
                    min_freq=5,
                    )

test_loader = DataLoader(train_fn='./sms.maxlen.uniq.shuf.test.tsv', 
                    batch_size=batch_size,
                    valid_ratio=.01,
                    device=-1,
                    max_vocab=999999,
                    min_freq=5,
                    )

In [91]:
print("|train| =", len(loaders.train_loader.dataset),
     "|valid| =", len(loaders.valid_loader.dataset))

vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)
print("|vocab| =", vocab_size, "|classes| =", num_classes)

|train| = 3722 |valid| = 930
|vocab| = 1563 |classes| = 2


In [92]:
n = 3
for i, data in enumerate(loaders.train_loader):
    labels = data.label
    texts = data.text
    
    if i > n:
        break
    
    print("[%d]" %i)
    print("한 번에 로드되는 데이터 크기:", len(labels))
    
    for j in range(n):
        label = labels[j].numpy()
        text = texts[j].numpy()
        print("label:", label)
        print("text:", text.shape)
        

[0]
한 번에 로드되는 데이터 크기: 128
label: 1
text: (35,)
label: 0
text: (35,)
label: 0
text: (35,)
[1]
한 번에 로드되는 데이터 크기: 128
label: 0
text: (9,)
label: 0
text: (9,)
label: 0
text: (9,)
[2]
한 번에 로드되는 데이터 크기: 128
label: 0
text: (7,)
label: 0
text: (7,)
label: 0
text: (7,)
[3]
한 번에 로드되는 데이터 크기: 128
label: 0
text: (18,)
label: 0
text: (18,)
label: 0
text: (18,)


In [93]:
class RNN(nn.Module):
    def __init__(self,  input_size, word_vec_size, hidden_size, n_classes, num_layers=4, dropout_p=0.3):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        self.emb = nn.Embedding(input_size, word_vec_size)
        self.lstm = nn.LSTM(input_size=word_vec_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           dropout=dropout_p,
                           batch_first=True,
                           bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, num_classes)
        
        self.activation = nn.LogSoftmax(dim=-1)
        
    def forward(self, x):
        x = self.emb(x)
        x, _ = self.lstm(x)
        out = self.activation(self.fc(x[:, -1]))
        
        return out
    



In [94]:
model = RNN(input_size=vocab_size,
            word_vec_size=word_vec_size,
            hidden_size=hidden_size,
            n_classes=num_classes,
            num_layers=num_layers,
            dropout_p=dropout_p
)

In [95]:
def ComputeAccr(dloader, imodel):
    correct = 0
    total = 0
    
    model.eval()
    
    for i, data in enumerate(dloader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        output = model(texts)
        _, output_index = torch.max(output, 1)
        
        total += labels.size(0)
        correct += (output_index == labels).sum().float()
    
    model.train()
    return (100 * correct / total).numpy()

In [96]:
print("Accuracy of Test Data: %.2f" % ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data: 11.29


In [97]:
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [100]:
total_step = len(loaders.train_loader)

for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts = data.text.to(device)
        labels = data.label.to(device)
        
        print("[%d]" % i)
        
        outputs = model(texts)
        loss = loss_func(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 10 == 0:
            print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accr: {:.2f}"
                  .format(epoch+1, num_epochs, i+1, total_step,
                         loss.item(),
                         ComputeAccr(loaders.valid_loader, model)))

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [1/10], Step [10/30], Loss: 0.0141, Accr: 96.67
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [1/10], Step [20/30], Loss: 0.0005, Accr: 96.67
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [1/10], Step [30/30], Loss: 0.0005, Accr: 96.56
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [2/10], Step [10/30], Loss: 0.0003, Accr: 96.45
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [2/10], Step [20/30], Loss: 0.0010, Accr: 96.67
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [2/10], Step [30/30], Loss: 0.0069, Accr: 96.77
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [3/10], Step [10/30], Loss: 0.0008, Accr: 96.77
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [3/10], Step [20/30], Loss: 0.0000, Accr: 96.67
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [3/10], Step [30/30], Loss: 0.0006, Accr: 96.67
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [4/10], Step [10/30], Loss: 0.0010, Accr: 96.77


In [None]:
print("Accuracy of Valid Date: %.2f" % ComputeAccr(loaders.valid_loader, model))
netname = "./rnn_weight.pkl"
torch.save(model, netname)

model = torch.load(netname)

print("Accuracy of Valid Date: %.2f" % ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Date: 96.77
