In [1]:
max_length = 256 # sms 최대 길이

# 1. 데이터 불러오기

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('sms.tsv', sep='\t',)
print(df.columns)
print(df.shape)

Index(['label', 'sms'], dtype='object')
(5572, 2)


In [4]:
df.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# 클래스 파악
classes = sorted(set(df['label'])) # 정렬하여 클래스에 넣기
class_to_idx = {}

for i, c in enumerate(classes):
    class_to_idx.update({c:i}) # 딕셔너리
    
nclass = len(classes)

print("# of classes: %d"%nclass)
print(classes)
print(class_to_idx)

# of classes: 2
['ham', 'spam']
{'ham': 0, 'spam': 1}


# 2. 새로운 Data Frame
## 1) 'label, sms' 만 남기기
## 2) 최대 텍스트 길이 만큼 자르기 
 - 성별, 가사만 남기려면?


In [6]:
new_df = pd.DataFrame({'label': df['label'], 
                      'sms':df['sms'].str.slice(# 최대 가사 텍스트 만큼 자르기
                      start = 0, stop=max_length)})
# sms 데이터에는 필요 없음

## 3) 중복 제거

In [7]:
len(new_df)

5572

In [8]:
new_df = pd.DataFrame(new_df.drop_duplicates())

In [9]:
len(new_df)

5169

## 4) 셔플

In [10]:
df_shuffled = new_df.sample(frac=1).reset_index(drop=True)
df_shuffled.head()

Unnamed: 0,label,sms
0,ham,Good. Good job. I like entrepreneurs
1,ham,Omg it could snow here tonite!
2,ham,I thk 530 lor. But dunno can get tickets a not...
3,ham,"Yeah we wouldn't leave for an hour at least, h..."
4,ham,Aiyo... U always c our ex one... I dunno abt m...


## 5) train, test 나누기

In [11]:
# train: test = 9:1
# train: test = 540:60 -> train: valid:test = 432:108:60
train_ratio = 0.9

# train dataset
s,e = 0, int(df_shuffled.shape[0] * train_ratio) #train 데이터의 start, end 포인트 정의
df_train = pd.DataFrame({'label': df_shuffled['label'][s:e],
                         'sms':df_shuffled['sms'][s:e]})
print("index for train : %d~%d"%(s,e))

# test dataset
s, e = e, e+int(df_shuffled.shape[0] * (1.0 - train_ratio))
print("index for test: %d~%d"%(s,e))
df_test = pd.DataFrame({'label': df_shuffled['label'][s:e],
                       'sms': df_shuffled['sms'][s:e]})

index for train : 0~4652
index for test: 4652~5168


## 6) 저장

In [12]:
df_train.to_csv("./sms.maxlen.uniq.shuf.train.tsv",
               header = False, index = False, sep='\t')
df_test.to_csv("./sms.maxlen.uniq.shuf.test.tsv",
               header = False, index = False, sep='\t')


# RNN + SMS 구현

In [None]:
# !pip install torch

## 0.1 라이브러리 임포트

In [13]:
## 0.1 라이브러리 임포트
import torch
import torch.nn as nn
import torchvision.datasets as dset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from torch.autograd import Variable
import numpy as np

## 0.2 하이퍼파라미터 셋팅


In [14]:
# hyper-parameters
batch_size = 128
num_epochs = 10

word_vec_size = 256
dropout_p = 0.3

hidden_size = 512
num_layers = 4

## yhk 추가
learning_rate = 0.001 # 디폴트 0.001

In [15]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  return torch._C._cuda_getDeviceCount() > 0


## 1. SMS train, test dataset 가져오기

In [16]:
from data_loader import DataLoader

In [17]:
loaders = DataLoader(
    train_fn = './sms.maxlen.uniq.shuf.train.tsv',
    batch_size = batch_size,
    valid_ratio = .2,
    device = -1, # cpu
    max_vocab = 999999, # 최대한 크게 설정
    min_freq=5, # time step 갯수, 단어가 5개 초과되는 문장만 보겠다
)

In [18]:
test_loaders = DataLoader(
    train_fn = './sms.maxlen.uniq.shuf.test.tsv',
    batch_size = batch_size,
    valid_ratio = .01, # 0 으로 하고 싶으셨음
    device = -1,
    max_vocab = 999999,
    min_freq = 5
)

## 2. 대략적인 데이터 형태

In [19]:
print("|train| = ", len(loaders.train_loader.dataset),
     "|valid| = ", len(loaders.valid_loader.dataset))

vocab_size = len(loaders.text.vocab)
num_classes = len(loaders.label.vocab)
print("|vocab| = ", vocab_size, "|classes| = ", num_classes )

|train| =  3722 |valid| =  930
|vocab| =  1545 |classes| =  2


## 3. 데이터 로드함수

학습시킬 때 batch_size 단위로 끊어서 로드하기 위함

### 데이터 로드함수 이해하기

In [20]:
n = 3 # 샘플로 출력할 데이터 개수
for i, data in enumerate(loaders.train_loader):
    labels = data.label
    texts = data.text
    
    if i>n :
        break
    print("[%d]"%i)
    print("한 번에 로드되는 데이터 크기: ", len(labels))
    
    # 출력
    for j in range(n):
        label = labels[j].numpy() # tensor -> numpy 로 변환
        text = texts[j].numpy()
        print("label: ", label)
        print("text: ", text.shape)
        
        

[0]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (16,)
label:  0
text:  (16,)
label:  0
text:  (16,)
[1]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (28,)
label:  1
text:  (28,)
label:  1
text:  (28,)
[2]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (11,)
label:  0
text:  (11,)
label:  0
text:  (11,)
[3]
한 번에 로드되는 데이터 크기:  128
label:  0
text:  (9,)
label:  0
text:  (9,)
label:  0
text:  (9,)


## 4. 모델 선언

In [38]:
# Reccurent neural network (many to one)
class RNN(nn.Module):
    def __init__(self,
                 input_size, # vocab_size
                 word_vec_size, # word embedding vector 차원
                 hidden_size, # bidirectional LSTM의 hidden state , cell state
                 n_classes,
                 num_layers = 4, # 쌓을 레이어 개수
                 dropout_p = 0.3
                 ):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.word_vec_size = word_vec_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        self.num_layers = num_layers
        self.dropout_p = dropout_p
        
        # 입력 차원(vocab_size), 출력 차원(word_vec_size)
        self.emb = nn.Embedding(input_size, word_vec_size) # 부터
        
        self.lstm = nn.LSTM(input_size = word_vec_size,
                           hidden_size = hidden_size,
                           num_layers = num_layers,
                           dropout = dropout_p, #얼만큼 끌지, 디폴트 0
                           batch_first = True,
                           bidirectional = True)
        
        self.fc = nn.Linear(hidden_size *2, num_classes)
        
        # LogSoftmax + NLLLoss instead of Softmax + CrossEntropy
        
        self.activation = nn.LogSoftmax(dim = -1) # 마지막 차원에 softmax 씌워줌
        
    def forward(self, x):
        # 빈칸 뚫기 좋음
        # x: (batch_size, length)
        x = self.emb(x)

        # x: (batch_size, length, word_vec_size)
        x, _ = self.lstm(x) # x: output, _ : 마지막 time step의 fidden state % call state

        # x: (batch_size, length, hidden_size *2)
        # x[:,-1]: (batch_size, 1, hidden_size *2)
        out = self.activation(self.fc(x[:,-1])) # 마지막 time step
        # self.fc(x[:,-1]) : (batch_size, num_classes)


        return out

In [39]:
model = RNN(input_size = vocab_size, 
           word_vec_size = word_vec_size,
           hidden_size = hidden_size,
           n_classes = num_classes,
           num_layers = num_layers,
           dropout_p = dropout_p)

In [47]:
def ComputeAccr(dloader, imodel):
    correct = 0
    total = 0
    
    model.eval()
    
    for i, data in enumerate(dloader): # batch_size 만큼
        texts = data.text.to(device) # (batch_size, length)
        labels = data.label.to(device)
        
        # Forward prop
        output = model(texts) # (batch_size, num_classes)
        _, output_index = torch.max(output,1) # (batch_size, 1)
        
        total += labels.size(0)
        correct += (output_index == labels).sum().float()
        
        
        model.train()
        return (100*correct/total).numpy()

In [48]:
print("Accuracy of Test Data : %.2f"%ComputeAccr(loaders.valid_loader, model))

Accuracy of Test Data : 7.81


## 5. loss, optimizer

In [49]:
loss_func = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## 6. 학습

In [52]:
total_step = len(loaders.train_loader)

for epoch in range(num_epochs):
    for i, data in enumerate(loaders.train_loader):
        texts = data.text.to(device) # (batch_size, length)
        labels = data.label.to(device) # (batch_size, num_classes)
        
        print("[%d]"%i)
        
        # Forward prop
        
        outputs = model(texts)
        loss = loss_func(outputs, labels)
        
        # Backward prop. & optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) %10 == 0 :
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accr: {:.2f}'
                  .format(epoch+1, num_epochs, i+1, total_step,
                          loss.item(),
                          ComputeAccr(loaders.valid_loader, model)))

[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [1/10], Step [10/30], Loss: 0.1693, Accr: 97.66
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [1/10], Step [20/30], Loss: 0.1605, Accr: 97.66
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [1/10], Step [30/30], Loss: 0.8558, Accr: 97.66
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [2/10], Step [10/30], Loss: 0.7626, Accr: 97.66
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [2/10], Step [20/30], Loss: 0.3933, Accr: 97.66
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [2/10], Step [30/30], Loss: 0.2604, Accr: 97.66
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [3/10], Step [10/30], Loss: 0.0873, Accr: 97.66
[10]
[11]
[12]
[13]
[14]
[15]
[16]
[17]
[18]
[19]
Epoch [3/10], Step [20/30], Loss: 0.1825, Accr: 97.66
[20]
[21]
[22]
[23]
[24]
[25]
[26]
[27]
[28]
[29]
Epoch [3/10], Step [30/30], Loss: 0.5478, Accr: 97.66
[0]
[1]
[2]
[3]
[4]
[5]
[6]
[7]
[8]
[9]
Epoch [4/10], Step [10/30], Loss: 0.2131, Accr: 97.66


## 7. 테스트

In [53]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Data: 97.66


## 8. 학습된 파라미터 저장

In [55]:
netname = './rnn_weight.pkl'
torch.save(model, netname,)

## 9. 학습된 파라미터 로드

실무에서 학습된 파라미터 로드하고 싶다면 5,6,8 과정 생략한 채 실행

In [56]:
netname = './rnn_weight.pkl'
model = torch.load(netname)

In [57]:
print("Accuracy of Valid Data: %.2f" %ComputeAccr(loaders.valid_loader, model))

Accuracy of Valid Data: 97.66
