In [1]:
from google.colab import auth
auth.authenticate_user()

from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
from pathlib import Path

folder = "colab/movie_pred_git" ## 자기 드라이브 경로 입력

base_path = Path("/content/gdrive/My Drive/")
project_path = base_path / folder
os.chdir(project_path)
for x in list(project_path.glob("*")):
    if x.is_dir():
        dir_name = str(x.relative_to(project_path))
        os.rename(dir_name, dir_name.split(" ", 1)[0])
print(f"{os.getcwd()}")

/content/gdrive/My Drive/colab/movie_pred_git


# 패키지, 데이터 로딩, 하이퍼 파라미터 설정

In [0]:
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import torch.nn as nn
import torch
import pickle

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [0]:
with open ('data/train.txt', 'rb') as f:
    train_data = pickle.load(f)
with open ('data/test.txt', 'rb') as f:
    test_data = pickle.load(f)

In [0]:
### 하이퍼 파라미터 세팅
batch_size = 100
learning_rate = 0.0005
num_epochs = 30

# max_len = 50
embed_dim = 300
hid_dim = 300

print_every = 250
example_every = 1000
save_every = 10

# 딕셔너리, 데이터셋 및 bi-lstm 모델 구현

In [0]:
class Dictionary(object):
    
    def __init__(self, train_data):
        self.word2ix = {'<pad>': 0, '<UNK>':1} ## <pad>: 패딩, <UNK>: unknown
        self.ix2word = {0: '<pad>', 1:'<UNK>'}
        self.wordcnt = dict()      
        self.cnt = 1
        
        print('Making dict..')
        for i in train_data["document"]:
            for word in i.split():
                if word not in self.word2ix:
                    self.word2ix[word] = self.cnt
                    self.ix2word[self.cnt] = word
                    self.wordcnt[word] = 0
                    self.cnt += 1
                else:
                    self.wordcnt[word] += 1  
        print("Complete")

In [7]:
train_dict = Dictionary(train_data)

Making dict..
Complete


In [0]:
class MovieData(Dataset):
    
    def __init__(self, data, dictionary, max_len=50):
        super(MovieData, self).__init__()
        self.dataset = data
        self.dictionary = dictionary
        self.max_len = max_len
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, ix):
        text = [self.dictionary.word2ix[word] if word in self.dictionary.word2ix.keys() else 1 for word in self.dataset["document"][ix].split()]
        label = self.dataset["label"][ix]
        text_len = len(text)
        if len(text) < self.max_len:
            text += [0 for i in range(self.max_len - len(text))]
        elif len(text) > self.max_len:
            text = text[:self.max_len]
            text_len = self.max_len    
        
        text = torch.LongTensor(text)
        return text, text_len, label

In [0]:
class Bi_LSTM(nn.Module):
    
    def __init__(self, num_text, embed_dim, hid_dim, fc_hid_dim = 300, max_len = 50,
                 dropout=0.1, num_layers=1, batch_first=True, bidirectional=True):
        
        super(Bi_LSTM, self).__init__()
        

        self.text_embed = nn.Embedding(num_text, embed_dim)
        
        self.lstm = nn.LSTM(embed_dim, hid_dim, dropout=dropout, num_layers=num_layers, batch_first=batch_first, bidirectional=bidirectional)

        self.fc_dim = hid_dim * 2 if bidirectional else hid_dim * 1
        self.fc = nn.Sequential(nn.Linear(self.fc_dim, fc_hid_dim), nn.Linear(fc_hid_dim, 2))
        
    def forward(self, text, text_len):
        text = self.text_embed(text)

        text = nn.utils.rnn.pack_padded_sequence(text, text_len, 
                                                batch_first=True, enforce_sorted=False)
        out, (hidden, cell) = self.lstm(text)

        out, out_len = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        
        for i in range(len(text_len)):
          out[i][-1] = out[i][text_len[i]-1]

        out = self.fc(out[:,-1,:])

        out = torch.sigmoid(out)
        return out

# 학습

In [0]:
train_dataset = MovieData(train_data, train_dict)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

In [11]:
model = Bi_LSTM(len(train_dict.word2ix), embed_dim, hid_dim).to(device)

  "num_layers={}".format(dropout, num_layers))


In [0]:
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)  

In [0]:
def save_checkpoint(epoch, model, optimizer):
  torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
            }, "model/model_epoch"+str(epoch) + ".tar")

In [0]:
def train(num_epochs, model, data_loader, criterion, optimizer):
    print('Training..')
    for epoch in range(num_epochs):
        for i, (texts, text_lens, labels) in enumerate(data_loader):

            texts, text_lens, labels = texts.to(device), text_lens.to(device), labels.to(device)
            outputs = model(texts, text_lens)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step() 

            ## accuracy 계산
            _, argmax = torch.max(outputs, 1)
            accuracy = (labels == argmax).float().mean()

            if (i+1) % print_every == 0: ## 상태 출력
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'.format(
                    epoch+1, num_epochs, i+1, len(data_loader), loss.item(), accuracy.item() * 100))
                
            if (i+1) % example_every == 0: ## csv 파일로 현재 모델 출력 표시
              if not os.path.exists("example"):
                os.makedirs("example")
              ex = pd.DataFrame()
              ex["label"] = labels.to('cpu')
              ex["pred"] = argmax.to('cpu')
              text_lens=text_lens.to('cpu')

              tmp_text = []
              for j in range(batch_size):
                tmp_txt = ""
                for k in range(text_lens[j]):
                  tmp_txt += train_dict.ix2word[texts.tolist()[j][k]]+" "
                tmp_text.append(tmp_txt)
              ex["document"] = tmp_text
              ex.to_csv("example/example_epoch"+str(epoch+1)+"_iter"+str(i)+".csv", mode='w')
              print("example saved!")
            
        ## checkpoint 저장
        if (epoch+1) % save_every == 0:
           if not os.path.exists("model"):
             os.makedirs("model")
           save_checkpoint(epoch+1, model, optimizer)
           print("checkpoint saved!")

In [15]:
train(num_epochs, model, train_loader, criterion, optimizer)

Training..
Epoch [1/30], Step [250/1487], Loss: 0.5328, Accuracy: 75.00%
Epoch [1/30], Step [500/1487], Loss: 0.5248, Accuracy: 77.00%
Epoch [1/30], Step [750/1487], Loss: 0.4827, Accuracy: 81.00%
Epoch [1/30], Step [1000/1487], Loss: 0.4690, Accuracy: 83.00%
example saved!
Epoch [1/30], Step [1250/1487], Loss: 0.5065, Accuracy: 78.00%
Epoch [2/30], Step [250/1487], Loss: 0.4292, Accuracy: 89.00%
Epoch [2/30], Step [500/1487], Loss: 0.3980, Accuracy: 91.00%
Epoch [2/30], Step [750/1487], Loss: 0.4976, Accuracy: 82.00%
Epoch [2/30], Step [1000/1487], Loss: 0.4168, Accuracy: 91.00%
example saved!
Epoch [2/30], Step [1250/1487], Loss: 0.4925, Accuracy: 81.00%
Epoch [3/30], Step [250/1487], Loss: 0.4363, Accuracy: 86.00%
Epoch [3/30], Step [500/1487], Loss: 0.4523, Accuracy: 85.00%
Epoch [3/30], Step [750/1487], Loss: 0.4113, Accuracy: 90.00%
Epoch [3/30], Step [1000/1487], Loss: 0.4155, Accuracy: 90.00%
example saved!
Epoch [3/30], Step [1250/1487], Loss: 0.3528, Accuracy: 96.00%
Epoch [4

# 테스트

In [0]:
def test(model, data_loader):
    print('Testing..')
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for i, (texts, text_lens, labels) in enumerate(data_loader):
            texts, text_lens, labels = texts.to(device), text_lens.to(device), labels.to(device)
            outputs = model(texts, text_lens)

            _, argmax = torch.max(outputs, 1)
            total += texts.size(0)
            correct += (labels == argmax).sum().item()

        print('Test accuracy for {} texts: {:.2f}%'.format(total, correct / total * 100))
    model.train() ## 다시 돌려놓기

In [0]:
test_dataset = MovieData(test_data, train_dict)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False)

In [18]:
checkpoint = torch.load("model/model_epoch30.tar")
model = Bi_LSTM(len(train_dict.word2ix), embed_dim, hid_dim).to(device)
model.load_state_dict(checkpoint['model_state_dict']) 

  "num_layers={}".format(dropout, num_layers))


<All keys matched successfully>

In [19]:
test(model, test_loader)

Testing..
Test accuracy for 49518 texts: 84.61%
