In [1]:
import torch
from torch import nn, optim

In [9]:
# Embedding lookup demo: a table of 10000 rows, each a 20-dimensional vector.
# Row 0 is the padding entry (padding_idx=0).
# Author's note: cf. Word2Vec (CBOW, Skip-Gram).
emb = nn.Embedding(num_embeddings=10000, embedding_dim=20, padding_idx=0)
# Integer ids select rows of the table by index; a repeated id (2 here)
# yields the same row twice.
inp = torch.tensor([1, 2, 5, 2, 10], dtype=torch.long)
out = emb(inp)

In [11]:
import glob
import pathlib
import re

In [20]:
# Text stripped before tokenizing: the punctuation characters , . ( ) [ ] * : ;
# and anything that looks like an HTML tag (<...>, matched non-greedily).
# '!' and '?' are deliberately not matched here.
# Inside a character class only the brackets need escaping. The previous
# pattern used '|' where escapes were intended, which closed the class at the
# first ']' so plain punctuation was never removed.
remove_marks_regex = re.compile(r"[,.()\[\]*:;]|<.*?>")
# Captures a single '?' or '!' as group 1 so a substitution can refer to it
# with the backreference \1.
shift_marks_regex = re.compile("([?!])")

In [22]:
def text2ids(text, vocab_dict):
    """Convert raw text into a list of vocabulary ids.

    Args:
        text: Input string to tokenize.
        vocab_dict: Mapping from token string to integer id.

    Returns:
        A list of ints, one per whitespace-separated token. Tokens that are
        not keys of ``vocab_dict`` are mapped to 0.
    """
    # Delete everything matched by remove_marks_regex ('!' and '?' are kept).
    text = remove_marks_regex.sub("", text)
    # Put a space on both sides of each '!' / '?' so it becomes its own token.
    # The replacement has to be the backreference r" \1 "; the earlier
    # r" |1 " inserted the literal text "|1" and discarded the mark itself.
    text = shift_marks_regex.sub(r" \1 ", text)

    tokens = text.split()

    return [vocab_dict.get(token, 0) for token in tokens]

def list2tensor(token_idxes, max_len = 100, padding = True):
    """Turn a list of token ids into an int64 tensor of bounded length.

    Args:
        token_idxes: List of integer token ids.
        max_len: Maximum number of ids kept; longer inputs are cut off.
        padding: When true, the result is right-filled with 0 up to
            ``max_len`` entries.

    Returns:
        A tuple ``(tensor, n_tokens)`` where ``n_tokens`` is the number of
        ids kept after truncation and before any padding.
    """
    # Clip to at most max_len ids; shorter inputs pass through unchanged.
    clipped = token_idxes if len(token_idxes) <= max_len else token_idxes[:max_len]
    n_tokens = len(clipped)

    if padding:
        pad_count = max_len - n_tokens
        clipped = clipped + [0] * pad_count

    return torch.tensor(clipped, dtype = torch.int64), n_tokens

        
        

In [24]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, DataLoader, TensorDataset)

In [27]:
class IMDBDataset(Dataset):
    """Dataset over a directory of labeled review text files.

    Layout read from ``dir_path``::

        imdb.vocab                      one vocabulary word per line
        train/pos/*.txt  train/neg/*.txt
        test/pos/*.txt   test/neg/*.txt

    Each item is ``(token_ids, label, n_tokens)`` with label 1 for files
    under ``pos`` and 0 for files under ``neg``.
    """

    def __init__(self, dir_path, train = True, max_len = 100, padding = True):
        """Load the vocabulary and index the review files.

        Args:
            dir_path: Root directory of the dataset.
            train: Use the ``train`` subdirectory when true, else ``test``.
            max_len: Passed to ``list2tensor`` for every item.
            padding: Passed to ``list2tensor`` for every item.
        """
        self.max_len = max_len
        self.padding = padding

        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath("imdb.vocab")

        # read_text closes the file for us. The encoding is explicit so that
        # decoding does not depend on the platform locale.
        # NOTE(review): assumes the dataset files are UTF-8 - confirm.
        self.vocab_array = vocab_path.read_text(encoding = "utf-8").strip().splitlines()

        # Word ids start at 1; id 0 is what text2ids' caller gets for unknown
        # words (vocab_dict has no entry mapping to 0).
        self.vocab_dict = dict((w, i + 1) for (i, w) in enumerate(self.vocab_array))

        if train :
            target_path = path.joinpath("train")
        else :
            target_path = path.joinpath("test")

        pos_files = sorted(glob.glob(str(target_path.joinpath("pos/*.txt"))))
        neg_files = sorted(glob.glob(str(target_path.joinpath("neg/*.txt"))))

        # Label pos as 1 and neg as 0, and build a list of
        # (label, file_path) tuples: all neg files first, then all pos files.
        self.labeled_files = [(0, f) for f in neg_files] + [(1, f) for f in pos_files]

    def vocab_size(self):
        """Return the number of words in the vocabulary file.

        Ids run from 1 to ``vocab_size()``, so an embedding table indexed by
        these ids needs ``vocab_size() + 1`` rows.
        """
        return len(self.vocab_array)

    def __len__(self):
        """Return the number of indexed review files."""
        return len(self.labeled_files)

    def __getitem__(self, idx):
        """Read, tokenize and tensorize the review at position ``idx``.

        Returns:
            ``(data, label, n_tokens)``: the first and third values are what
            ``list2tensor`` returns; ``label`` is the int 0 or 1.
        """
        label, f = self.labeled_files[idx]

        # Context manager so the handle is closed right away instead of
        # whenever the garbage collector gets to it.
        with open(f, encoding = "utf-8") as fh:
            data = fh.read().lower()

        data = text2ids(data, self.vocab_dict)

        data, n_tokens = list2tensor(data, self.max_len, self.padding)

        return data, label, n_tokens
        
    

In [28]:
class SequenceTaggingNet(nn.Module):
    """Embedding -> LSTM -> Linear network that emits one score per sequence.

    Args:
        num_embeddings: Number of rows in the embedding table (row 0 is the
            padding entry).
        embedding_dim: Size of each embedding vector.
        hidden_size: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers.
    """

    def __init__(self, num_embeddings, embedding_dim = 50, hidden_size = 50, num_layers = 1, dropout = 0.2):
        super().__init__()

        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx = 0)
        # nn.LSTM applies dropout only between stacked layers. With a single
        # layer a non-zero value has no effect and only raises a warning, so
        # pass 0 in that case.
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first = True,
                            dropout = dropout if num_layers > 1 else 0.0)

        self.linear = nn.Linear(hidden_size, 1)

    def forward(self,x, h0 = None, l = None):
        """Score each sequence in the batch.

        Args:
            x: Token ids, (batch_size, step_size).
            h0: Optional initial LSTM state, forwarded to ``self.lstm``.
            l: Optional per-sequence lengths; when given, the output at step
                ``l - 1`` of each sequence is used instead of the final step.
                Presumably an integer tensor of shape (batch_size,) - confirm
                against the caller.

        Returns:
            Tensor of shape (batch_size,) holding the linear layer's output.
        """
        # Convert ids to embedding vectors:
        # (batch_size, step_size) -> (batch_size, step_size, embedding_dim)
        x = self.emb(x)
        # (batch_size, step_size, embedding_dim)
        # -> (batch_size, step_size, hidden_dim)
        x, h = self.lstm(x, h0)

        # Pick one time step per sequence:
        # (batch_size, step_size, hidden_dim) -> (batch_size, hidden_dim)
        if l is not None :
            x = x[list(range(len(x))), l - 1, :]
        else :
            x = x[:, -1, :]

        # (batch_size, hidden_dim) -> (batch_size, 1)
        # The result must be kept: previously it was assigned to an unused
        # name and the raw hidden state was returned instead.
        x = self.linear(x)

        # (batch_size, 1) -> (batch_size, )
        # Squeeze only the last axis so a batch of one stays 1-dimensional.
        x = x.squeeze(-1)

        return x