In [1]:
import torch
from torchtext.legacy import data
from torchtext.legacy import datasets

# Text field: describes how raw review text is tokenized and padded.
TEXT = data.Field(
    tokenize=str.split,   # tokenizer to use; here Python's str.split (plain whitespace split)
    fix_length=500,       # fixes the sentence length up front
    pad_first=True,       # sentences shorter than fix_length are padded at the front
    pad_token='[PAD]',    # token used when padding
    unk_token='[UNK]',    # special token standing in for tokens missing from the vocab
    batch_first=True,     # put the batch size on the first axis of the data shape
)

# Label field: dtype sets the type of the loaded label data.
LABEL = data.LabelField(dtype=torch.float)

# IMDB train / test splits, wired to the two fields above.
train_data, test_data = datasets.IMDB.splits(label_field=LABEL, text_field=TEXT)

In [2]:
# Report how many examples each split holds.
for split_name, split in (("Train", train_data), ("Test", test_data)):
    print(f"{split_name} Data Length : {len(split.examples)}")

# Mapping of field name -> Field object attached to the training set.
print(train_data.fields)

# Show a single training example: its tokens re-joined into a sentence,
# its label, and finally the raw token list.
sample = vars(train_data.examples[1])
print('--- Data Sample ---')
print('Input : ')
print(' '.join(sample['text']), '\n')
print(f'Label : {sample["label"]}')
print(sample['text'], '\n')

Train Data Length : 25000
Test Data Length : 25000
{'text': <torchtext.legacy.data.field.Field object at 0x10d7790a0>, 'label': <torchtext.legacy.data.field.LabelField object at 0x10d779100>}
--- Data Sample ---
Input : 
Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV's "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina's pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inhabited by a blind priest on the top floor), things really start cooking. The neighbors, including a fantastically wicked Burgess Meredith and kinky couple Sylvia Miles & Beverly D'Angelo, are a diabolical lot, and Eli Wallach is great fun as a wily police detective. The movie is nearly a cross-pollination of "Rosemary's Baby" and "T

In [3]:
import re

def PreProcessingText(input_sentence):
    """Normalize a raw review string before it is split into tokens.

    The text is lowercased, HTML-like tags (e.g. ``<br />``) are replaced by a
    space, every ASCII punctuation character except the apostrophe is replaced
    by a space, and runs of whitespace are collapsed into a single space.

    Args:
        input_sentence: The text to clean (``str``).

    Returns:
        The cleaned string. Leading/trailing single spaces are kept, so callers
        normally follow up with ``.split()``. An empty input yields ``""``
        (never ``None``), so ``.split()`` on the result is always safe.
    """
    input_sentence = input_sentence.lower()
    # Replace tags such as <br /> with a space.
    input_sentence = re.sub(r'<[^>]*>', ' ', input_sentence)
    # Replace punctuation (apostrophe excluded) with a space. Raw string with
    # '-', '[', '\' and ']' escaped explicitly so each one is matched literally.
    input_sentence = re.sub(r'[!"#$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~]', ' ', input_sentence)
    # Collapse consecutive whitespace into one space.
    input_sentence = re.sub(r'\s+', ' ', input_sentence)
    return input_sentence

# Clean every example of both splits in place: re-join the tokens into one
# sentence, normalize it with PreProcessingText, then split on whitespace again.
for split_data in (train_data, test_data):
    for example in split_data.examples:
        example_fields = vars(example)
        example_fields['text'] = PreProcessingText(' '.join(example_fields['text'])).split()

In [4]:
# Build the text vocabulary with pre-trained vectors.
# NOTE(review): this runs on the full train_data, before the train/validation
# split performed in a later cell, so tokens from the future validation
# examples also contribute to the vocabulary.
TEXT.build_vocab(
    train_data,
    vectors='glove.6B.300d',  # pre-trained vectors used to set up the vocab
    min_freq=2,               # minimum number of occurrences required for a token to enter the vocab
    max_size=None,            # limit on the vocab size (None here)
)

# Build the label vocabulary from the same data.
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:48, 5.11MB/s]                               
100%|█████████▉| 399999/400000 [00:24<00:00, 16132.83it/s]


In [5]:
# Text vocabulary: its size and the first ten token -> index entries.
print(f"Vocab size : {len(TEXT.vocab)}")
print('Vocab Examples :')
for _, (token, token_index) in zip(range(10), TEXT.vocab.stoi.items()):
    print('\t', token, token_index)
print('---------------------')

# Label vocabulary: its size and every label -> index entry.
print(f'Label Size : {len(LABEL.vocab)}')
print('Label Examples : ')
for label, label_index in LABEL.vocab.stoi.items():
    print('\t', label, label_index)

Vocab size : 51956
Vocab Examples :
	 [UNK] 0
	 [PAD] 1
	 the 2
	 and 3
	 a 4
	 of 5
	 to 6
	 is 7
	 in 8
	 it 9
---------------------
Label Size : 2
Label Examples : 
	 neg 0
	 pos 1


In [6]:
import random

# Seed the global RNG explicitly, then pass its state to split().
# random.seed() returns None, so writing random_state=random.seed(0) hands
# None to split(); the split was reproducible only through the side effect of
# seeding. NOTE(review): random_state is expected to be a random.getstate()
# value per the torchtext legacy docs — confirm against the installed version.
random.seed(0)

# 80% of the examples stay in train_data, the remainder becomes valid_data.
# train_data is rebound here, so re-running this cell splits the already
# reduced training set again; re-run from the data-loading cell instead.
train_data, valid_data = train_data.split(split_ratio=0.8,
                                          random_state=random.getstate())

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, each yielding batches of 30 examples on DEVICE.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    datasets=(train_data, valid_data, test_data), batch_size=30, device=DEVICE)