Minje Kim made a short but great [tutorial](https://github.com/mjc92/TorchTextTutorial/blob/master/01.%20Getting%20started.ipynb) on torchtext


In [1]:
import torch
from torchtext.vocab import Vocab
from torchtext import data
from konlpy.tag import Mecab
import re

In [12]:
train_file_path = './data/naver_train.txt'
test_file_path = './data/naver_test.txt'

In [13]:
mecab = Mecab()

In [14]:
# Broader alternative that would keep the whole jamo range ㄱ-ㅣ as well:
# hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
# Active pattern: matches runs of characters that are NOT a space, the jamo
# ㅋ / ㅎ, or a complete Hangul syllable (가-힣).
hangul = re.compile('[^ ㅋㅎ가-힣]+')


def clean(sentence):
    """Return ``sentence`` with every run matched by ``hangul`` removed.

    Spaces, the jamo ㅋ and ㅎ, and complete Hangul syllables are kept;
    all other characters are deleted.
    """
    return re.sub(hangul, '', sentence)
clean('안녕하세요 반갑습니다.?ㄴㅇㄷㅋㅎ ')

'안녕하세요 반갑습니다ㅋㅎ '

In [15]:
def mecab_tokenizer(sentence):
    """Split ``sentence`` into the word forms reported by ``mecab.pos``.

    ``mecab.pos`` yields ``(word, pos)`` pairs; the tag is discarded and the
    words are returned as a list in the order they were produced.
    """
    return [morpheme for morpheme, _tag in mecab.pos(sentence)]

In [16]:
mecab_tokenizer('안녕하세요 반갑습니다.')

['안녕', '하', '세요', '반갑', '습니다', '.']

In [17]:
def tokenizer(sentence):
    """Strip unwanted characters with ``clean``, then tokenize with ``mecab_tokenizer``."""
    return mecab_tokenizer(clean(sentence))

In [18]:
tokenizer('안녕하세요 반갑습니다.?ㄴㅇㄷㅎㅎㅋ ')

['안녕', '하', '세요', '반갑', '습니다', 'ㅎ', 'ㅎ', 'ㅋ']

In [19]:
import pandas as pd

Reading a **TSV** file with `pd.read_csv` and `delimiter='\t'`

In [20]:
# Load the tab-separated test file; keep_default_na=False stops pandas from
# turning strings such as "NA" or empty cells into NaN.
df = pd.read_csv(test_file_path, sep='\t', keep_default_na=False)
# Preview the first ten rows.
df.head(10)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0
5,7898805,"음악이 주가 된, 최고의 음악영화",1
6,6315043,진정한 쓰레기,0
7,6097171,"마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다",0
8,8932678,갈수록 개판되가는 중국영화 유치하고 내용없음 폼잡다 끝남 말도안되는 무기에 유치한c...,0
9,6242223,"이별의 아픔뒤에 찾아오는 새로운 인연의 기쁨 But, 모든 사람이 그렇지는 않네..",1


### Define Field

In [21]:
text_field = data.Field(tokenize=tokenizer, sequential=True)
label_field = data.Field(sequential=False, use_vocab=False, postprocessing=data.Pipeline(int))

### Filter out invalid examples (rows)
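`filter_pred`: Use only examples for which `filter_pred(ex)` is `True`, or use all examples if `None`. Default: `None`. The predicate defined below keeps an example only when its label is `'0'` or `'1'` and its tokenized text has more than one token; this also drops the header row, whose label is the literal string `label`.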

In [22]:
def filter_pred(example):
    """Return True only for examples worth keeping.

    An example is kept when its ``label`` is the string '0' or '1' and the
    length of its ``text`` is greater than 1; otherwise False is returned.
    """
    has_valid_label = example.label in ['0', '1']
    # Short-circuits, so ``text`` is only inspected when the label is valid.
    return has_valid_label and len(example.text) > 1

In [23]:
def _load_naver_split(path):
    """Build a ``TabularDataset`` from the TSV file at ``path``.

    Columns are declared in the order id / text / label. The id column is
    paired with ``None`` instead of a field, and ``filter_pred`` decides which
    rows are kept.
    """
    columns = [
        ('id', None),
        ('text', text_field),
        ('label', label_field),
    ]
    return data.TabularDataset(
        path=path,
        format='tsv',
        fields=columns,
        filter_pred=filter_pred,
    )


# Both splits share the same column layout and row filter.
naver_train = _load_naver_split(train_file_path)
naver_test = _load_naver_split(test_file_path)

In [None]:
naver_test[:5]

In [None]:
len(naver_train)

In [None]:
naver_test.examples[0]

In [None]:
vars(naver_train.examples[0])

In [None]:
vars(naver_test.examples[0])

In [None]:
vars(naver_test.examples[10])

### Build vocabulary


In [24]:
text_field.build_vocab(naver_train)

In [26]:
vocab = text_field.vocab

In [None]:
len(vocab)

In [25]:
vocab.freqs

NameError: name 'vocab' is not defined

In [None]:
vocab.itos[10]

In [None]:
vocab.stoi['생활']

### Use frequent words only

In [None]:
frequent_vocab = Vocab(counter=vocab.freqs, min_freq=5)

In [None]:
len(frequent_vocab)

In [27]:
# Iterator over the training set, 10 examples per batch.
train_iter = data.Iterator(
    dataset=naver_train,
    batch_size=10,
    train=True,  # training set => repeat and shuffle
    sort_key=lambda example: len(example.text),  # text length is the sort key
    device=-1,  # CPU: -1
)

In [28]:
# Iterator over the test set, 10 examples per batch, with sorting disabled.
test_iter = data.Iterator(
    dataset=naver_test,
    batch_size=10,
    train=False,  # not a training iterator
    sort=False,
    device=-1,  # CPU: -1
)

In [None]:
vars(next(iter(test_iter)))

In [29]:
a = next(iter(train_iter))

In [30]:
a

<torchtext.data.batch.Batch at 0x11a056f28>

In [31]:
vars(a)

{'batch_size': 10,
 'dataset': <torchtext.data.dataset.TabularDataset at 0x1134a34e0>,
 'label': Variable containing:
  0
  1
  1
  0
  1
  1
  1
  1
  1
  0
 [torch.LongTensor of size 10],
 'text': Variable containing:
    495   1327    413    957   6830   4459    196   5635     22    725
   1232      2     34     86     29   9826    115      7      5    118
    722     58   1551      4     55      9     62    115   1166    746
     10  35751     80    103     35    127   9453    440     74      3
     20      1      2    154     25   1300      4      2   5663    341
   1323      1    467      9   7411      2    142    632     74      6
    267      1     29      4    297   1053    597     16    289    184
   4314      1     34    742     81     18      1      7    194    118
      1      1    496     82     37     18      1    115      1    746
      1      1  10301      9     24     48      1    257      1      3
      1      1     74   1571     13    283      1      8      1    436

In [33]:
a.text

Variable containing:
   495   1327    413    957   6830   4459    196   5635     22    725
  1232      2     34     86     29   9826    115      7      5    118
   722     58   1551      4     55      9     62    115   1166    746
    10  35751     80    103     35    127   9453    440     74      3
    20      1      2    154     25   1300      4      2   5663    341
  1323      1    467      9   7411      2    142    632     74      6
   267      1     29      4    297   1053    597     16    289    184
  4314      1     34    742     81     18      1      7    194    118
     1      1    496     82     37     18      1    115      1    746
     1      1  10301      9     24     48      1    257      1      3
     1      1     74   1571     13    283      1      8      1    436
     1      1     18    397     75     62      1    632      1    119
     1      1    180    350    300      2      1     16      1      6
     1      1    413     44    547    201      1      7      1    429