Minje Kim made a short but great [tutorial](https://github.com/mjc92/TorchTextTutorial/blob/master/01.%20Getting%20started.ipynb) on torchtext


In [1]:
import torch
from torchtext.vocab import Vocab
from torchtext import data
from konlpy.tag import Mecab
import re

In [2]:
# Train and test data files, given relative to the notebook's working directory.
# Both are read as tab-separated files further down.
train_file_path = './datasets/naver_train.txt'
test_file_path = './datasets/naver_test.txt'

In [3]:
# Single konlpy Mecab tagger instance; mecab_tokenizer below calls its pos() method.
mecab = Mecab()

In [4]:
# Matches runs of characters that are NOT a space, the jamo ㅋ / ㅎ, or a
# complete Hangul syllable (가-힣).
# Broader alternative that keeps every jamo: re.compile('[^ ㄱ-ㅣ가-힣]+')
hangul = re.compile('[^ ㅋㅎ가-힣]+')


def clean(sentence):
    """Return `sentence` with every run matched by `hangul` removed."""
    return hangul.sub('', sentence)


clean('안녕하세요 반갑습니다.?ㄴㅇㄷㅋㅎ ')

'안녕하세요 반갑습니다ㅋㅎ '

In [5]:
def mecab_tokenizer(sentence):
    """Split `sentence` with the module-level `mecab` tagger.

    `mecab.pos` yields (surface, tag) pairs; only the surface strings are
    returned, in the order the tagger produced them.
    """
    return [surface for surface, _tag in mecab.pos(sentence)]

In [6]:
mecab_tokenizer('안녕하세요 반갑습니다.')

['안녕', '하', '세요', '반갑', '습니다', '.']

In [7]:
def tokenizer(sentence):
    """Clean `sentence` with `clean`, then tokenize the result with `mecab_tokenizer`."""
    return mecab_tokenizer(clean(sentence))

In [8]:
tokenizer('안녕하세요 반갑습니다.?ㄴㅇㄷㅎㅎㅋ ')

['안녕', '하', '세요', '반갑', '습니다', 'ㅎ', 'ㅎ', 'ㅋ']

In [9]:
import pandas as pd

Reading the **tsv** file with `pd.read_csv` and `delimiter='\t'`

In [10]:
# Peek at the test file as a DataFrame. keep_default_na=False is forwarded to
# read_csv so the default NA-string conversion is switched off.
df = pd.read_csv(
    test_file_path,
    delimiter='\t',
    keep_default_na=False,
)
df.head(10)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0
5,7898805,"음악이 주가 된, 최고의 음악영화",1
6,6315043,진정한 쓰레기,0
7,6097171,"마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다",0
8,8932678,갈수록 개판되가는 중국영화 유치하고 내용없음 폼잡다 끝남 말도안되는 무기에 유치한c...,0
9,6242223,"이별의 아픔뒤에 찾아오는 새로운 인연의 기쁨 But, 모든 사람이 그렇지는 않네..",1


### Define Field

In [11]:
# Review text: a sequential field tokenized by the custom `tokenizer` above.
text_field = data.Field(sequential=True, tokenize=tokenizer)

# Label: non-sequential, not looked up in a vocabulary (use_vocab=False);
# a Pipeline wrapping int is supplied as the postprocessing step.
label_field = data.Field(
    sequential=False,
    use_vocab=False,
    postprocessing=data.Pipeline(int),
)

### Filter out invalid examples (rows)
`filter_pred`: Use only examples for which filter_pred(ex) is True, or use all examples if None. Default: None

In [12]:
def filter_pred(example):
    """Return True only for examples worth keeping.

    An example is kept when its label is the string '0' or '1' and its
    tokenized text holds more than one token; everything else is rejected.
    """
    return example.label in ['0', '1'] and len(example.text) > 1

In [13]:
def load_naver_split(path):
    """Build a TabularDataset from the TSV file at `path`.

    Columns are mapped in order to ('id', 'text', 'label'); the id column is
    paired with None instead of a Field, and the other two use `text_field`
    and `label_field`. Only examples accepted by `filter_pred` are kept.
    """
    # A fresh fields list is built on every call, as in the two original
    # constructor invocations.
    column_fields = [
        ('id', None),
        ('text', text_field),
        ('label', label_field),
    ]
    return data.TabularDataset(
        path=path,
        format='tsv',
        fields=column_fields,
        filter_pred=filter_pred,
    )


naver_train = load_naver_split(train_file_path)
naver_test = load_naver_split(test_file_path)

In [14]:
naver_test[:5]

[<torchtext.data.example.Example at 0x117c17e10>,
 <torchtext.data.example.Example at 0x117c37550>,
 <torchtext.data.example.Example at 0x117c37080>,
 <torchtext.data.example.Example at 0x117c377b8>,
 <torchtext.data.example.Example at 0x116695898>]

In [15]:
len(naver_train)

146035

In [16]:
naver_test.examples[0]

<torchtext.data.example.Example at 0x117c17e10>

In [17]:
vars(naver_train.examples[0])

{'label': '0', 'text': ['아', '더', '빙', '진짜', '짜증', '나', '네요', '목소리']}

In [18]:
vars(naver_test.examples[0])

{'label': '1', 'text': ['굳', 'ㅋ']}

In [19]:
vars(naver_test.examples[10])

{'label': '0',
 'text': ['한국', '독립영화', '의', '한계', '그렇게', '아버지', '가', '된다', '와', '비교', '됨']}

### Build vocabulary


In [20]:
# Build text_field's vocabulary from the training split only (naver_test is not passed).
text_field.build_vocab(naver_train)

In [21]:
# Short alias for the vocabulary object attached to text_field by build_vocab.
vocab = text_field.vocab

In [22]:
len(vocab)

49357

In [23]:
vocab.freqs

Counter({'아': 8889,
         '더': 4919,
         '빙': 77,
         '진짜': 8355,
         '짜증': 1487,
         '나': 12916,
         '네요': 8974,
         '목소리': 370,
         '흠': 231,
         '포스터': 562,
         '보고': 627,
         '초딩': 420,
         '영화': 57648,
         '줄': 2912,
         '오버': 132,
         '연기': 6814,
         '조차': 285,
         '가볍': 248,
         '지': 18873,
         '않': 7727,
         '구나': 803,
         '너무': 11027,
         '재': 1760,
         '밓었': 1,
         '다': 54979,
         '그래서': 485,
         '보': 25386,
         '는': 66815,
         '것': 9263,
         '을': 29887,
         '추천': 1179,
         '한다': 1782,
         '교도소': 11,
         '이야기': 2199,
         '구먼': 34,
         '솔직히': 1074,
         '재미': 4182,
         '없': 16070,
         '평점': 6261,
         '조정': 44,
         '사이몬페그': 2,
         '의': 33615,
         '익살': 11,
         '스런': 222,
         '가': 33339,
         '돋보였': 60,
         '던': 5954,
         '스파이더맨': 58,
         '에서': 80

In [24]:
vocab.itos[10]

'가'

In [25]:
vocab.stoi['생활']

1395

### Use frequent words only

In [26]:
# Build a second Vocab from the same token counts, passing a minimum frequency.
# NOTE(review): nothing visible here attaches frequent_vocab to text_field, and
# the batches printed further down contain indices larger than
# len(frequent_vocab) - confirm whether the iterators were meant to use it.
MIN_TOKEN_FREQ = 5
frequent_vocab = Vocab(counter=vocab.freqs, min_freq=MIN_TOKEN_FREQ)

In [27]:
len(frequent_vocab)

15291

In [28]:
def _text_length(example):
    """Sort key for the iterator: number of tokens in the example's text."""
    return len(example.text)


# Training iterator. train=True marks the training set (repeat and shuffle);
# device=-1 selects the CPU.
train_iter = data.Iterator(
    dataset=naver_train,
    batch_size=10,
    sort_key=_text_length,
    train=True,
    device=-1,
)

In [29]:
# Test-split iterator: same batch size as train_iter, but constructed with
# sort=False and train=False.
test_iter = data.Iterator(
    dataset=naver_test,
    batch_size=10,
    sort=False,
    train=False,
    device=-1)  # -1 is the CPU setting, per the note on train_iter

In [30]:
vars(next(iter(test_iter)))

{'batch_size': 10,
 'dataset': <torchtext.data.dataset.TabularDataset at 0x11656aef0>,
 'label': Variable containing:
  1
  0
  0
  0
  1
  0
  0
  0
  1
  1
 [torch.LongTensor of size 10],
 'text': Variable containing:
    809     89     92     25    237    564   1046    534   2765    195
    162    117      7     76      2     15    458   1204      9     36
      1      2     18     24     59    111    681     52   1353    390
      1     62      3    418     10      1     44     10    581   7996
      1     17     48    241    159      1  13431      3     12    104
      1     11    234   2119     60      1     75    644  11045  36765
      1    899    138    131      9      1     15      4      3      1
      1    331    340    546    237      1   4628    230    777      1
      1     48    132     13      4      1   1707      7   3025      1
      1     40    151    527      1      1     21      6      9      1
      1     46     59     70      1      1      3     94   3830      1

In [31]:
vars(next(iter(train_iter)))

{'batch_size': 10,
 'dataset': <torchtext.data.dataset.TabularDataset at 0x116556128>,
 'label': Variable containing:
  0
  1
  0
  1
  1
  1
  1
  1
  0
  0
 [torch.LongTensor of size 10],
 'text': Variable containing:
    186   1412    278   2251   3202     31    520     58    267      4
     25    337    179    227    302    101     54     93   2750     93
   1627     13     19     14      9     67    520     39      1     39
    273   1882     29     45    127   1335    156    331      1   3437
     57      7    412     22    164    725     51     77      1   3698
     30    115      7     36      1    596      8    167      1      7
    642  11518    126    133      1     13     20   1271      1    305
    297    102    179     30      1     43      3     10      1      3
      5      2      1      7      1     26    523    361      1   8262
    781   1352      1      6      1     31    172     43      1      6
    642     64      1     14      1  12383   8172     64      1    230