In [1]:
! pip install sentencepiece



# IMDB 리뷰 토큰화하기

In [2]:
import sentencepiece as spm
import pandas as pd
import urllib.request
import csv

In [3]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")

('IMDb_Reviews.csv', <http.client.HTTPMessage at 0x7bf3e1dc1650>)

In [4]:
train_df = pd.read_csv('IMDb_Reviews.csv')
train_df['review']

Unnamed: 0,review
0,My family and I normally do not watch local mo...
1,"Believe it or not, this was at one time the wo..."
2,"After some internet surfing, I found the ""Home..."
3,One of the most unheralded great works of anim...
4,"It was the Sixties, and anyone with long hair ..."
...,...
49995,the people who came up with this are SICK AND ...
49996,"The script is so so laughable... this in turn,..."
49997,"""So there's this bride, you see, and she gets ..."
49998,Your mind will not be satisfied by this nobud...


In [5]:
print('리뷰 개수 :',len(train_df)) # 리뷰 개수 출력

리뷰 개수 : 50000


In [6]:
with open('imdb_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(train_df['review']))

In [7]:
spm.SentencePieceTrainer.Train('--input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

In [8]:
vocab_list = pd.read_csv('imdb.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
vocab_list.sample(10)

Unnamed: 0,0,1
1704,iculous,-1701
75,im,-72
2150,ously,-2147
3293,▁lau,-3290
927,ject,-924
1716,eth,-1713
3633,▁magn,-3630
2571,▁cheesy,-2568
3929,▁gorgeous,-3926
4166,▁depart,-4163


In [9]:
len(vocab_list)

5000

In [10]:
sp = spm.SentencePieceProcessor()
vocab_file = "imdb.model"
sp.load(vocab_file)

True

In [11]:
lines = [
  "I didn't at all think of it this way.",
  "I have waited a long time for someone to film"
]
for line in lines:
  print(line)
  print(sp.encode_as_pieces(line))
  print(sp.encode_as_ids(line))
  print()

I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 624, 4950, 4926, 139, 170, 378, 30, 58, 73, 413, 4945]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]



In [12]:
sp.GetPieceSize()

5000

In [13]:
sp.IdToPiece(430)

'▁character'

In [14]:
sp.PieceToId('▁character')

430

In [15]:
sp.DecodeIds([41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91])

'Iul wa fall aold timeooland to film'

In [17]:
sp.DecodePieces(['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film'])

'I have waited a long time for someone to film'

In [18]:
print(sp.encode('I have waited a long time for someone to film', out_type=str))
print(sp.encode('I have waited a long time for someone to film', out_type=int))

['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 142, 1364, 1121, 4, 668, 285, 93, 1079, 33, 91]
