# Bag of Words tutorial, part 2
    
    튜토리얼 part1에서 사용했던 BoW 대신,
    Google의 Word2Vec 알고리즘을 사용하여 학습 데이터를 구성한다.
    Word2Vec은 "Distributed Representation"을 기반으로 한다.
    Distributed Representation은 비지도 학습을 사용한 모델이다.
    BoW가 데이터에서 빈도수 최상위 n개의 단어로 vocabulary를 만들고,
    그 vocabulary에 대한 각 문장의 단어 빈도를 이용하여
    feature vector를 만드는 것과 달리,
    Distributed Representation은 데이터셋 전체에 있는 단어들로
    단어들 서로간의 연관관계를 구성한 것이다.
    즉, 각 단어에 연관된 단어 세트가 구성된다.
    (flower => plant, pretty, stem, leaf)

## import

In [35]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import logging
from gensim.models import word2vec



## 데이터 로딩

In [14]:
# All three review files share one layout: tab-separated, read with quoting=3
# (the numeric value of csv.QUOTE_NONE) so quote characters stay in the text.
_tsv_options = {"sep": "\t", "quoting": 3}

train = pd.read_csv("./data/bow/labeledTrainData.tsv", **_tsv_options)
test = pd.read_csv("./data/bow/testData.tsv", **_tsv_options)
unlabeled_train = pd.read_csv("./data/bow/unlabeledTrainData.tsv", **_tsv_options)

In [10]:
# Preview the first rows of the labeled training frame.
train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [11]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [15]:
# Preview the first rows of the unlabeled training frame.
unlabeled_train.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


In [16]:
# (rows, columns) of the labeled training frame.
train.shape

(25000, 3)

In [17]:
# (rows, columns) of the test frame.
test.shape

(25000, 2)

In [18]:
# (rows, columns) of the unlabeled training frame.
unlabeled_train.shape

(50000, 2)

## 문장 => 단어 리스트 함수, 문단 => 문장 리스트 변환 함수

In [29]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review (or sentence) into a list of lowercase words.

    Args:
        review: Raw text, possibly containing HTML markup.
        remove_stopwords: When True, drop NLTK's English stop words.

    Returns:
        list of str: Lowercase tokens made only of ASCII letters, in their
        original order.
    """
    # 1. Strip HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers (lxml, html5lib) are installed.
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Replace every character that is not an ASCII letter with a space.
    review_text = re.sub(r"[^a-zA-Z]", " ", review_text)
    # 3. Lowercase and split on whitespace.
    words = review_text.lower().split()
    # 4. Optionally filter stop words; a set keeps each membership test cheap.
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # 5. Return the list of words.
    return words

In [30]:
# Paragraph => sentences => word lists
# 1. Use NLTK's punkt tokenizer (English model) for sentence splitting.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# 2. Function that turns a paragraph (review) into a list of sentences
def review_to_sents(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each represented as a list of words.

    Args:
        review: Raw review text; surrounding whitespace is stripped first.
        tokenizer: Object whose ``tokenize(text)`` yields sentence strings.
        remove_stopwords: Passed through to ``review_to_wordlist``.

    Returns:
        list: One ``review_to_wordlist`` result per non-empty sentence.
    """
    stripped = review.strip()
    return [
        review_to_wordlist(raw, remove_stopwords)
        for raw in tokenizer.tokenize(stripped)
        if raw  # skip empty sentences
    ]

In [31]:
# One word list per sentence, gathered from the labeled set first and then
# from the unlabeled set.
sentences = []

print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_to_sents(review, tokenizer))

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sents(review, tokenizer))

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [32]:
# Total number of parsed sentences.
len(sentences)

795538

In [33]:
# Inspect the second parsed sentence (a list of words).
sentences[1]

['maybe',
 'i',
 'just',
 'want',
 'to',
 'get',
 'a',
 'certain',
 'insight',
 'into',
 'this',
 'guy',
 'who',
 'i',
 'thought',
 'was',
 'really',
 'cool',
 'in',
 'the',
 'eighties',
 'just',
 'to',
 'maybe',
 'make',
 'up',
 'my',
 'mind',
 'whether',
 'he',
 'is',
 'guilty',
 'or',
 'innocent']

## 학습

In [36]:
# Configure logging at INFO level with timestamped messages.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s : %(levelname)s : %(message)s')

# Word2Vec hyper-parameters
num_features = 300    # word vector dimensionality
min_word_count = 40   # minimum word count
num_workers = 4       # number of threads to run in parallel
context = 10          # context window size
downsampling = 1e-3   # downsample setting for frequent words

In [37]:
print("Training model...")

# Passing the sentences to the constructor trains the model immediately.
model = word2vec.Word2Vec(
    sentences,
    size=num_features,
    window=context,
    min_count=min_word_count,
    sample=downsampling,
    workers=num_workers,
)

# NOTE(review): replace=True presumably keeps only the normalized vectors
# (smaller model, no further training) - confirm against the gensim docs.
model.init_sims(replace=True)

# Persist the trained model under a name that encodes the settings.
model_name = "300features_40minwords_10context"
model.save(model_name)

2019-03-26 12:27:45,407 : INFO : collecting all words and their counts
2019-03-26 12:27:45,408 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-03-26 12:27:45,461 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2019-03-26 12:27:45,510 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2019-03-26 12:27:45,556 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types


Training model...


2019-03-26 12:27:45,605 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2019-03-26 12:27:45,652 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2019-03-26 12:27:45,698 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2019-03-26 12:27:45,745 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2019-03-26 12:27:45,793 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2019-03-26 12:27:45,840 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2019-03-26 12:27:45,887 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2019-03-26 12:27:45,933 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
2019-03-26 12:27:45,981 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keepin

2019-03-26 12:27:49,115 : INFO : PROGRESS: at sentence #760000, processed 16990622 words, keeping 120930 word types
2019-03-26 12:27:49,166 : INFO : PROGRESS: at sentence #770000, processed 17217759 words, keeping 121703 word types
2019-03-26 12:27:49,218 : INFO : PROGRESS: at sentence #780000, processed 17447905 words, keeping 122402 word types
2019-03-26 12:27:49,270 : INFO : PROGRESS: at sentence #790000, processed 17674981 words, keeping 123066 word types
2019-03-26 12:27:49,298 : INFO : collected 123504 word types from a corpus of 17798082 raw words and 795538 sentences
2019-03-26 12:27:49,298 : INFO : Loading a fresh vocabulary
2019-03-26 12:27:49,371 : INFO : effective_min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2019-03-26 12:27:49,372 : INFO : effective_min_count=40 leaves 17238940 word corpus (96% of original 17798082, drops 559142)
2019-03-26 12:27:49,451 : INFO : deleting the raw counts dictionary of 123504 items
2019-03-26 12:27:49,455 : I

2019-03-26 12:28:40,010 : INFO : EPOCH 4 - PROGRESS: at 56.44% examples, 892876 words/s, in_qsize 7, out_qsize 0
2019-03-26 12:28:41,015 : INFO : EPOCH 4 - PROGRESS: at 63.67% examples, 896487 words/s, in_qsize 7, out_qsize 0
2019-03-26 12:28:42,015 : INFO : EPOCH 4 - PROGRESS: at 70.82% examples, 898431 words/s, in_qsize 7, out_qsize 0
2019-03-26 12:28:43,018 : INFO : EPOCH 4 - PROGRESS: at 78.00% examples, 899734 words/s, in_qsize 7, out_qsize 0
2019-03-26 12:28:44,024 : INFO : EPOCH 4 - PROGRESS: at 85.28% examples, 901771 words/s, in_qsize 7, out_qsize 0
2019-03-26 12:28:45,024 : INFO : EPOCH 4 - PROGRESS: at 91.98% examples, 898420 words/s, in_qsize 7, out_qsize 0
2019-03-26 12:28:46,030 : INFO : EPOCH 4 - PROGRESS: at 99.16% examples, 899272 words/s, in_qsize 7, out_qsize 0
2019-03-26 12:28:46,129 : INFO : worker thread finished; awaiting finish of 3 more threads
2019-03-26 12:28:46,130 : INFO : worker thread finished; awaiting finish of 2 more threads
2019-03-26 12:28:46,132 : I

## 결과

### 문장에 어울리지 않는 단어 찾기

In [39]:
# Query the word vectors via `model.wv`; calling this on the model itself is
# deprecated and emits a DeprecationWarning.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'kitchen'

In [40]:
# Query the word vectors via `model.wv`; calling this on the model itself is
# deprecated and emits a DeprecationWarning.
model.wv.doesnt_match("france england germany berlin".split())

  """Entry point for launching an IPython kernel.


'berlin'

### 지정 단어에 가장 근접한 단어들 찾기

In [41]:
# Nearest neighbours of "man"; accessed through `model.wv` because the
# model-level method is deprecated.
model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6287728548049927),
 ('lady', 0.5718315839767456),
 ('lad', 0.5706989765167236),
 ('farmer', 0.5474367737770081),
 ('men', 0.5279064178466797),
 ('chap', 0.5121819972991943),
 ('monk', 0.5119988918304443),
 ('politician', 0.5100113749504089),
 ('guy', 0.5092923045158386),
 ('person', 0.5090522766113281)]

In [42]:
# Nearest neighbours of "queen"; accessed through `model.wv` because the
# model-level method is deprecated.
model.wv.most_similar("queen")

  """Entry point for launching an IPython kernel.


[('princess', 0.6503031253814697),
 ('bride', 0.616742730140686),
 ('goddess', 0.6125830411911011),
 ('victoria', 0.6076536178588867),
 ('belle', 0.5995310544967651),
 ('marlene', 0.579119086265564),
 ('latifah', 0.5784250497817993),
 ('maid', 0.5770208239555359),
 ('mistress', 0.5641007423400879),
 ('showgirl', 0.5595673322677612)]

In [44]:
# Nearest neighbours of "awful"; accessed through `model.wv` because the
# model-level method is deprecated.
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7574539184570312),
 ('horrible', 0.7391939759254456),
 ('dreadful', 0.7299064993858337),
 ('atrocious', 0.7201664447784424),
 ('abysmal', 0.7008265256881714),
 ('appalling', 0.6705971956253052),
 ('horrid', 0.6693190336227417),
 ('horrendous', 0.6669641137123108),
 ('lousy', 0.6343021392822266),
 ('laughable', 0.6074416637420654)]