## 4.1.3 Linear Regression Example with Word2Vec

### Word2Vec Feature Example

In [1]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
DATA_IN_PATH = '../dataset/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [5]:
train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

In [7]:
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

In [12]:
sentences = []
for review in reviews:
    sentences.append(word_tokenize(review))

In [14]:
sentences[0]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary',
 'watched',
 'wiz',
 'watched',
 'moonwalker',
 'maybe',
 'want',
 'get',
 'certain',
 'insight',
 'guy',
 'thought',
 'really',
 'cool',
 'eighties',
 'maybe',
 'make',
 'mind',
 'whether',
 'guilty',
 'innocent',
 'moonwalker',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'remember',
 'going',
 'see',
 'cinema',
 'originally',
 'released',
 'subtle',
 'messages',
 'mj',
 'feeling',
 'towards',
 'press',
 'also',
 'obvious',
 'message',
 'drugs',
 'bad',
 'kay',
 'visually',
 'impressive',
 'course',
 'michael',
 'jackson',
 'unless',
 'remotely',
 'like',
 'mj',
 'anyway',
 'going',
 'hate',
 'find',
 'boring',
 'may',
 'call',
 'mj',
 'egotist',
 'consenting',
 'making',
 'movie',
 'mj',
 'fans',
 'would',
 'say',
 'made',
 'fans',
 'true',
 'really',
 'nice',
 'actual',
 'feature',
 'film',
 'bit',
 'finally',
 'starts',
 'minutes',
 'excluding',
 'smooth',
 'crim

In [10]:
# 하이퍼 파라미터
num_features = 300 # 워드 벡터 특징값 수  (임베딩 벡터 차원) 
min_word_count = 40 # 단어에 대한 최소 빈도 수 
num_workers = 4 # 학습을 위한 프로세스 개수      
context = 10 # 컨텍스트 윈도우 크기          
downsampling = 1e-3 # 빠른 학습을 위해 정답 단어 레이블에 대한 다운샘플링 비율(0.001이 좋음)

In [15]:
# 학습 진행사항 확인
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
   level=logging.INFO)

In [17]:
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, 
                          workers=num_workers,
                         size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling)

2021-01-08 12:52:44,285 : INFO : collecting all words and their counts
2021-01-08 12:52:44,286 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-01-08 12:52:44,547 : INFO : PROGRESS: at sentence #10000, processed 1205900 words, keeping 51374 word types
2021-01-08 12:52:44,828 : INFO : PROGRESS: at sentence #20000, processed 2397955 words, keeping 67660 word types
2021-01-08 12:52:44,974 : INFO : collected 74064 word types from a corpus of 2989726 raw words and 25000 sentences
2021-01-08 12:52:44,975 : INFO : Loading a fresh vocabulary
2021-01-08 12:52:45,022 : INFO : effective_min_count=40 retains 8161 unique words (11% of original 74064, drops 65903)
2021-01-08 12:52:45,023 : INFO : effective_min_count=40 leaves 2628958 word corpus (87% of original 2989726, drops 360768)
2021-01-08 12:52:45,050 : INFO : deleting the raw counts dictionary of 74064 items
2021-01-08 12:52:45,053 : INFO : sample=0.001 downsamples 30 most-common words
2021-01-08 12:52:45,054 :

In [18]:
# 모델의 하이퍼 파라미터를 설정한 내용을 모델 이름에 담는다면 나중에 참고하기 좋을 것
# 모델을 저장하면 Word2Vec.load()를 통해 다시 사용할 수 있음
model_name = '../snapshot/300features_40minwords_10context'
model.save(model_name)

2021-01-08 12:56:31,259 : INFO : saving Word2Vec object under ../snapshot/300features_40minwords_10context, separately None
2021-01-08 12:56:31,261 : INFO : not storing attribute vectors_norm
2021-01-08 12:56:31,262 : INFO : not storing attribute cum_table
2021-01-08 12:56:31,496 : INFO : saved ../snapshot/300features_40minwords_10context


In [19]:
def get_features(words, model, num_features):
    # 출력 벡터 초기화
    feature_vector = np.zeros((num_features), dtype=np.float32)
    
    num_words = 0
    index2word_set = set(model.wv.index2word) # 단어 사전
    
    for w in words:
        if w in index2word_set:
            num_words+=1
            # 사전에 해당하는 단어에 대해 단어 벡터를 더함 
            feature_vector = np.add(feature_vector, model[w])
    
    # 문장의 단어 수만큼 나누어 단어 벡터의 평균을 취함
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [22]:
def get_dataset(reviews, model, num_features):
    dataset = list()
    for s in reviews:
        dataset.append(get_features(s, model, num_features))
    
    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs

In [23]:
test_data_vecs = get_dataset(sentences, model, num_features)

  # Remove the CWD from sys.path while we load stuff.


In [37]:
model['sentences']

  """Entry point for launching an IPython kernel.


array([-0.02640619, -0.14458393, -0.09509004, -0.08096875,  0.04741286,
       -0.03741783,  0.0219393 ,  0.03156377,  0.08235965, -0.06761346,
       -0.03597539, -0.01125347, -0.15502955, -0.06209488, -0.18695156,
        0.03714039,  0.08632327, -0.01063505,  0.03024327, -0.05743328,
        0.1429146 , -0.15087967,  0.05187488,  0.0617708 ,  0.159874  ,
       -0.10153214, -0.01214275,  0.08777526, -0.15193017, -0.01255662,
        0.0801809 , -0.10454084,  0.0387894 , -0.17809501, -0.17801481,
       -0.14112939,  0.01613287,  0.02805205,  0.0159293 ,  0.03151309,
        0.01988512, -0.02880661,  0.10957897,  0.09595552,  0.26332775,
        0.13695167,  0.00702472, -0.00435817,  0.05849722,  0.02389651,
        0.05995859,  0.054804  ,  0.0435122 ,  0.00068778, -0.11518065,
        0.0794315 ,  0.05362769, -0.24948503, -0.04995171, -0.1138126 ,
       -0.03293214,  0.1949825 ,  0.10471201,  0.01226235, -0.24788082,
        0.13863872, -0.20625623, -0.05579423,  0.23053773, -0.10

In [33]:
test_data_vecs # 실제 학습에 사용될 입력값 (임베딩)

array([[-0.07590105,  0.12482008, -0.14356229, ..., -0.0393403 ,
         0.29451394, -0.05616223],
       [-0.2558206 ,  0.05033305,  0.3669831 , ..., -0.16925296,
         0.38734385, -0.14674819],
       [-0.42619082,  0.2628543 ,  0.4130558 , ..., -0.15671903,
         0.2993253 , -0.06611255],
       ...,
       [-0.1997599 ,  0.12621206,  0.251406  , ..., -0.01236875,
         0.04296463, -0.15083946],
       [ 0.04327991,  0.17174765, -0.4576193 , ..., -0.04074266,
         0.09678981, -0.09552169],
       [-0.19706714, -0.08549628,  0.16449447, ..., -0.11568981,
         0.11454067, -0.03168364]], dtype=float32)

In [24]:
# 학습과 검증 데이터셋 분리
from sklearn.model_selection import train_test_split
import numpy as np

X = test_data_vecs
y = np.array(sentiments)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [25]:
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(class_weight='balanced')

In [26]:
print("Accuracy: %f" % lgs.score(X_test, y_test)) 

Accuracy: 0.862000


In [28]:
TEST_CLEAN_DATA = 'test_clean.csv'

test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)

test_review = list(test_data['review'])

In [29]:
test_data.head(5)

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,"""12311_10"""
1,movie disaster within disaster film full great...,"""8348_2"""
2,movie kids saw tonight child loved one point k...,"""5828_4"""
3,afraid dark left impression several different ...,"""7186_2"""
4,accurate depiction small time mob life filmed ...,"""12128_7"""


In [30]:
test_sentences = list()
for review in test_review:
    test_sentences.append(review.split())

In [31]:
test_data_vecs = get_dataset(test_sentences, model, num_features)

  # Remove the CWD from sys.path while we load stuff.


In [32]:
DATA_OUT_PATH = './data_out/'

test_predicted = lgs.predict(test_data_vecs)

if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)
    
ids = list(test_data['id'])
answer_dataset = pd.DataFrame({'id': ids, 'sentiment': test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv', index=False, quoting=3)

In [None]:
model_name = "300features_40minwords_10context"
model.save(model_name)