In [1]:
import os
import re
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
DATA_IN_PATH = './data_in/'
TRAIN_CLEAN_DATA = 'train_clean.csv'

RANDOM_SEED = 42
TEST_SPLIT = 0.2

train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)

reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

sentences = []

for review in reviews:
    sentences.append(review.split())
    
num_features = 300  # 워드 벡터 특징값 수
min_word_count = 40 # 단어에 대한 최소 빈도 수
num_workers = 4     # 프로세스 개수
context = 10        # 컨텍스트 윈도 크기
downsampling = 1e-3 # 다운샘플링 비율

## word2vec 모델의 하이퍼파라미터
- num_features : 각 단어의 임베딩 벡터 차원
- min_word_count : 모델에 의미 있는 단어를 가지고 학습하기 위해 적은 빈도 수의 단어들은 학습하지 않는다.
- num_workers : 모델 학습 시 학습을 위한 프로세스 개수를 지정
- context : word2vec 수행하기 위한 컨텍스트 윈도 크기 지정
- downsampling : word2vec 학습을 수행할 때 빠른 학습을 위해 정답 단어 라벨에 대한 다운샘플링 비율을 지정한다. 보통 0.001이 좋은 성능을 낸다고 한다.

In [3]:
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    # 로그 보여줄 형식
                   level=logging.INFO) # 로그 수준

In [5]:
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, 
                          workers=num_workers,
                          vector_size=num_features,
                          min_count = min_word_count,
                          window=context,
                          sample=downsampling)

2023-04-08 13:41:05,730 : INFO : collecting all words and their counts
2023-04-08 13:41:05,739 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-04-08 13:41:05,961 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2023-04-08 13:41:06,186 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2023-04-08 13:41:06,305 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2023-04-08 13:41:06,306 : INFO : Creating a fresh vocabulary
2023-04-08 13:41:06,357 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 8160 unique words (11.017349625329103%% of original 74065, drops 65905)', 'datetime': '2023-04-08T13:41:06.357318', 'gensim': '4.1.2', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'prepare_vocab'}
2023-04-08 13:41:06,358 : INFO : Word2Vec lifec

In [6]:
model

<gensim.models.word2vec.Word2Vec at 0x14c636a45c8>

In [21]:
def get_features(words, model, num_features):
    feature_vector = np.zeros((num_features), dtype=np.float32)
    
    num_words = 0
    index2word_set = set(model.wv.index_to_key) # 8160길이
    
    for w in words:
        if w in index2word_set:
            num_words += 1
            feature_vector = np.add(feature_vector, model.wv[w])
            
    feature_vector = np.divide(feature_vector, num_words)
    
    return feature_vector

def get_dataset(reviews, model, num_features):
    dataset = list()
    
    for s in reviews:
        dataset.append(get_features(s, model, num_features))
        
    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs

test_data_vecs = get_dataset(sentences, model, num_features)

In [13]:
for s in sentences:
    print(s)
    break

['stuff', 'going', 'moment', 'mj', 'started', 'listening', 'music', 'watching', 'odd', 'documentary', 'watched', 'wiz', 'watched', 'moonwalker', 'maybe', 'want', 'get', 'certain', 'insight', 'guy', 'thought', 'really', 'cool', 'eighties', 'maybe', 'make', 'mind', 'whether', 'guilty', 'innocent', 'moonwalker', 'part', 'biography', 'part', 'feature', 'film', 'remember', 'going', 'see', 'cinema', 'originally', 'released', 'subtle', 'messages', 'mj', 'feeling', 'towards', 'press', 'also', 'obvious', 'message', 'drugs', 'bad', 'kay', 'visually', 'impressive', 'course', 'michael', 'jackson', 'unless', 'remotely', 'like', 'mj', 'anyway', 'going', 'hate', 'find', 'boring', 'may', 'call', 'mj', 'egotist', 'consenting', 'making', 'movie', 'mj', 'fans', 'would', 'say', 'made', 'fans', 'true', 'really', 'nice', 'actual', 'feature', 'film', 'bit', 'finally', 'starts', 'minutes', 'excluding', 'smooth', 'criminal', 'sequence', 'joe', 'pesci', 'convincing', 'psychopathic', 'powerful', 'drug', 'lord', 

In [32]:
len(set(model.wv.index_to_key))

8160

In [18]:
set(model.wv.index_to_key)

{'namely',
 'fortunately',
 'sun',
 'muni',
 'transport',
 'blockbusters',
 'mood',
 'type',
 'equipment',
 'th',
 'solo',
 'transition',
 'funky',
 'fx',
 'pfeiffer',
 'feast',
 'elderly',
 'judy',
 'angles',
 'ended',
 'amazingly',
 'verhoeven',
 'convoluted',
 'capacity',
 'cooking',
 'taylor',
 'benefit',
 'steele',
 'turner',
 'north',
 'monty',
 'chain',
 'guide',
 'expecting',
 'rage',
 'prisoners',
 'match',
 'paz',
 'anil',
 'lighting',
 'friendly',
 'portrays',
 'discussion',
 'parking',
 'wisdom',
 'ludicrous',
 'tribute',
 'lend',
 'prostitutes',
 'tap',
 'boobs',
 'nutshell',
 'rejects',
 'hysterical',
 'fades',
 'lester',
 'signature',
 'reviewers',
 'producer',
 'hippie',
 'premise',
 'crashes',
 'uwe',
 'wow',
 'maker',
 'reminded',
 'cole',
 'estranged',
 'scares',
 'ten',
 'conclusions',
 'attack',
 'entrance',
 'guessed',
 'errol',
 'roughly',
 'stumble',
 'eager',
 'dealing',
 'devices',
 'curiously',
 'anthony',
 'voice',
 'haunting',
 'advance',
 'interests',
 'sp

In [19]:
# 단어 'stuff'의 word2vec 임베딩 결과 벡터 예시

model.wv['stuff']

array([ 4.99248832e-01, -3.63430679e-01, -1.11796208e-01, -2.10639611e-01,
       -3.92391384e-01,  1.05572367e+00,  6.31620968e-03, -5.81072569e-01,
       -7.35513389e-01,  1.17671025e+00,  4.03569818e-01,  9.35961068e-01,
       -5.84544301e-01, -2.96590596e-01,  4.66720253e-01, -7.94575274e-01,
        7.73075402e-01,  6.72431439e-02, -4.16245192e-01,  6.61186948e-02,
        1.38294053e+00,  3.99340242e-01,  8.05702284e-02, -5.28923810e-01,
       -1.50466397e-01, -3.74172539e-01,  1.82456821e-01,  1.58939720e-03,
       -7.93863952e-01, -3.48301381e-01, -2.81143218e-01,  6.85884476e-01,
        1.24131858e-01,  5.02883434e-01,  4.12299305e-01, -1.20862179e-01,
       -1.72999173e-01,  1.58614233e-01, -1.62101477e-01, -6.11127377e-01,
       -1.60749689e-01, -6.14646077e-01,  3.51361960e-01,  6.17551267e-01,
       -2.86332190e-01,  1.99311048e-01,  2.85046875e-01,  5.20428538e-01,
       -5.88844657e-01,  4.37829167e-01,  1.27428576e-01,  4.06116217e-01,
        8.69752586e-01, -

In [20]:
len(model.wv['stuff'])

300

In [22]:
test_data_vecs

array([[ 0.12239955,  0.15240708, -0.16109604, ...,  0.08871711,
        -0.16920629, -0.01497271],
       [ 0.09853074,  0.00749504, -0.17417702, ...,  0.21652001,
         0.13724329, -0.03892314],
       [-0.03684576,  0.1319762 , -0.0588348 , ..., -0.12680957,
        -0.01914953, -0.05520926],
       ...,
       [-0.01455216,  0.2316143 , -0.17487827, ...,  0.26816922,
         0.01325628,  0.03573281],
       [ 0.19930726,  0.16365069, -0.17181775, ...,  0.12439137,
         0.07807317,  0.00732926],
       [-0.01111315,  0.3054053 , -0.00737039, ..., -0.00944764,
         0.13471833, -0.06522714]], dtype=float32)

In [23]:
test_data_vecs.shape

(25000, 300)

In [24]:
from sklearn.model_selection import train_test_split
import numpy as np

x = test_data_vecs
y = np.array(sentiments)

x_train, x_test, y_train, y_test = train_test_split(
    x,y,test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [25]:
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(x_train, y_train)

print("Accuracy: %f" % lgs.score(x_test, y_test))

Accuracy: 0.866400


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [26]:
TEST_CLEAN_DATA = 'test_clean.csv'
test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)
test_review = list(test_data['review'])

test_sentences = list()
for review in test_review:
    test_sentences.append(review.split())
    
test_data_vecs = get_dataset(test_sentences, model, num_features)

In [27]:
test_data_vecs.shape

(25000, 300)

In [28]:
DATA_OUT_PATH = './data_out/'

test_predicted = lgs.predict(test_data_vecs)

ids = list(test_data['id'])
answer_dataset = pd.DataFrame({'id':ids, 'sentiment':test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_w2v_answer.csv',
                     index=False, quoting=3)

model_name = "300features_40minwords_10context"
model.save(model_name)

2023-04-08 14:04:47,429 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-08T14:04:47.429438', 'gensim': '4.1.2', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'saving'}
2023-04-08 14:04:47,430 : INFO : not storing attribute cum_table
2023-04-08 14:04:47,451 : INFO : saved 300features_40minwords_10context


모델 저장 시 모델 이름에 하이퍼파라미터 설정 내용 담기 ! 

Word2Vec.load()를 통해 나중에 다시 모델 사용 가능!

캐글 제출 --> 정확도 0.86104 

--> 난 책과 달리 tfidf (char 기준) 보다 0.01 높게 나옴 !

- word2vec이 항상 좋은 결과를 만들지는 않음 
    - 데이터가 더 많을 경우 word2vec 이 보통 더 좋은 결과