In [1]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
TRAIN_CLEAN_DATA = 'train_clean.csv'
RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [3]:
train_data = pd.read_csv(DATA_IN_PATH +TRAIN_CLEAN_DATA)

In [4]:
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

In [5]:
sentences = []
for review in reviews:
    sentences.append(review.split())

In [6]:
num_features = 300  # 단어에 대해 임베딩된 벡터의 차원
min_word_count = 40 # 적은 빈도수 배제
num_workers = 4     # 학습을 위한 프로세스 개수 지정
context = 10        # 컨텍스트 윈도우 크기 지정
downsampling = 1e-3 # 빠른 학습을 위해 정답 단어 레이블에 대한 다운샘플링
                    # 비율지정(보통 0.001이 좋은 성능을 낸다고 함)

In [7]:
import logging
# level = logging.INFO : word2vec의 학습과정에서 로그메시지를 양식에맞게 -
# - info 수준으로 보여줌
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.INFO)

In [8]:
from gensim.models import word2vec

model = word2vec.Word2Vec(sentences, workers=num_workers, vector_size=num_features, min_count=min_word_count,
                        window=context,sample=downsampling)

2022-11-07 17:17:29,354:INFO:collecting all words and their counts
2022-11-07 17:17:29,355:INFO:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-11-07 17:17:29,584:INFO:PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2022-11-07 17:17:29,831:INFO:PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2022-11-07 17:17:29,955:INFO:collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2022-11-07 17:17:29,955:INFO:Creating a fresh vocabulary
2022-11-07 17:17:30,016:INFO:Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 8160 unique words (11.02% of original 74065, drops 65905)', 'datetime': '2022-11-07T17:17:30.016690', 'gensim': '4.2.0', 'python': '3.10.4 | packaged by conda-forge | (main, Mar 30 2022, 08:38:02) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'prepare_vocab'}
2022-11-07 17:17:30,017:INFO:Word2Vec lifecycle event {'msg': 'e

In [9]:
'''
words : 단어의 모음인 하나의 리뷰
model : word2vec모델
num_features : word2vec로 임베딩할 때 정했던 벡터의 차원 수
'''
def get_features(words, model, num_features):
    # 출력 벡터 초기화
    feature_vector = np.zeros((num_features),dtype=np.float32)
    num_words = 0
    # 어휘 사전 준비
    index_to_key_set = set(model.wv.index_to_key)
    for w in words:
        if w in index_to_key_set:
            num_words += 1
            # 사전에 해당하는 단어에 대해 단어 벡터를 더함
            feature_vector = np.add(feature_vector, model.wv[w])
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [12]:
def get_dataset(reviews,model,num_features):
    dataset=list()
    
    for s in reviews:
        dataset.append(get_features(s,model,num_features))
    
    reviewFeatureVecs=np.stack(dataset)

    return reviewFeatureVecs

In [16]:
test_data_vecs = get_dataset(sentences,model,num_features)

In [17]:
from sklearn.model_selection import train_test_split
import numpy as np
X = test_data_vecs
y = np.array(sentiments)
X_train, X_test, y_train, y_test = train_test_split(X, y, \
                            test_size=TEST_SPLIT, random_state=RANDOM_SEED)

In [18]:
from sklearn.linear_model import LogisticRegression
lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)
predicted = lgs.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
from sklearn import metrics
print("Accuracy: %f" % lgs.score(X_test, y_test))  #checking the accuracy
print("Precision: %f" % metrics.precision_score(y_test, predicted))
print("Recall: %f" % metrics.recall_score(y_test, predicted))
print("F1-Score: %f" % metrics.f1_score(y_test, predicted))

Accuracy: 0.866000
Precision: 0.859868
Recall: 0.876935
F1-Score: 0.868318
