In [1]:
import numpy as np
import pandas as pd

DATA_IN_PATH='./data_in/'
TRAIN_CLEAN_DATA='train_clean.csv'

train_data = pd.read_csv(DATA_IN_PATH+TRAIN_CLEAN_DATA)

reviews = train_data['review']
sentiments = train_data['sentiment']
sentences =[review.split() for review in reviews ]

In [2]:
num_features=300# word vector 특징값의 수
min_word_count=40
num_workers=4
context=10
downsampling=1e-3

In [3]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

In [4]:
from gensim.models import word2vec
model = word2vec.Word2Vec(sentences,workers=num_workers,\
                          size=num_features,\
                          min_count=min_word_count,\
                          window=context,\
                          sample=downsampling)

2021-05-21 19:27:32,788 : INFO : collecting all words and their counts
2021-05-21 19:27:32,790 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-05-21 19:27:32,997 : INFO : PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2021-05-21 19:27:33,210 : INFO : PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2021-05-21 19:27:33,315 : INFO : collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2021-05-21 19:27:33,315 : INFO : Loading a fresh vocabulary
2021-05-21 19:27:33,362 : INFO : effective_min_count=40 retains 8160 unique words (11% of original 74065, drops 65905)
2021-05-21 19:27:33,363 : INFO : effective_min_count=40 leaves 2627273 word corpus (87% of original 2988089, drops 360816)
2021-05-21 19:27:33,392 : INFO : deleting the raw counts dictionary of 74065 items
2021-05-21 19:27:33,395 : INFO : sample=0.001 downsamples 30 most-common words
2021-05-21 19:27:33,396 :

In [5]:
model_name="300features_40minwords_10context"
model.save(model_name)

2021-05-21 19:28:00,791 : INFO : saving Word2Vec object under 300features_40minwords_10context, separately None
2021-05-21 19:28:00,794 : INFO : not storing attribute vectors_norm
2021-05-21 19:28:00,795 : INFO : not storing attribute cum_table
2021-05-21 19:28:00,957 : INFO : saved 300features_40minwords_10context


In [6]:
def get_features(words,model,num_features):
    feature_vector = np.zeros((num_features),dtype=np.float32)   
    num_words = 0
    
    # 어휘사전
    index2word_set=set(model.wv.index2word)
    
    for w in words:
        if w in index2word_set:
            num_words+=1
            feature_vector = np.add(feature_vector,model.wv[w])
    
    feature_vector = np.divide(feature_vector,num_words)
    return feature_vector


In [7]:
def get_dataset(reviews,model,num_features):
    dataset =[ get_features(s,model,num_features) for s in reviews ]
    reviewFeatureVecs = np.stack(dataset)
    return reviewFeatureVecs

In [8]:
test_data_vecs = get_dataset(sentences,model,num_features)

In [9]:
from sklearn.model_selection import train_test_split

X=test_data_vecs
y=np.array(sentiments)

RANDOM_SEED=100
TEST_SPLIT=0.2


X_train,X_eval, y_train,y_eval = train_test_split(X,y,test_size=TEST_SPLIT,random_state=RANDOM_SEED)

In [10]:
from sklearn.linear_model import LogisticRegression
lgs=LogisticRegression(class_weight='balanced')
lgs.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight='balanced')

## Evaluation

In [11]:
print("Accuracy : {}".format(lgs.score(X_eval,y_eval)))

Accuracy : 0.8592


In [12]:
TEST_CLEAN_DATA='test_clean.csv'

test_data=pd.read_csv(DATA_IN_PATH+TEST_CLEAN_DATA)
test_review = list(test_data['review'])

In [13]:
test_sentences =[review.split() for review in test_review]

In [14]:
test_data_vecs = get_dataset(test_sentences,model,num_features)

## Prediction

In [15]:
import os
DATA_OUT_PATH='./data_out/'

test_predicted = lgs.predict(test_data_vecs)

if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH)

In [16]:
test_data['id']=test_data['id'].apply(lambda x : x.replace('"',''))

In [17]:
ids=list(test_data['id'])

In [18]:
answer_dataset=pd.DataFrame({'id':ids,'sentiment':test_predicted})
answer_dataset.to_csv(DATA_OUT_PATH+'lgs_answer.csv',index=False)