In [8]:
import pandas as pd
import numpy as np

## 전처리

In [2]:
data_in_path = './data_in/'
train_clean_data = 'train_clean.csv'

train_data = pd.read_csv(data_in_path + train_clean_data)
train_data.head()

Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1


In [3]:
reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

sentences = []
for review in reviews:
    sentences.append(review.split())

sentences[0]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary',
 'watched',
 'wiz',
 'watched',
 'moonwalker',
 'maybe',
 'want',
 'get',
 'certain',
 'insight',
 'guy',
 'thought',
 'really',
 'cool',
 'eighties',
 'maybe',
 'make',
 'mind',
 'whether',
 'guilty',
 'innocent',
 'moonwalker',
 'part',
 'biography',
 'part',
 'feature',
 'film',
 'remember',
 'going',
 'see',
 'cinema',
 'originally',
 'released',
 'subtle',
 'messages',
 'mj',
 'feeling',
 'towards',
 'press',
 'also',
 'obvious',
 'message',
 'drugs',
 'bad',
 'kay',
 'visually',
 'impressive',
 'course',
 'michael',
 'jackson',
 'unless',
 'remotely',
 'like',
 'mj',
 'anyway',
 'going',
 'hate',
 'find',
 'boring',
 'may',
 'call',
 'mj',
 'egotist',
 'consenting',
 'making',
 'movie',
 'mj',
 'fans',
 'would',
 'say',
 'made',
 'fans',
 'true',
 'really',
 'nice',
 'actual',
 'feature',
 'film',
 'bit',
 'finally',
 'starts',
 'minutes',
 'excluding',
 'smooth',
 'crim

## word2vec 벡터화

In [4]:
# 학습 시 필요한 하이퍼파라미터
num_features = 300         # 워드 벡터 특징값 수
min_word_count = 40        # 단어에 대한 최소 빈도 수
num_workers = 4            # 프로세스 개수
context = 10               # 컨텍스트 윈도우 크기
downsampling = 1e-3        # 다운샘플링 비율

In [5]:
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

In [6]:
from gensim.models import word2vec
print('Training model...')
model = word2vec.Word2Vec(sentences,
                         workers=num_workers,
                         size=num_features,
                         min_count=min_word_count,
                         window=context,
                         sample=downsampling)

2020-01-21 16:43:21,189: INFO: collecting all words and their counts
2020-01-21 16:43:21,192: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2020-01-21 16:43:21,398: INFO: PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2020-01-21 16:43:21,598: INFO: PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2020-01-21 16:43:21,705: INFO: collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2020-01-21 16:43:21,706: INFO: Loading a fresh vocabulary
2020-01-21 16:43:21,741: INFO: effective_min_count=40 retains 8160 unique words (11% of original 74065, drops 65905)
2020-01-21 16:43:21,742: INFO: effective_min_count=40 leaves 2627273 word corpus (87% of original 2988089, drops 360816)
2020-01-21 16:43:21,768: INFO: deleting the raw counts dictionary of 74065 items
2020-01-21 16:43:21,769: INFO: sample=0.001 downsamples 30 most-common words
2020-01-21 16:43:21,770: INFO: downsampling leaves estimated 2494384 word corpus (94.9% of prior 2627273)
2020-01-21 16:43:21,791: INFO: estimated required memory for 8160 words and 300 dimensions: 23664000 byte

In [7]:
model_name = '300features_40minwords_10context'
model.save(model_name)

2020-01-21 16:44:47,258: INFO: saving Word2Vec object under 300features_40minwords_10context, separately None
2020-01-21 16:44:47,260: INFO: not storing attribute vectors_norm
2020-01-21 16:44:47,260: INFO: not storing attribute cum_table
2020-01-21 16:44:47,436: INFO: saved 300features_40minwords_10context


In [12]:
def get_features(words, model, num_features):
    # 출력 벡터 초기화
    feature_vector = np.zeros((num_features), dtype=np.float32)
    
    num_words = 0
    # 어휘사전 준비
    index2word_set = set(model.wv.index2word)
    
    for w in words:
        if w in index2word_set:
            num_words += 1
            # 사전에 해당하는 단어에 대해 단어 벡터를 더함
            feature_vector = np.add(feature_vector, model[w])
    
    # 문장의 단어 수만큼 나누어 단어 벡터의 평균값을 문장 벡터로 함
    feature_vector = np.divide(feature_vector, num_words)
    return feature_vector

In [13]:
def get_dataset(reviews, model, num_features):
    dataset = list()
    
    for s in reviews:
        dataset.append(get_features(s, model, num_features))
        
    reviewFeatureVecs = np.stack(dataset)
    
    return reviewFeatureVecs

In [14]:
test_data_vecs = get_dataset(sentences, model, num_features)

  del sys.path[0]


## 학습과 검증 데이터셋 분리

In [16]:
from sklearn.model_selection import train_test_split

X = test_data_vecs
y = np.array(sentiments)

random_seed = 42
test_split = 0.2

X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=test_split, random_state=random_seed)
print(X_train.shape, y_train.shape)
print(X_eval.shape, y_eval.shape)

(20000, 300) (20000,)
(5000, 300) (5000,)


## 모델 선언 및 학습

In [17]:
from sklearn.linear_model import LogisticRegression

lgs = LogisticRegression(class_weight='balanced')
lgs.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## 검증 데이터셋을 이용한 성능 평가

In [18]:
print('Accuracy:', lgs.score(X_eval, y_eval))

Accuracy: 0.8644


## 데이터 제출

In [19]:
test_clean_data = 'test_clean.csv'

test_data = pd.read_csv(data_in_path + test_clean_data, header=0)
test_data.head()

Unnamed: 0,review,id
0,naturally film main themes mortality nostalgia...,12311_10
1,movie disaster within disaster film full great...,8348_2
2,movie kids saw tonight child loved one point k...,5828_4
3,afraid dark left impression several different ...,7186_2
4,accurate depiction small time mob life filmed ...,12128_7


In [20]:
test_review = list(test_data['review'])

test_sentences = []
for review in test_review:
    test_sentences.append(review.split())

In [21]:
test_data_vecs = get_dataset(test_sentences, model, num_features)

  del sys.path[0]


In [22]:
import os
data_out_path = './data_out/'

test_predicted = lgs.predict(test_data_vecs)

if not os.path.exists(data_out_path):
    os.makedirs(data_out_path)
    
ids = list(test_data['id'])
answer_dataset = pd.DataFrame({
    'id': ids,
    'sentiment': test_predicted
})
answer_dataset.to_csv(data_out_path + 'lgs_answer.csv', index=False)