In [1]:
# Mount Google Drive so files under /content/drive are readable and writable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!ln -s /content/drive/My\ Drive/ /my\drive

ln: failed to create symbolic link '/mydrive/My Drive': Operation not supported


In [3]:
import os
import re

import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

데이터 전처리

In [4]:
# Presumably the random seed and the hold-out fraction for a train/test split.
# NOTE(review): neither constant is referenced by the cells visible in this notebook — confirm they are still needed.
RANDOM_SEED = 42
TEST_SPLIT = 0.2

In [5]:
# Load the training reviews; the preview shows an index column plus 'review' and 'sentiment'.
# NOTE(review): the file name suggests the text was already cleaned upstream — confirm.
train_data = pd.read_csv('/content/drive/My Drive/딥러닝 실습/data_in/train_clean.csv')
train_data.head()

Unnamed: 0,review,sentiment
0,stuff going moment mj started listening music ...,1
1,classic war worlds timothy hines entertaining ...,1
2,film starts manager nicholas bell giving welco...,0
3,must assumed praised film greatest filmed oper...,0
4,superbly trashy wondrously unpretentious explo...,1


In [6]:
# Pull the review texts and their labels out of the frame as plain Python lists.
reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

In [7]:
# Tokenise every review by splitting on whitespace; one token list per review.
sentences = [text.split() for text in reviews]

Word2Vec 모델 학습 진행

In [8]:
num_features = 300 # dimensionality of the embedding vector learned for each word
min_word_count = 10 # words that occur 9 times or fewer are left out of training
num_workers = 4 # number of parallel workers used while training the model
context = 10 # context window size
downsampling = 1e-3 # downsampling rate (passed to Word2Vec as `sample`) used to speed up training; 0.001 usually works well

In [9]:
import logging # used to watch the progress of Word2Vec training
# INFO level with a timestamp / level / message format.
logging.basicConfig(format ='%(asctime)s: %(levelname)s: %(message)s',level=logging.INFO)

In [10]:
!pip install gensim



In [11]:
import gensim
from gensim.models import word2vec

# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`.
# Pick the keyword that the installed version accepts so the cell runs on both 3.x and 4.x.
_dim_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Train the embeddings on the tokenised training reviews.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
    **{_dim_kwarg: num_features},
)

2020-07-13 06:23:32,456: INFO: 'pattern' package not found; tag filters are not available for English
2020-07-13 06:23:32,461: INFO: collecting all words and their counts
2020-07-13 06:23:32,462: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-07-13 06:23:32,762: INFO: PROGRESS: at sentence #10000, processed 1205223 words, keeping 51374 word types
2020-07-13 06:23:33,068: INFO: PROGRESS: at sentence #20000, processed 2396605 words, keeping 67660 word types
2020-07-13 06:23:33,224: INFO: collected 74065 word types from a corpus of 2988089 raw words and 25000 sentences
2020-07-13 06:23:33,225: INFO: Loading a fresh vocabulary
2020-07-13 06:23:33,285: INFO: effective_min_count=10 retains 19717 unique words (26% of original 74065, drops 54348)
2020-07-13 06:23:33,286: INFO: effective_min_count=10 leaves 2854211 word corpus (95% of original 2988089, drops 133878)
2020-07-13 06:23:33,351: INFO: deleting the raw counts dictionary of 74065 items
2020-07-13 06:23:33

In [12]:
def get_features(words, model, num_features):
  """Average the embedding vectors of the in-vocabulary tokens of one review.

  Args:
    words: iterable of tokens (strings) for a single review.
    model: a trained Word2Vec model (its ``.wv`` lookup is used), or any
      object that itself supports ``token in obj`` and ``obj[token]``.
    num_features: length of each embedding vector.

  Returns:
    A 1-D array of length ``num_features`` holding the mean of the vectors of
    all tokens found in the vocabulary. If no token is in the vocabulary, an
    all-zero float32 vector is returned instead of dividing by zero.
  """
  # Look vectors up through `.wv`; indexing the model directly (model[w]) is
  # deprecated in gensim 3.x and removed in 4.x.
  vectors = getattr(model, 'wv', model)

  feature_vector = np.zeros((num_features,), dtype=np.float32)
  num_words = 0

  for w in words:
    # Membership test on the vectors object avoids rebuilding a vocabulary set on every call.
    if w in vectors:
      num_words += 1
      feature_vector = np.add(feature_vector, vectors[w])

  # Nothing matched: keep the zero vector rather than producing NaNs from 0 / 0.
  if num_words == 0:
    return feature_vector

  return np.divide(feature_vector, num_words)

In [13]:
def get_dataset(reviews, model, num_features):
  """Build the feature matrix for a collection of tokenised reviews.

  Each review is turned into one vector by ``get_features`` and the vectors
  are stacked row-wise, in the order the reviews were given.

  Args:
    reviews: iterable of token lists, one per review.
    model: embedding model forwarded to ``get_features``.
    num_features: embedding dimensionality forwarded to ``get_features``.

  Returns:
    The array produced by ``np.stack`` over the per-review vectors.
  """
  per_review = [get_features(tokens, model, num_features) for tokens in reviews]
  return np.stack(per_review)

In [14]:
import sys
# Major version number of the running interpreter; sys.version_info[0] is an int.
python_version = sys.version_info[0]

In [15]:
# If you're running on Colab, you'll need to install the What-if Tool package and authenticate
# If you're on Cloud AI Platform Notebooks, you'll need to install XGBoost on the TF instance
def pip_install(module):
    if python_version == '2':
        !pip install {module} --quiet
    else:
        !pip3 install {module} --quiet

try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    pip_install('witwidget')

    from google.colab import auth
    auth.authenticate_user()
else:
    pip_install('xgboost')

In [18]:
# Train an XGBoost classifier on the averaged Word2Vec review vectors.
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators = 100, n_jobs = -1, random_state = 2018)

# X: one averaged embedding per training review; y: the matching sentiment labels.
X= get_dataset(sentences, model, num_features)
y= np.array(sentiments)


# Fit on all training rows; this fitted `xgb` is what later predicts the test set.
%time xgb = xgb.fit(X,y)
# Mean ROC AUC across 10 cross-validation folds on the same X, y.
%time score = np.mean(cross_val_score(xgb, X, y, cv=10, scoring='roc_auc',verbose=5))

score

  # Remove the CWD from sys.path while we load stuff.


CPU times: user 1min 22s, sys: 130 ms, total: 1min 22s
Wall time: 41.7 s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] .................................... , score=0.920, total=  37.8s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   37.8s remaining:    0.0s


[CV] .................................... , score=0.921, total=  39.7s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.3min remaining:    0.0s


[CV] .................................... , score=0.930, total=  38.5s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min remaining:    0.0s


[CV] .................................... , score=0.916, total=  38.6s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.6min remaining:    0.0s


[CV] .................................... , score=0.919, total=  38.4s
[CV]  ................................................................
[CV] .................................... , score=0.915, total=  38.4s
[CV]  ................................................................
[CV] .................................... , score=0.917, total=  38.7s
[CV]  ................................................................
[CV] .................................... , score=0.929, total=  39.4s
[CV]  ................................................................
[CV] .................................... , score=0.907, total=  38.5s
[CV]  ................................................................
[CV] .................................... , score=0.917, total=  38.6s
CPU times: user 12min 41s, sys: 420 ms, total: 12min 42s
Wall time: 6min 26s


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  6.4min finished


0.9189764479999999

In [19]:
# Load the test reviews from the same Drive folder as the training file.
test_data = pd.read_csv('/content/drive/My Drive/딥러닝 실습/data_in/test_clean.csv')

# Review texts as a plain Python list.
test_review = list(test_data['review'])

In [20]:
# Tokenise every test review by splitting on whitespace; one token list per review.
test_sentences = [text.split() for text in test_review]

In [22]:
# Feature matrix for the test reviews (plays the role of X_test), built with the same Word2Vec model.
test_data_vecs1 = get_dataset(test_sentences, model, num_features) #X_test

  # Remove the CWD from sys.path while we load stuff.


In [23]:
# Predict a sentiment label for every test review vector.
test_predicted = xgb.predict(test_data_vecs1)

In [24]:
# Review identifiers, taken from the test frame in row order.
ids = list(test_data['id'])

In [25]:
# Pair each review id with its predicted sentiment.
answer_dataset = pd.DataFrame({'id': ids, 'sentiment': test_predicted})

In [26]:
# Directory that receives the prediction file; created if it does not exist yet.
out_dir = '/content/drive/My Drive/딥러닝 실습/data_out'
os.makedirs(out_dir, exist_ok=True)

# os.path.join supplies the path separator so the CSV lands inside data_out.
# quoting=3 is csv.QUOTE_NONE.
answer_dataset.to_csv(os.path.join(out_dir, 'xgb_w2v_answer.csv'), index=False, quoting=3)