In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

from bs4 import BeautifulSoup
import re
import time

from nltk.corpus import stopwords
import nltk.data

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
train = pd.read_csv('labeledTrainData.tsv', header=0, sep='\t', quoting=3)
test = pd.read_csv('testData.tsv', header=0, sep='\t', quoting=3)
unlabeled_train = pd.read_csv('unlabeledTrainData.tsv', header=0, sep='\t', quoting=3)

print(train.shape)
print(test.shape)
print(unlabeled_train.shape)

(25000, 3)
(25000, 2)
(50000, 2)


In [8]:
model = Word2Vec.load("300features_40minwords_10context")

In [9]:
model.wv.load_word2vec_format

<bound method Word2VecKeyedVectors.load_word2vec_format of <class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>>

In [10]:
model.wv.index2word == model.wv.index2entity

True

In [11]:
model['doctor'].shape

  """Entry point for launching an IPython kernel.


(300,)

## Review 의 평균 벡터 구하기


In [12]:
# word 로 구성된 리스트를 넣었을 때, word 의 vector 평균을 구하는 함수 (입력값: 하나의 sentence)
def makeFeatureVec(words, model, num_features):
    featureVec = np.zeros((num_features,), dtype='float32')
    nwords = 0
    vector_words = set(model.wv.index2word)
    for word in words:
        if word in vector_words:
            nwords = nwords+1
            featureVec = np.add(featureVec, model[word])
    featureVec = np.divide(featureVec, nwords)
    return featureVec


In [18]:
# sentence 로 구성된 리스트를 넣었을 때, sentence 의 벡터 평균을 구하는 함수
def reviewFeatureVec(reviews, model, num_features):
    reviewFeatureVec = np.zeros((len(reviews),num_features))
    counter = 0
    for review in reviews:
        if counter%1000==0:
            print("Review %d of %d" % (counter, len(reviews)))
        reviewFeatureVec[int(counter)] =  makeFeatureVec(review, model, num_features)
        counter = counter+1
    return reviewFeatureVec

In [14]:
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

In [15]:
from kaggle_w2v import KaggleWord2VecUtility

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 멀티스레드로 4개의 워커를 사용해 처리한다.
def getCleanReviews(reviews):
    clean_reviews = []
    clean_reviews = KaggleWord2VecUtility.apply_by_multiprocessing(\
        reviews["review"], KaggleWord2VecUtility.review_to_wordlist,\
        workers=4)
    return clean_reviews

In [17]:
%time trainDataVecs = reviewFeatureVec(getCleanReviews(train), model, num_features )

Review 0 of 25000


  if __name__ == '__main__':


Review 100 of 25000
Review 200 of 25000
Review 300 of 25000
Review 400 of 25000
Review 500 of 25000
Review 600 of 25000
Review 700 of 25000
Review 800 of 25000
Review 900 of 25000
Review 1000 of 25000
Review 1100 of 25000
Review 1200 of 25000
Review 1300 of 25000
Review 1400 of 25000
Review 1500 of 25000
Review 1600 of 25000
Review 1700 of 25000
Review 1800 of 25000
Review 1900 of 25000
Review 2000 of 25000
Review 2100 of 25000
Review 2200 of 25000
Review 2300 of 25000
Review 2400 of 25000
Review 2500 of 25000
Review 2600 of 25000
Review 2700 of 25000
Review 2800 of 25000
Review 2900 of 25000
Review 3000 of 25000
Review 3100 of 25000
Review 3200 of 25000
Review 3300 of 25000
Review 3400 of 25000
Review 3500 of 25000
Review 3600 of 25000
Review 3700 of 25000
Review 3800 of 25000
Review 3900 of 25000
Review 4000 of 25000
Review 4100 of 25000
Review 4200 of 25000
Review 4300 of 25000
Review 4400 of 25000
Review 4500 of 25000
Review 4600 of 25000
Review 4700 of 25000
Review 4800 of 25000
R

In [20]:
%time testDataVecs = reviewFeatureVec(getCleanReviews(test), model, num_features )

Review 0 of 25000


  if __name__ == '__main__':


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
CPU times: user 1min 23s, sys: 1.57 s, total: 1min 24s
Wall time: 2min 28s


## RandomForest

In [22]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100)

forest = forest.fit(trainDataVecs, train['sentiment'])

In [23]:
result = forest.predict(testDataVecs)

In [28]:
output = pd.DataFrame(data = {'id':test['id'], 'sentiment':result})
output.to_csv('part3-FeatureVecAvg.csv', header=0, quoting=3)

## Score: 0.78072

# Attempt 2: Clustering

방법 설명: 
Word2Vec 으로 구한 단어 벡터를 바탕으로 클러스터를 구성한 후, 각 Review 의 단어가 해당하는 클러스터 개수를 세어서 BOW 모델로 Feature 를 만든다. 만들어진 Feature 와 Training Data 의 Sentiment 로 Supervised Learning 한다. 

따라서 진행하는 순서는 아래와 같다.
1. KMeans 모델을 사용하여 word vector 클러스터 학습. 평균적으로 한 클러스터당 5개의 단어가 오도록 한다.
2. 단어와 단어가 해당하는 클러스터 idx 의 정보를 가지고 있는 dictionary 생성
3. training data 의 각 review 별로 구성하는 단어의 클러스터 idx 의 BOW 벡터 생성
4. 클러스터의 BOW 와 training data 의 sentiment 로 랜덤포레스트 모델 학습
5. 4번에서 생성한 모델로 predict 

In [39]:
from sklearn.cluster import KMeans

num_clusters = int(len(model.wv.index2word)/5)
kmeans = KMeans(n_clusters = num_clusters)

In [40]:
word_vectors = model.wv.syn0

start = time.time()
idx = kmeans.fit_predict(word_vectors)
end = time.time()
elapsed = end-start
print('Time taken for KMeans Clustering: ', elapsed)

  """Entry point for launching an IPython kernel.


Time taken for KMeans Clustering:  595.7671039104462


In [42]:
# 2. 단어와 단어가 해당하는 클러스터 idx 의 정보를 가지고 있는 dictionary 생성
idx = list(idx)
words_lst = model.wv.index2word
word_centroid_map = {words_lst[i]:idx[i] for i in range(len(words_lst))}

In [44]:
# 클러스터 확인
for no_cluster in range(0,10):
    print('Cluser number: {}'.format(no_cluster))
    words = []
    for i in range(0, len(list(word_centroid_map.values()))):
        if (list(word_centroid_map.values())[i]==no_cluster):
            words.append(list(word_centroid_map.keys())[i])
    print(words)
    

Cluser number: 0
['prejudice', 'rebellion', 'equality', 'discrimination', 'pretext']
Cluser number: 1
['noticeably']
Cluser number: 2
['sinatra', 'warren', 'beatty', 'oates', 'mchugh']
Cluser number: 3
['haunted', 'owned', 'visited', 'spotted']
Cluser number: 4
['hapless', 'pesky']
Cluser number: 5
['hare', 'mal', 'venom', 'sabu', 'veidt']
Cluser number: 6
['roman', 'frontier', 'colonial', 'geisha', 'crusades', 'royalty', 'carandiru', 'geography', 'predominantly', 'postwar', 'continental', 'mayan', 'heartland', 'province', 'warsaw']
Cluser number: 7
['implications', 'contents']
Cluser number: 8
['whistle', 'hut', 'peach']
Cluser number: 9
['break', 'fly', 'block', 'sink', 'lift', 'climb', 'ladder', 'crawl', 'penetrate']


In [45]:
# 3. training data 의 각 review 별로 구성하는 단어의 클러스터 idx 의 BOW 벡터 생성

# BOW 를 계산하도록 각 review 를 단어의 리스트로 만든다
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))

clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, remove_stopwords=True))

In [46]:
# 단어로 구성된 리스트를 넣으면 단어가 해당된 클러스터 기준으로 BOW 결과를 출력하는 함수
def create_bag_of_centroids(wordlist, word_centroid_map):
    num_centroids = max(word_centroid_map.values())+1
    bag_of_centroids = np.zeros(num_centroids, dtype='float32')
    
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [48]:
train_centroids = np.zeros((train['review'].size, num_clusters))
test_centroids = np.zeros((test['review'].size, num_clusters))

In [50]:
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1

    
counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids(review, word_centroid_map)
    counter += 1

In [52]:
# 4. 클러스터의 BOW 와 training data 의 sentiment 로 랜덤포레스트 모델 학습

cluster_forest = RandomForestClassifier(n_estimators=100)
cluster_forest.fit(train_centroids, train['sentiment'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [53]:
# 5. 4번에서 생성한 모델로 predict 
cluster_result = cluster_forest.predict(test_centroids)
cluster_output = pd.DataFrame(data= {'id': test['id'], 'sentiment': cluster_result})

In [57]:
output.shape

(25000, 2)

In [58]:
cluster_output.to_csv('part3_ClusteringBOW.csv', index=False, quoting=3)

## Score: 0.81380