# Word2Vec을 활용한 감성분석

## import

In [39]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import nltk.data
import logging
from gensim.models import word2vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
import time

## 데이터 로딩

In [31]:
# Load the labeled training set, the test set and the unlabeled training set.
# The files are tab-separated; quoting=3 is csv.QUOTE_NONE, so quote characters
# inside the review text are read as ordinary characters.
train = pd.read_csv("./data/bow/labeledTrainData.tsv", 
                    sep="\t", quoting=3)
test = pd.read_csv("./data/bow/testData.tsv", 
                    sep="\t", quoting=3)
# NOTE(review): unlabeled_train is not referenced again in the visible notebook.
unlabeled_train = pd.read_csv("./data/bow/unlabeledTrainData.tsv", 
                    sep="\t", quoting=3)

## word2vec 모델 로딩

In [2]:
# Load a previously saved Word2Vec model from the current working directory.
# The file name suggests 300 features / min word count 40 / context window 10,
# presumably the settings it was trained with -- verify against the training run.
model = Word2Vec.load("300features_40minwords_10context")

In [9]:
# Sanity check: dimensionality of a single word vector.
# Look the word up through model.wv; indexing the Word2Vec object directly is
# deprecated in gensim and emits a DeprecationWarning.
model.wv["flower"].shape

  """Entry point for launching an IPython kernel.


(300,)

In [24]:
# Shape of the full embedding matrix: (vocabulary size, vector dimensionality).
# `vectors` is the supported name for the matrix; the `syn0` alias is deprecated
# and emits a DeprecationWarning.
model.wv.vectors.shape

  """Entry point for launching an IPython kernel.


(16490, 300)

## word2vec 모델로 Feature들 추출

In [28]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of every in-vocabulary word in ``words``.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review / paragraph.
    model : object
        Trained Word2Vec-style model. It must expose ``model.wv.index2word``
        (the vocabulary) and support ``model.wv[word]`` vector lookup.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        Vector of shape ``(num_features,)`` holding the mean of the vectors of
        all tokens found in the vocabulary. When no token is in the vocabulary
        (or ``words`` is empty) the zero vector is returned instead of NaNs.
    """
    # Look words up through the KeyedVectors object; indexing the model
    # directly (model[word]) is deprecated.
    keyed_vectors = model.wv

    # A set gives O(1) membership tests for the vocabulary check below.
    vocabulary = set(keyed_vectors.index2word)

    # Running sum of the vectors, pre-initialised to zeros.
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    for word in words:
        if word in vocabulary:
            nwords += 1
            featureVec = np.add(featureVec, keyed_vectors[word])

    # Guard against 0/0: without it a review with no known words would produce
    # an all-NaN row, which downstream estimators reject.
    if nwords == 0:
        return featureVec

    return np.divide(featureVec, nwords)

In [29]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Build the matrix of averaged word vectors, one row per review.

    Parameters
    ----------
    reviews : sequence of list of str
        Tokenised reviews; ``len(reviews)`` fixes the number of rows.
    model : object
        Word-vector model, passed through unchanged to ``makeFeatureVec``.
    num_features : int
        Number of columns (word-vector dimensionality).

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(reviews), num_features)``.
    """
    total = len(reviews)

    # Allocate the whole result up front and fill it row by row.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")

    for position, tokens in enumerate(reviews):
        # Progress report on every 1000th review.
        if position % 1000 == 0:
            print("Review {} of {}".format(position, total))

        reviewFeatureVecs[position] = makeFeatureVec(tokens, model, num_features)

    return reviewFeatureVecs

In [32]:
def review_to_wordlist(review, remove_stopwords=False):
    """Convert one raw review into a list of lower-case words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When True, drop every word found in NLTK's English stop-word list.

    Returns
    -------
    list of str
        The review's words, lower-cased, containing only the letters a-z.
    """
    # 1. Strip HTML. The parser is named explicitly so the result does not
    #    depend on which optional parsers happen to be installed, and so that
    #    BeautifulSoup does not emit its "no parser was explicitly specified"
    #    warning on every call.
    review_text = BeautifulSoup(review, "html.parser").get_text()

    # 2. Replace every non-letter character with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Lower-case and split on whitespace.
    words = letters_only.lower().split()

    # 4. Optionally remove stop words (set lookup keeps the filter fast).
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words

## 학습 데이터(특징) 전처리

In [35]:
# Word2Vec hyper-parameters.
# NOTE(review): only num_features is referenced by later cells in the visible
# notebook; the other values are not used here -- presumably they record how
# the loaded model was trained. Confirm before relying on them.
num_features = 300 # Word vector dimensionality
min_word_count = 40 # Minimum word count
num_workers = 4 # Number of threads to run in parallel
context = 10 # Context window size
downsampling = 1e-3 # Downsample setting for frequent words

In [36]:
# Tokenise every training review (stop words removed), then average the
# word vectors of each review into one feature row.
clean_train_reviews = [
    review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in train["review"]
]

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print("Creating average feature vecs for test reviews")

# Same two steps for the test reviews.
clean_test_reviews = [
    review_to_wordlist(raw_review, remove_stopwords=True)
    for raw_review in test["review"]
]

testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)

Review 0 of 25000




Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
Creating average feature vecs for test reviews
Review 0 of 25000
Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 

## Random Forest로 학습

In [38]:
# Train a random forest on the averaged word vectors and predict the test set.
# random_state is pinned so that re-running the notebook grows the same forest
# and writes the same submission file.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

print("Fitting a random forest to labeled training data...")
forest = forest.fit(trainDataVecs, train["sentiment"])

result = forest.predict(testDataVecs)

# One row per test review: its id and the predicted sentiment label.
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})

# quoting=3 is csv.QUOTE_NONE: write the fields without quote characters.
output.to_csv("W2V_AvgVec.csv", index=False, quoting=3)

Fitting a random forest to labeled training data...


In [42]:
# Cluster the word vectors with k-means and time the run.
start = time.time()

# 1. Set "k", the number of clusters: 1/5th of the vocabulary size, i.e. an
#    average of 5 words per cluster. Integer division keeps k an int.
#    `vectors` is the supported name for the embedding matrix (the `syn0`
#    alias is deprecated and emits a warning).
word_vectors = model.wv.vectors
num_clusters = word_vectors.shape[0] // 5

# 2. Fit k-means and get the cluster index of every vocabulary word.
#    random_state is pinned so the cluster assignment is reproducible.
kmeans_clustering = KMeans(n_clusters=num_clusters, random_state=42)
idx = kmeans_clustering.fit_predict(word_vectors)

end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: {} seconds".format(elapsed))

  


Time taken for K Means clustering: 434.6488604545593 seconds


In [43]:
# Word -> cluster lookup table: pair each vocabulary word (in index order)
# with the cluster number k-means assigned to its vector.
word_centroid_map = {
    vocab_word: cluster_id
    for vocab_word, cluster_id in zip(model.wv.index2word, idx)
}

In [66]:
# Print the member words of the first ten clusters.
# Iterate the (word, cluster) pairs directly instead of rebuilding
# list(values()) / list(keys()) on every step, which made each cluster's scan
# quadratic in the vocabulary size. Dict order is preserved, so the printed
# word order is unchanged.
for cluster in range(10):
    print("Cluster {}".format(cluster))
    words = [
        word
        for word, assigned_cluster in word_centroid_map.items()
        if assigned_cluster == cluster
    ]
    print(words)

Cluster 0
['caesar', 'avery', 'hoskins', 'thornton', 'newhart', 'saget', 'cosby', 'bewitched', 'briggs']
Cluster 1
['whip', 'rusty', 'frog', 'roaring', 'sheets', 'crushing', 'slug', 'facade', 'veil', 'pencil', 'elmo', 'cement', 'bolt', 'pancake', 'bubbles', 'roar', 'cleaver', 'booty', 'wool', 'puddle', 'sweating', 'hose', 'frying']
Cluster 2
['divide']
Cluster 3
['messages', 'symbols', 'meanings', 'metaphors', 'mechanics']
Cluster 4
['rebecca', 'daisy', 'andrea', 'paula', 'cindy', 'esther', 'alicia', 'alison', 'clara', 'lena', 'fiona', 'sadie', 'stacy', 'cecilia', 'teresa', 'erin', 'raines', 'pauline', 'iris', 'lori', 'suzanne', 'cheryl', 'mona', 'constance', 'nell', 'lindsey', 'cassie', 'flora', 'hemingway', 'hazel', 'vicki', 'abigail', 'marjorie', 'vicky', 'jen', 'dominique', 'delilah', 'ella', 'violet', 'blanche', 'hagen', 'lilly', 'margo', 'susie', 'andr', 'sobieski', 'lizzie', 'becky', 'libby', 'yvette', 'alonso', 'bai']
Cluster 5
['trained', 'protected', 'cured', 'naming', 'awake

In [83]:
def create_bag_of_centroids(wordlist, word_centroid_map, num_centroids=None):
    """Count how many words of a review fall into each word cluster.

    Parameters
    ----------
    wordlist : iterable of str
        Tokens of a single review.
    word_centroid_map : dict
        Maps a vocabulary word to the integer index of its cluster.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is inferred as the
        largest cluster index in ``word_centroid_map`` plus one (the original
        behaviour). Pass it explicitly to avoid rescanning the map on every
        call and to guarantee a fixed length even if the highest-numbered
        cluster has no words.

    Returns
    -------
    numpy.ndarray
        float32 vector of shape ``(num_centroids,)``; element ``i`` is the
        number of tokens in ``wordlist`` that belong to cluster ``i``. Tokens
        missing from ``word_centroid_map`` are ignored.

    Raises
    ------
    IndexError
        If an explicit ``num_centroids`` is not larger than a cluster index
        encountered in ``wordlist``.
    """
    if num_centroids is None:
        # default=-1 makes an empty map yield a zero-length bag instead of
        # raising ValueError from max() on an empty sequence.
        num_centroids = max(word_centroid_map.values(), default=-1) + 1

    bag_of_centroids = np.zeros(num_centroids, dtype="float32")

    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1

    return bag_of_centroids

In [87]:
# Build bag-of-centroids features: one row per review, one column per cluster.
train_centroids = np.zeros(
    (train["review"].size, num_clusters), dtype="float32")

for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

test_centroids = np.zeros(
    (test["review"].size, num_clusters), dtype="float32")

for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

# Train a random forest on the cluster counts and predict the test set.
# random_state is pinned so that re-running the notebook grows the same forest
# and writes the same submission file.
forest = RandomForestClassifier(n_estimators=100, random_state=42)

print("Fitting a random forest to labeled training data...")

forest = forest.fit(train_centroids, train["sentiment"])

result = forest.predict(test_centroids)

# One row per test review: its id and the predicted sentiment label.
output = pd.DataFrame(data={"id": test["id"],
                            "sentiment": result})

# quoting=3 is csv.QUOTE_NONE: write the fields without quote characters.
output.to_csv("BagOfCentroids.csv", index=False, quoting=3)

Fitting a random forest to labeled training data...
