# 在word2vec上训练情感分析模型

In [1]:
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

In [2]:
def load_dataset(name,nrows=None):
    datasets = {
        'unlabeled_train':'unlabeledTrainData.tsv',
        'labeled_train': 'labeledtrainData.tsv',
        'test':'testData.tsv'
    }
    if name not in datasets:
        raise ValueError(name)
    data_file = os.path.join('data', datasets[name])
    df = pd.read_csv(data_file,sep='\t', escapechar='\\', nrows=nrows)
    print('Number of reviews:{}'.format(len(df)))
    return df

In [3]:
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words


## 读入之前训练好的Word2Vec模型

In [4]:
model_name='300features_40minwords_10context.model'
model= Word2Vec.load(os.path.join('models',model_name))

## 我们可以根据wors2vec的结果去对影评文本进行编码
编码方式有一点粗暴，简单来说就是把这句话中的词的词向量做平均

In [5]:
df = load_dataset('labeled_train')
df.head()

Number of reviews:25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [6]:
def to_review_vector(review):
    words = clean_text(review,remove_stopwords=True)
    array = np.array([model[w] for w in words if w in model])
    return pd.Series(array.mean(axis=0))

In [7]:
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.002678,0.020226,-0.003308,-0.001329,-0.000429,0.017054,0.003922,-0.004634,-0.006502,0.018565,...,0.011296,0.007067,0.01127,-0.010123,0.006665,-0.002812,-0.010243,-0.008943,-0.002214,0.003349
1,0.011195,0.011533,-0.017525,-0.000643,0.009489,0.00793,0.01459,0.004758,-0.024102,0.017199,...,0.016904,-0.016082,0.010846,-0.005245,0.019172,0.016902,-0.007558,0.01417,-0.021832,-0.00106
2,0.009163,0.005282,0.004513,0.012272,0.016611,0.023836,0.021637,-0.023631,0.010869,0.014053,...,0.00362,0.006235,0.008987,-0.010943,-9.5e-05,0.007353,-0.002674,0.001677,-0.017052,0.029876
3,0.005886,0.002566,-0.003322,0.009299,0.020795,0.030984,0.013597,0.004954,-0.003573,0.025327,...,0.017623,-0.006736,0.007264,-0.021909,0.013897,0.003579,-0.005561,-0.010516,-0.01233,0.008115
4,0.010639,-0.002652,0.007499,0.019073,0.005217,0.015277,0.021269,-0.011926,0.007815,0.005509,...,0.002221,0.017206,0.005844,-0.000741,0.011358,-0.007292,-0.008853,-0.01111,-0.011974,0.019022


In [8]:
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

## 在训练集上试试，确保模型能正常work

In [9]:
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

## 清理占用内容的变量

In [10]:
del df
del train_data_features

## 预测测试集结果并上传kaggle

In [11]:
df = load_dataset('test')
df.head()

Number of reviews:25000


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [12]:
test_data_features = df.review.apply(to_review_vector)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.037929,0.016091,0.010187,-0.001538,0.011482,0.026469,-0.014943,0.021288,-0.008064,0.017148,...,0.018953,-0.020444,0.005184,0.000263,0.025891,0.009896,-0.006365,0.010164,-0.015446,0.021002
1,0.007418,0.015204,-0.012244,0.008895,-0.003015,0.013872,-0.005526,-0.001078,-0.014284,0.029322,...,0.009556,-0.00283,0.006463,0.003681,0.008423,0.007033,-0.017247,0.011278,-0.011668,0.008659
2,0.002427,0.012261,-0.019166,0.002626,0.010211,0.02521,-0.004151,0.015909,-0.008532,0.032671,...,0.018945,-0.004627,0.00212,-0.00655,0.013444,-3.3e-05,-0.012826,0.005978,-0.014659,0.002498
3,-0.011235,0.014808,-0.000349,0.012653,0.004455,0.023097,0.004729,-0.020141,-0.010708,0.029956,...,0.000519,0.005217,0.011129,-0.017832,0.006465,0.005332,-0.017498,-0.001463,-0.011936,0.013929
4,-0.006166,0.011806,0.002399,0.005566,0.022578,0.016825,-0.009955,-0.018607,-0.013055,0.010941,...,0.018135,0.004121,0.00758,-0.004052,0.015065,0.005904,-0.011766,0.004247,-0.018519,0.010808


In [13]:
result = forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id,'sentiment':result})
output.to_csv(os.path.join('data', 'Word2Vec_model.csv'), index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,0
4,12128_7,1


In [14]:
del df
del test_data_features
del forest

## 对词向量进行聚类研究和编码
使用kmeans进行聚类

In [15]:
word_vector = model.wv.syn0
num_cluster = word_vector.shape[0]

In [16]:
%%time

kmeans_clustering = KMeans(n_clusters=num_cluster,n_jobs=4)
idx = kmeans_clustering.fit_predict(word_vector)

CPU times: user 1.78 s, sys: 2.73 s, total: 4.51 s
Wall time: 22min 11s


In [31]:
word_centroid_map = dict(zip(model.wv.index2word,idx))

In [33]:
import pickle

filename='word_centroid_map_10avg.pickle'
with open(os.path.join('models',filename), 'bw') as f:
    pickle.dump(word_centroid_map,f)

## 输出一些clusters看

In [38]:
for cluster in range(0,10):
    print('\nCluster %d' % cluster)
    print([w for w,c in word_centroid_map.items() if c==cluster])


Cluster 0
['spears']

Cluster 1
['morris']

Cluster 2
['illustrate']

Cluster 3
['atlantic']

Cluster 4
['masterful']

Cluster 5
['lair']

Cluster 6
['accepting']

Cluster 7
['deliciously']

Cluster 8
['memory']

Cluster 9
['mcdormand']


## 把评论数据转成cluster bag vectors

In [54]:
wordset = set(word_centroid_map.keys())

def make_cluster_bag(review):
    words = clean_text(review,remove_stopwords=True)
    return(pd.Series([word_centroid_map[w] for w in words if w in wordset])
           .value_counts()
           .reindex(range(num_cluster+1),fill_value=0))


In [55]:
df = load_dataset('labeled_train')
df.head()

Number of reviews:25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [56]:
train_data_features = df.review.apply(make_cluster_bag)
train_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13047,13048,13049,13050,13051,13052,13053,13054,13055,13056
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, df.sentiment)

In [58]:
confusion_matrix(df.sentiment, forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])

In [59]:
del df
del train_data_features

In [60]:
df = load_dataset('test')
df.head()

Number of reviews:25000


Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [61]:
test_data_features=df.review.apply(make_cluster_bag)
test_data_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13047,13048,13049,13050,13051,13052,13053,13054,13055,13056
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
result=forest.predict(test_data_features)
output = pd.DataFrame({'id':df.id,'sentiment':result})
output.to_csv(os.path.join('data','Word2Vec_BagOfClusters.csv'),index=False)
output.head()

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,1
3,7186_2,1
4,12128_7,1


In [63]:
del df
del test_data_features
del forest