In [20]:
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import nltk.data
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec


In [11]:
def load_dataset(name, nrows=None):
    datasets={
        'unlabeled_train':'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test':'testData.tsv'
    }
    if name not in datasets:
        raise ValueErroe(name)
    data_file = os.path.join('data',datasets[name])
    df = pd.read_csv(data_file,sep='\t', escapechar='\\', nrows=nrows)
    print('Number of review:{}'.format(len(df)))
    return df

## 读入无标签数据
用于训练生成word2vec词向量

In [13]:
df = load_dataset('unlabeled_train')
df.head()

Number of review:50000


Unnamed: 0,id,review
0,9999_0,"Watching Time Chasers, it obvious that it was ..."
1,45057_0,I saw this film about 20 years ago and remembe...
2,15561_0,"Minor Spoilers<br /><br />In New York, Joan Ba..."
3,7161_0,I went to see this film with a great deal of e...
4,43971_0,"Yes, I agree with everyone on this site this m..."


In [25]:
eng_stopwords=set(stopwords.words('english'))

def clean_text(text,remove_stopwords=False):
    text=BeautifulSoup(text,'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ',text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def print_call_counts(f):
    n=0
    def wrapped(*args, **kwargs):
        nonlocal n
        n +=1
        if n%1000==1:
            print('method {} called {} times'.format(f.__name__, n))
        return f(*args, **kwargs)
    return wrapped

@print_call_counts
def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences


In [38]:
%time sentences = sum(df.review.apply(split_sentences),[])
print('{}review-> {} sentences'.format(len(df),len(sentences)))

method split_sentences called 350001 times


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)


method split_sentences called 351001 times
method split_sentences called 352001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 353001 times
method split_sentences called 354001 times
method split_sentences called 355001 times
method split_sentences called 356001 times
method split_sentences called 357001 times
method split_sentences called 358001 times
method split_sentences called 359001 times
method split_sentences called 360001 times
method split_sentences called 361001 times
method split_sentences called 362001 times
method split_sentences called 363001 times
method split_sentences called 364001 times
method split_sentences called 365001 times
method split_sentences called 366001 times
method split_sentences called 367001 times
method split_sentences called 368001 times
method split_sentences called 369001 times
method split_sentences called 370001 times
method split_sentences called 371001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 372001 times
method split_sentences called 373001 times
method split_sentences called 374001 times
method split_sentences called 375001 times
method split_sentences called 376001 times
method split_sentences called 377001 times
method split_sentences called 378001 times
method split_sentences called 379001 times
method split_sentences called 380001 times
method split_sentences called 381001 times
method split_sentences called 382001 times
method split_sentences called 383001 times
method split_sentences called 384001 times
method split_sentences called 385001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 386001 times
method split_sentences called 387001 times
method split_sentences called 388001 times
method split_sentences called 389001 times
method split_sentences called 390001 times
method split_sentences called 391001 times
method split_sentences called 392001 times
method split_sentences called 393001 times
method split_sentences called 394001 times
method split_sentences called 395001 times
method split_sentences called 396001 times
method split_sentences called 397001 times
method split_sentences called 398001 times


  ' that document to Beautiful Soup.' % decoded_markup


method split_sentences called 399001 times
CPU times: user 7min 14s, sys: 36.9 s, total: 7min 51s
Wall time: 8min
50000review-> 537851 sentences


In [39]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [44]:
num_features = 300
min_word_count=40
num_workers=4
context=10
downsampling = 1e-3

model_name='{}features_{}minwords_{}context.model'.format(num_features,min_word_count,context)

In [48]:
print('Training model...')
model =Word2Vec(sentences, workers=num_workers,\
                         size=num_features,min_count=min_word_count, \
                         window = context, sample=downsampling)

model.init_sims(replace=True)

2018-06-17 22:41:12,689 : INFO : collecting all words and their counts
2018-06-17 22:41:12,692 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-17 22:41:12,752 : INFO : PROGRESS: at sentence #10000, processed 225072 words, keeping 17237 word types
2018-06-17 22:41:12,820 : INFO : PROGRESS: at sentence #20000, processed 443536 words, keeping 24570 word types
2018-06-17 22:41:12,883 : INFO : PROGRESS: at sentence #30000, processed 666343 words, keeping 29785 word types


Training model...


2018-06-17 22:41:12,947 : INFO : PROGRESS: at sentence #40000, processed 886903 words, keeping 33939 word types
2018-06-17 22:41:13,020 : INFO : PROGRESS: at sentence #50000, processed 1103863 words, keeping 37503 word types
2018-06-17 22:41:13,093 : INFO : PROGRESS: at sentence #60000, processed 1327231 words, keeping 40738 word types
2018-06-17 22:41:13,159 : INFO : PROGRESS: at sentence #70000, processed 1550828 words, keeping 43603 word types
2018-06-17 22:41:13,228 : INFO : PROGRESS: at sentence #80000, processed 1772824 words, keeping 46155 word types
2018-06-17 22:41:13,287 : INFO : PROGRESS: at sentence #90000, processed 1987492 words, keeping 48328 word types
2018-06-17 22:41:13,350 : INFO : PROGRESS: at sentence #100000, processed 2210772 words, keeping 50551 word types
2018-06-17 22:41:13,414 : INFO : PROGRESS: at sentence #110000, processed 2435496 words, keeping 52762 word types
2018-06-17 22:41:13,479 : INFO : PROGRESS: at sentence #120000, processed 2658449 words, keepin

2018-06-17 22:41:33,280 : INFO : PROGRESS: at 17.52% examples, 454743 words/s, in_qsize 7, out_qsize 0
2018-06-17 22:41:34,299 : INFO : PROGRESS: at 18.74% examples, 456906 words/s, in_qsize 7, out_qsize 0
2018-06-17 22:41:35,305 : INFO : PROGRESS: at 19.85% examples, 457614 words/s, in_qsize 8, out_qsize 0
2018-06-17 22:41:36,327 : INFO : PROGRESS: at 21.02% examples, 458972 words/s, in_qsize 8, out_qsize 1
2018-06-17 22:41:37,338 : INFO : PROGRESS: at 22.19% examples, 460442 words/s, in_qsize 8, out_qsize 0
2018-06-17 22:41:38,358 : INFO : PROGRESS: at 23.36% examples, 461279 words/s, in_qsize 7, out_qsize 0
2018-06-17 22:41:39,364 : INFO : PROGRESS: at 24.53% examples, 462593 words/s, in_qsize 7, out_qsize 0
2018-06-17 22:41:40,368 : INFO : PROGRESS: at 25.74% examples, 464445 words/s, in_qsize 8, out_qsize 0
2018-06-17 22:41:41,394 : INFO : PROGRESS: at 26.90% examples, 464861 words/s, in_qsize 7, out_qsize 0
2018-06-17 22:41:42,400 : INFO : PROGRESS: at 28.15% examples, 467274 wor

In [49]:
model.save(os.path.join('models',model_name))

2018-06-17 22:43:45,963 : INFO : saving Word2Vec object under models/300features_40minwords_10context.model, separately None
2018-06-17 22:43:45,966 : INFO : not storing attribute syn0norm
2018-06-17 22:43:45,968 : INFO : not storing attribute cum_table
2018-06-17 22:43:46,543 : INFO : saved models/300features_40minwords_10context.model


In [50]:
print(model.doesnt_match('man woman child kitchen'.split()))

kitchen


In [51]:
model.most_similar('man')

[('woman', 0.6300984621047974),
 ('lady', 0.601123571395874),
 ('lad', 0.5779431462287903),
 ('guy', 0.5540357828140259),
 ('soldier', 0.5244459509849548),
 ('person', 0.5195951461791992),
 ('boy', 0.5006827712059021),
 ('chap', 0.49829626083374023),
 ('widow', 0.4941798150539398),
 ('men', 0.49312081933021545)]

In [52]:
model.most_similar('awful')

[('terrible', 0.779263973236084),
 ('horrible', 0.7369534969329834),
 ('atrocious', 0.7309613823890686),
 ('abysmal', 0.721771240234375),
 ('dreadful', 0.6979221105575562),
 ('horrid', 0.6907047033309937),
 ('horrendous', 0.677790641784668),
 ('appalling', 0.6498734354972839),
 ('lousy', 0.6388899087905884),
 ('amateurish', 0.632544219493866)]