# Gensim 文本处理

In [37]:
import pandas as pd
import numpy as np
import pickle
import random
import re
import os
from pprint import pprint

from time import time

import gensim
from gensim.models import Word2Vec, Doc2Vec
from gensim import utils

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from collections import defaultdict
from smart_open import open
from six import iteritems

In [38]:
txt_path = 'https://radimrehurek.com/gensim/mycorpus.txt'

# 小数据处理

## 文本
list

In [39]:
def preprocessor1(text):
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower())
    # text = text.split()
    return text
token_review = [preprocessor1(i) for i in open(txt_path) ]

In [42]:
token_review[0]

'human machine interface for lab abc computer applications '

## 停用词

In [40]:
custom_stopwords = [] # 自定义停用词列表
STOPWORDS = stopwords.words("english") + list(ENGLISH_STOP_WORDS) + custom_stopwords
STOPWORDS = list(set(STOPWORDS))

## 分词 转小写 删除停用词

In [43]:
texts = [[word for word in document.lower().split() if word not in STOPWORDS]
         for document in token_review]

## 词频计算

留下出现次数大于X的词

In [44]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

X = 1
processed_corpus = [[token for token in text if frequency[token] > X] for text in texts]
pprint(processed_corpus)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'response', 'time'],
 ['eps', 'user', 'interface'],
 ['human', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


## 单词与整数ID的map

In [45]:
dictionary = gensim.corpora.Dictionary(processed_corpus)
dictionary.save('./idx2words_map.dict')
print(dictionary)
num_features = 11

Dictionary(11 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)


In [46]:
dictionary.dfs

{1: 2, 2: 2, 0: 2, 4: 2, 6: 3, 3: 2, 5: 2, 7: 2, 8: 3, 9: 3, 10: 2}

给所有词指定唯一的id

In [47]:
pprint(dictionary.token2id)

{'computer': 0,
 'eps': 7,
 'graph': 9,
 'human': 1,
 'interface': 2,
 'minors': 10,
 'response': 3,
 'survey': 4,
 'time': 5,
 'trees': 8,
 'user': 6}


用doc2bow字典的方法为文档创建词袋表示法：

每个元组中的第一个条目对应于字典中令牌的ID，第二个条目对应于此令牌的计数。

In [48]:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
gensim.corpora.MmCorpus.serialize('./texts_bow.mm', bow_corpus)
pprint(bow_corpus)

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (6, 1), (7, 1)],
 [(1, 1), (7, 1)],
 [(3, 1), (5, 1), (6, 1)],
 [(8, 1)],
 [(8, 1), (9, 1)],
 [(8, 1), (9, 1), (10, 1)],
 [(4, 1), (9, 1), (10, 1)]]


## 训练tf-idf模型

In [49]:
tfidf = gensim.models.TfidfModel(bow_corpus)

该tfidf模型再次返回一个元组列表，其中第一个条目是令牌ID，第二个条目是tf-idf权重。

In [50]:
print(tfidf[bow_corpus[0]])

[(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)]


## 相似性查询

In [51]:
index = gensim.similarities.SparseMatrixSimilarity(tfidf[bow_corpus],num_features=num_features)
query_bow = bow_corpus[0]
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

[(0, 0.99999994), (1, 0.27115756), (2, 0.36272496), (3, 0.40824828), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [52]:
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

0 0.99999994
3 0.40824828
2 0.36272496
1 0.27115756
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0


# 大数据

## 语料流
每行一个文本 词间以空格隔开

语料库不加载到RAM，因为一次最多只有一个向量驻留在RAM中。语料库现在可以随您想要的大小而变大。

In [53]:
class MyCorpusRaw(object):
    def __iter__(self):
        for line in open(txt_path):
            yield line.lower().split()

corpus_raw_memory_friendly = MyCorpusRaw()
dictionary_ = gensim.corpora.Dictionary(corpus_raw_memory_friendly)

In [54]:
custom_stopwords = [] # 自定义停用词列表
STOPWORDS = stopwords.words("english") + list(ENGLISH_STOP_WORDS) + custom_stopwords
STOPWORDS = list(set(STOPWORDS))
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in STOPWORDS
    if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary_.dfs) if docfreq == 1]
dictionary_.filter_tokens(stop_ids + once_ids)
dictionary_.compactify()

In [55]:
class MyCorpus(object):
    def __iter__(self):
        for line in open(txt_path):
            yield dictionary_.doc2bow(line.lower().split())
corpus_memory_friendly = MyCorpus()
for vector in corpus_memory_friendly:
    print(vector)

[(0, 1), (1, 1), (2, 1)]
[(0, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]
[(2, 1), (7, 1), (9, 1), (10, 1), (11, 1)]
[(1, 1), (4, 1), (7, 2), (10, 1), (12, 1)]
[(4, 1), (5, 1), (8, 1), (9, 1)]
[(4, 1), (11, 1), (13, 1)]
[(4, 1), (11, 1), (13, 1), (14, 1)]
[(4, 1), (12, 1), (13, 1), (14, 1), (15, 1)]
[(3, 1), (6, 1), (14, 1), (15, 1)]


## word2vec

In [65]:
def preprocessor2(text):
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text.split()
    return text
token_review = [preprocessor2(i) for i in open(txt_path) ]

In [69]:
vector_dim = 200
model = gensim.models.Word2Vec(token_review, min_count=2, size=vector_dim)
#model.train(sentences,total_examples=len(sentences),epochs=10)
model.save('word2vectors.bin')
model.wv.save_word2vec_format('word2vectors.txt', binary=False)

相似词

In [70]:
model.wv.most_similar(positive="human",topn=6)

[('of', 0.10442149639129639),
 ('response', 0.0843486338853836),
 ('system', 0.05847841501235962),
 ('trees', 0.0459064245223999),
 ('survey', 0.04115157574415207),
 ('time', 0.030271658673882484)]

In [71]:
w1 = ["human",'computer']
w2 = ['time'] # 排除
model.wv.most_similar(positive=w1,negative=w2,topn=10)

[('of', 0.12239014357328415),
 ('response', 0.07446271181106567),
 ('survey', 0.039067476987838745),
 ('graph', 0.02384733408689499),
 ('user', 0.0014887526631355286),
 ('system', -0.0035650990903377533),
 ('trees', -0.019153958186507225),
 ('a', -0.033130671828985214),
 ('minors', -0.04246136546134949),
 ('the', -0.05228729918599129)]

相似度

In [72]:
model.wv.similarity(w1="human",w2="user")

0.0036251114

非同类

In [73]:
model.wv.doesnt_match(["human","user","trees"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'human'

vector 向量

In [74]:
model.wv["trees"]

array([ 2.3988045e-03,  1.2539937e-03,  6.9262099e-04,  4.9847364e-04,
       -1.8678217e-03, -8.5406253e-05, -2.3130092e-03,  1.0084164e-03,
       -8.6981483e-04,  1.5369804e-03,  2.4022781e-03, -2.1757684e-03,
       -2.1571585e-04,  1.3833324e-03, -9.9270116e-04, -2.0988514e-03,
       -2.1577813e-03,  2.4105872e-03,  8.2103058e-04,  2.0665555e-04,
        1.4734926e-03, -1.7346102e-03,  4.8905431e-04,  6.4237748e-04,
        1.8797844e-03,  1.0360393e-03,  8.0722343e-04, -2.3487874e-03,
       -2.3470134e-03,  2.1651248e-03,  1.2582765e-03, -7.2150916e-04,
        1.0646669e-03,  1.6872594e-03, -2.1409900e-03,  7.5836171e-04,
       -6.7712233e-04, -1.7152717e-03,  4.4151231e-05, -1.0926957e-03,
        1.7349893e-03, -1.4390079e-03, -1.6277698e-03,  2.3250822e-03,
        1.5569222e-03,  1.3235764e-03, -2.0074495e-03, -1.8302349e-03,
       -2.0173925e-03, -1.4989993e-03,  7.2281488e-04, -6.9727673e-04,
        9.7435206e-04, -1.5983926e-04, -2.0526850e-03,  1.5874363e-03,
      

## 预训练模型

In [75]:
dims = [100, 600]
for size in dims:
    #instantiate our  model
    model_w2v = Word2Vec(min_count=2, window=5, size=size, sample=1e-3, negative=5, workers=4, sg=0)

    #build vocab over all reviews
    model_w2v.build_vocab(token_review)

    #We pass through the data set multiple times, shuffling the training reviews each time to improve accuracy.
    Idx=list(range(len(token_review)))

    t0 = time()
    for epoch in range(5):
         print(epoch+1, "/5 epochs")
         random.shuffle(Idx)
         perm_sentences = [token_review[i] for i in Idx]
         model_w2v.train(perm_sentences,total_examples=len(Idx), epochs = 1)

    elapsed=time() - t0
    print("Time taken for Word2vec training: ", elapsed/60, " (mins).")

    # saves the word2vec model to be used later.
    #model_w2v.save('./model_word2vec_skipgram_300dim')

    # open a saved word2vec model
    #model_w2v=gensim.models.Word2Vec.load('./model_word2vec')

    model_w2v.wv.save_word2vec_format('./data/model_word2vec_v2_%ddim.txt'%size, binary=False)
    print("similar words of 'human'", model_w2v.wv.most_similar('human') )

1 /5 epochs
2 /5 epochs
3 /5 epochs
4 /5 epochs
5 /5 epochs
Time taken for Word2vec training:  0.0003178755442301432  (mins).
similar words of 'human' [('computer', 0.10129371285438538), ('survey', 0.0811525285243988), ('a', 0.07710925489664078), ('system', 0.07218603044748306), ('of', 0.07031609117984772), ('and', 0.06266631186008453), ('user', 0.061590321362018585), ('trees', -0.004959670826792717), ('response', -0.035101912915706635), ('graph', -0.05454497039318085)]
1 /5 epochs
2 /5 epochs
3 /5 epochs
4 /5 epochs
5 /5 epochs
Time taken for Word2vec training:  0.0003329038619995117  (mins).
similar words of 'human' [('response', 0.07876017689704895), ('survey', 0.03825037181377411), ('of', 0.00648870412260294), ('computer', 0.0046798065304756165), ('and', 0.004269154742360115), ('user', -0.0034093516878783703), ('system', -0.016295716166496277), ('minors', -0.016591623425483704), ('time', -0.03170005977153778), ('eps', -0.032730832695961)]
