In [1]:
import gensim
import nltk
import numpy as np

# Hand-made toy corpus
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]

# Unseen document used to exercise the featurizers on out-of-corpus text
new_doc = ['loving this blue sky today']
# Tokenize the corpus and the new document into lists of word tokens
TOKENIZED_CORPUS=[nltk.word_tokenize(sentence) for sentence in CORPUS]
tokenized_new_doc=[nltk.word_tokenize(sentence) for sentence in new_doc]
print(TOKENIZED_CORPUS)
print(tokenized_new_doc)
# Train 10-dimensional word vectors; words seen fewer than 2 times are dropped (min_count=2).
# seed is pinned explicitly and training is restricted to a single worker thread so that
# thread-scheduling order cannot change the learned vectors between runs.
# NOTE(review): on Python 3, reproducibility across interpreter launches presumably also
# needs PYTHONHASHSEED to be fixed - confirm against the gensim documentation.
model=gensim.models.Word2Vec(TOKENIZED_CORPUS,size=10,window=10,min_count=2,sample=1e-3,seed=1,workers=1)

# num_features is the dimensionality of the word vectors (it must match the model's vector size)
def average_word_vectors(words,model,vocabulary,num_features):
    """Return the mean of the model vectors of the words found in `vocabulary`.

    Args:
        words: iterable of tokens for one document.
        model: object indexable by word (``model[word]``) that yields a vector
            of length `num_features`.
        vocabulary: container of words for which a vector should be looked up.
        num_features: length of the returned vector.

    Returns:
        A float64 array of shape ``(num_features,)``. It is all zeros when no
        word of `words` is in `vocabulary`.
    """
    total=np.zeros((num_features,),dtype='float64')
    # Keep every occurrence (duplicates included) of the in-vocabulary words.
    known_words=[token for token in words if token in vocabulary]
    for token in known_words:
        total=np.add(total,model[token])
    if known_words:
        total=np.divide(total,len(known_words))
    return total

def averaged_word_vectorizer(corpus,model,num_features):
    """Build one averaged word vector per tokenized document.

    Args:
        corpus: iterable of tokenized documents (each an iterable of words).
        model: trained model exposing ``model.wv.index2word``; it is forwarded
            to `average_word_vectors` for the per-word lookups.
        num_features: length of each document vector.

    Returns:
        ``np.array`` built from the per-document vectors, in corpus order.
    """
    # Every word the model has a vector for.
    known_words=set(model.wv.index2word)
    rows=[]
    for tokens in corpus:
        rows.append(average_word_vectors(tokens,model,known_words,num_features))
    return np.array(rows)

# Averaged word-vector features for the training corpus (one row per document).
avg_word_vec_features=averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,model=model,num_features=10)
print(avg_word_vec_features)

# Same featurization applied to the unseen document.
nd_avg_word_vec_features=averaged_word_vectorizer(tokenized_new_doc,model,10)
print(nd_avg_word_vec_features)


[['the', 'sky', 'is', 'blue'], ['sky', 'is', 'blue', 'and', 'sky', 'is', 'beautiful'], ['the', 'beautiful', 'sky', 'is', 'so', 'blue'], ['i', 'love', 'blue', 'cheese']]
[['loving', 'this', 'blue', 'sky', 'today']]
[[ 0.00927303  0.02162506 -0.01460403 -0.00401924  0.00250394 -0.00629804
  -0.00401768 -0.00306395  0.0192884  -0.01135796]
 [ 0.00364845  0.01165146 -0.00032804 -0.01386576  0.00595418  0.00139123
  -0.01364638  0.01582027  0.0173904  -0.00911294]
 [-0.00072099  0.02206395 -0.00713337 -0.00304041  0.00782759 -0.00203281
  -0.00206481  0.00022724  0.0195482  -0.01270212]
 [ 0.00806398  0.04998796 -0.03547955  0.01987278 -0.00656114  0.03626325
   0.02452678 -0.03793738 -0.02828991  0.01549027]]
[[-5.14067942e-03  3.62036284e-02 -6.71307649e-03  5.05944155e-03
   4.71267011e-03 -7.53486529e-04 -1.14511903e-02 -1.03723258e-04
   3.48817557e-05 -1.34660294e-02]]


In [2]:
def tfidf_wtd_avg_word_vectors(words,tfidf_vector,tfidf_vocabulary,model,num_features):
    """Return the TF-IDF-weighted average of the word vectors of one document.

    Args:
        words: iterable of tokens for one document.
        tfidf_vector: single-row, 2-D indexable TF-IDF row for the document;
            it is read as ``tfidf_vector[0, column]``.
        tfidf_vocabulary: mapping from word to its column index in
            `tfidf_vector`.
        model: trained model exposing ``model.wv.index2word`` and
            ``model.wv[word]``.
        num_features: length of the returned vector.

    Returns:
        A float64 array of shape ``(num_features,)``: the sum of
        ``weight * vector`` over every word occurrence known to the model,
        divided by the sum of those weights. The division is skipped when the
        total weight is zero, so an all-zero vector is returned when nothing
        contributes.
    """
    def _tfidf_weight(word):
        column=tfidf_vocabulary.get(word)
        # Compare against None explicitly: column 0 is a valid index, and a
        # plain truthiness test would silently give that term a weight of 0.
        return tfidf_vector[0,column] if column is not None else 0

    word_tfidf_map={word:_tfidf_weight(word) for word in words}
    feature_vector=np.zeros((num_features,),dtype='float64')
    vocabulary=set(model.wv.index2word)
    wts=0
    for word in words:
        if word in vocabulary:
            weight=word_tfidf_map[word]
            # Look the vector up through model.wv, the same keyed-vectors
            # object the vocabulary above is taken from.
            feature_vector=np.add(feature_vector,weight*model.wv[word])
            wts=wts+weight
    if wts:
        feature_vector=np.divide(feature_vector,wts)
    return feature_vector

def tfidf_weighted_averaged_word_vectorizer(corpus,tfidf_vectors,tfidf_vocabulary,model,num_features):
    """Build one TF-IDF-weighted averaged word vector per document.

    Args:
        corpus: iterable of tokenized documents.
        tfidf_vectors: iterable of per-document TF-IDF rows, paired with
            `corpus` by position.
        tfidf_vocabulary: mapping from word to TF-IDF column index.
        model: trained word-vector model forwarded to
            `tfidf_wtd_avg_word_vectors`.
        num_features: length of each document vector.

    Returns:
        ``np.array`` built from the per-document vectors, in corpus order.
    """
    # Pair every document with its TF-IDF row up front.
    paired=list(zip(corpus,tfidf_vectors))
    rows=[]
    for tokens,doc_tfidf in paired:
        rows.append(tfidf_wtd_avg_word_vectors(tokens,doc_tfidf,tfidf_vocabulary,model,num_features))
    return np.array(rows)

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def tfidf_transformer(bow_matrix):
    """Fit a `TfidfTransformer` on a bag-of-words count matrix.

    The transformer is configured with L2 normalization, smoothed IDF and IDF
    weighting enabled.

    Args:
        bow_matrix: term-count matrix to fit and transform.

    Returns:
        Tuple ``(transformer, tfidf_matrix)``: the fitted transformer and the
        result of ``fit_transform(bow_matrix)``.
    """
    settings=dict(norm='l2',smooth_idf=True,use_idf=True)
    transformer=TfidfTransformer(**settings)
    return transformer,transformer.fit_transform(bow_matrix)

def tfidf_extractor(corpus, ngram_range=(1,1)):
    """Fit a `TfidfVectorizer` on raw documents and return it with its features.

    The vectorizer uses ``min_df=1``, L2 normalization, smoothed IDF and IDF
    weighting enabled.

    Args:
        corpus: iterable of raw text documents.
        ngram_range: ``(min_n, max_n)`` passed through to the vectorizer.

    Returns:
        Tuple ``(vectorizer, features)``: the fitted vectorizer and the result
        of ``fit_transform(corpus)``.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                                 use_idf=True,
                                 smooth_idf=True,
                                 norm='l2',
                                 min_df=1)
    return vectorizer, vectorizer.fit_transform(corpus)

def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a `CountVectorizer` on raw documents and return it with its counts.

    Args:
        corpus: iterable of raw text documents.
        ngram_range: ``(min_n, max_n)`` passed through to the vectorizer.

    Returns:
        Tuple ``(vectorizer, features)``: the fitted vectorizer and the result
        of ``fit_transform(corpus)``.
    """
    # min_df=1 means a term is kept even if its document frequency is only 1.
    # ngram_range can be set to (1,3) to build a vector space containing all
    # unigrams, bigrams and trigrams.
    vectorizer = CountVectorizer(ngram_range=ngram_range, min_df=1)
    return vectorizer, vectorizer.fit_transform(corpus)


def display_features(features, feature_names):
    """Print `features` as a pandas DataFrame with `feature_names` as column labels."""
    table = pd.DataFrame(features, columns=feature_names)
    print(table)

# Bag-of-words counts and the column names of the fitted CountVectorizer.
bow_vectorizer, bow_features = bow_extractor(CORPUS)
feature_names = bow_vectorizer.get_feature_names()
# TF-IDF computed two ways: from the counts via TfidfTransformer, and directly
# from the raw text via TfidfVectorizer (note the existing spelling `tdidf_features`).
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)
# Label the displayed columns with the names of the vectorizer that actually
# produced the matrix, so labels cannot drift from the data if the two
# extractors are ever configured differently.
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
display_features(np.round(tdidf_features.todense(), 2), tfidf_feature_names)
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), tfidf_feature_names)

# Take the TF-IDF matrix and the word -> column mapping from the SAME fitted
# vectorizer; mixing the transformer's matrix with the vectorizer's vocabulary
# only lines up while both pipelines happen to produce identical columns.
corpus_tfidf=tdidf_features
vocab=tfidf_vectorizer.vocabulary_

wt_tfidf_word_vec_features=tfidf_weighted_averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,tfidf_vectors=corpus_tfidf,tfidf_vocabulary=vocab,model=model,num_features=10)

print(wt_tfidf_word_vec_features)

and  beautiful  blue  cheese    is  love   sky    so   the
0  0.00       0.00  0.40    0.00  0.49  0.00  0.49  0.00  0.60
1  0.44       0.35  0.23    0.00  0.56  0.00  0.56  0.00  0.00
2  0.00       0.43  0.29    0.00  0.35  0.00  0.35  0.55  0.43
3  0.00       0.00  0.35    0.66  0.00  0.66  0.00  0.00  0.00
   and  beautiful  blue  cheese   is  love   sky   so  the
0  0.0        0.0  0.63     0.0  0.0   0.0  0.77  0.0  0.0
[[ 0.00889181  0.0213254  -0.01446018 -0.00393169  0.00334673 -0.01016883
  -0.00417116 -0.0032925   0.02317886 -0.01393146]
 [ 0.00650283  0.00621648  0.00205088 -0.01895563  0.00566368 -0.00374799
  -0.01961181  0.02232814  0.02252023 -0.01132472]
 [-0.0026915   0.02190799 -0.00576854 -0.00280893  0.00936754 -0.00428316
  -0.0018545   0.00060479  0.02257352 -0.01490022]
 [ 0.00806398  0.04998796 -0.03547955  0.01987278 -0.00656114  0.03626325
   0.02452678 -0.03793738 -0.02828991  0.01549027]]
