In [21]:
# 2020-10-18 created by Akson

In [22]:
# code4.1
# One-hot encoding demo

from sklearn.feature_extraction import DictVectorizer

onehot_encoder = DictVectorizer()
X = [{'city': city_name} for city_name in ('New York', 'San Francisco', 'Chapel Hill')]

# Fit the encoder on the city records and show the dense one-hot matrix
city_features = onehot_encoder.fit_transform(X)
print(city_features.toarray())

[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [23]:
# code4.2
# Feature standardization

import numpy as np
from sklearn import preprocessing

X = np.array([
    [0., 0., 5., 13., 9., 1.],
    [0., 0., 13., 15., 10., 15.],
    [0., 3., 15., 2., 0., 11.],
])

# Standardize the sample matrix with preprocessing.scale and show the result
standardized = preprocessing.scale(X)
print(standardized)

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]


In [24]:
# code4.3

# Two-document corpus used by the following bag-of-words cells
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
]

In [25]:
# code4.4

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and count each term per document
term_counts = vectorizer.fit_transform(corpus)
print(term_counts.todense())
print(vectorizer.vocabulary_)

[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}


In [26]:
# code4.5

# Extend the corpus with a third document. The membership check keeps this
# cell idempotent: re-running it does not append the sentence a second time
# (which would silently change every result computed from `corpus` afterwards).
extra_document = 'I ate a sandwich'
if extra_document not in corpus:
    corpus.append(extra_document)

print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


In [28]:
# code4.6
# Compare document feature vectors using the Euclidean norm

from sklearn.metrics.pairwise import euclidean_distances

# Use a plain ndarray instead of the np.matrix returned by todense():
# recent scikit-learn releases reject np.matrix input.
X = vectorizer.fit_transform(corpus).toarray()

# X[[i]] selects row i while keeping it two-dimensional, which is the
# input layout euclidean_distances works on.
print('Distance between 1st and 2nd documents:', euclidean_distances(X[[0]], X[[1]]))
print('Distance between 1st and 3rd documents:', euclidean_distances(X[[0]], X[[2]]))
print('Distance between 2nd and 3rd documents:', euclidean_distances(X[[1]], X[[2]]))

Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]


In [29]:
# code4.7
# Stop-word filtering demo

vectorizer = CountVectorizer(stop_words='english')

# Re-fit on the corpus with the 'english' stop-word setting enabled
filtered_counts = vectorizer.fit_transform(corpus)
print(filtered_counts.todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 2, 'basketball': 1, 'lost': 4, 'game': 3, 'ate': 0, 'sandwich': 6}


In [30]:
# code4.8

corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him',
]
vectorizer = CountVectorizer(binary=True, stop_words='english')

# Vectorize the two sentences with binary=True and stop-word filtering
binary_counts = vectorizer.fit_transform(corpus)
print(binary_counts.todense())
print(vectorizer.vocabulary_)

[[1 0 0 1]
 [0 1 1 0]]
{'ate': 0, 'sandwiches': 3, 'sandwich': 2, 'eaten': 1}


In [31]:
# code4.9
# Two sentences that both contain the word "gathering"
corpus = [
    'I am gathering ingredients for the sandwich.',
    'There were many wizards at the gathering.',
]

In [35]:
# code4.10
# Lemmatization

# On first use, run the following two lines to download the WordNet data
# import nltk
# nltk.download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize the same word first with the 'v' tag, then with the 'n' tag
for pos in ('v', 'n'):
    print(lemmatizer.lemmatize('gathering', pos))

gather
gathering


In [36]:
# code4.11
# Stemming

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Reduce the word to its stem with the Porter stemmer
stemmed_word = stemmer.stem('gathering')
print(stemmed_word)

gather


In [52]:
# code4.12
# Lemmatize the corpus

# On first run, execute the following three statements to download the required NLTK resources
# import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

wordnet_tags = ['n', 'v']
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him',
]
stemmer = PorterStemmer()

# Tokenize each document and stem every token
stemmed_corpus = [
    [stemmer.stem(token) for token in word_tokenize(document)]
    for document in corpus
]
print('Stemmed:', stemmed_corpus)

# Lemmatize only nouns and verbs; every other token is returned unchanged
def lemmatize(token, tag):
    """Return the lemma of ``token`` when ``tag`` marks a noun or a verb.

    Relies on the module-level ``lemmatizer`` object, which must be defined
    before this function is first called.

    Args:
        token: The word to lemmatize.
        tag: Part-of-speech tag for ``token``. Only its first character is
            inspected, case-insensitively.

    Returns:
        ``lemmatizer.lemmatize(token, pos)`` when the tag starts with 'n' or
        'v' (``pos`` being that lower-cased first character); otherwise
        ``token`` itself. An empty tag also yields ``token`` unchanged.
    """
    # tag[:1] (rather than tag[0]) tolerates an empty tag instead of raising IndexError
    pos = tag[:1].lower()
    if pos in ('n', 'v'):
        return lemmatizer.lemmatize(token, pos)
    return token

lemmatizer = WordNetLemmatizer()

# Tag every token of every document in the corpus with its part of speech
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]

# Apply lemmatize() to each (token, tag) pair, document by document
lemmatized_corpus = [
    [lemmatize(token, tag) for token, tag in document]
    for document in tagged_corpus
]
print('Lemmatized:', lemmatized_corpus)

Stemmed: [['He', 'ate', 'the', 'sandwich'], ['everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
Lemmatized: [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]


In [68]:
# code4.13
# Term-frequency representation

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']

# binary=True is not passed here (compare code4.8)
vectorizer = CountVectorizer(stop_words='english')
term_matrix = vectorizer.fit_transform(corpus)

# Row 0 holds the counts of the only document in the corpus
frequencies = np.array(term_matrix.todense())[0]
print(frequencies)
print('Token indices %s' % vectorizer.vocabulary_)

# vocabulary_ maps each token to its column index in the count vector
for token, index in vectorizer.vocabulary_.items():
    count = frequencies[index]
    print('The token "%s" appears %s times' % (token, count))

[2 1 3 1 1]
Token indices {'dog': 1, 'ate': 0, 'sandwich': 2, 'wizard': 4, 'transfigured': 3}
The token "dog" appears 1 times
The token "ate" appears 2 times
The token "sandwich" appears 3 times
The token "wizard" appears 1 times
The token "transfigured" appears 1 times


In [70]:
# code4.14
# TF-IDF (term frequency - inverse document frequency) representation

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich',
]

vectorizer = TfidfVectorizer(stop_words='english')

# Fit on the two documents and show the dense weight matrix
tfidf_weights = vectorizer.fit_transform(corpus)
print(tfidf_weights.todense())
# print('Token indices %s' % vectorizer.vocabulary_)

[[0.75458397 0.37729199 0.53689271 0.         0.        ]
 [0.         0.         0.44943642 0.6316672  0.6316672 ]]


In [77]:
# code4.15
# The hashing trick

from sklearn.feature_extraction.text import HashingVectorizer

corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer = HashingVectorizer(n_features=6)

# transform() is called directly, without a preceding fit
hashed_features = vectorizer.transform(corpus)
print(hashed_features.todense())

[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]


In [82]:
# code4.16

# gensim has to be installed with `pip install gensim` from a terminal in this
# Python environment (conda install did not work)
import gensim

# Download GoogleNews-vectors-negative300.bin.gz yourself, decompress it to
# GoogleNews-vectors-negative300.bin and place it in this directory
word2vec_path = './GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Loading takes a long time, so the following steps are split into separate
# cells to make experimenting easier

In [83]:
# Print the word vector for "cat"
# get_vector() is used instead of the word_vec() alias, which gensim 4 deprecates
embedding = model.get_vector('cat')
# embedding.shape is a tuple; index it so the %-format receives one plain value
print('Dimensions: %s' % embedding.shape[0])
print(embedding)

Dimensions: 300
[ 0.0123291   0.20410156 -0.28515625  0.21679688  0.11816406  0.08300781
  0.04980469 -0.00952148  0.22070312 -0.12597656  0.08056641 -0.5859375
 -0.00445557 -0.296875   -0.01312256 -0.08349609  0.05053711  0.15136719
 -0.44921875 -0.0135498   0.21484375 -0.14746094  0.22460938 -0.125
 -0.09716797  0.24902344 -0.2890625   0.36523438  0.41210938 -0.0859375
 -0.07861328 -0.19726562 -0.09082031 -0.14160156 -0.10253906  0.13085938
 -0.00346375  0.07226562  0.04418945  0.34570312  0.07470703 -0.11230469
  0.06738281  0.11230469  0.01977539 -0.12353516  0.20996094 -0.07226562
 -0.02783203  0.05541992 -0.33398438  0.08544922  0.34375     0.13964844
  0.04931641 -0.13476562  0.16308594 -0.37304688  0.39648438  0.10693359
  0.22167969  0.21289062 -0.08984375  0.20703125  0.08935547 -0.08251953
  0.05957031  0.10205078 -0.19238281 -0.09082031  0.4921875   0.03955078
 -0.07080078 -0.0019989  -0.23046875  0.25585938  0.08984375 -0.10644531
  0.00105286 -0.05883789  0.05102539 -0.02

In [84]:
# Check the similarity between "cat" and a couple of other words
for other_word in ('dog', 'sandwich'):
    print(model.similarity('cat', other_word))

0.76094574
0.17211203


In [88]:
# Analogy query: 'kitten' is to 'cat' as 'puppy' is to what?
# ('puppy' and 'cat' are the positive terms, 'kitten' the negative one)

top_match = model.most_similar(positive=['puppy', 'cat'], negative=['kitten'], topn=1)
print(top_match)

[('dog', 0.7762665748596191)]


In [89]:
# Try another analogy query

analogy_matches = model.most_similar(positive=['saddle', 'painter'], negative=['palette'], topn=3)
for match in analogy_matches:
    print(match)

('saddles', 0.5282258987426758)
('horseman', 0.5179382562637329)
('jockey', 0.48861294984817505)


In [99]:
# code4.17

from sklearn import datasets

digits = datasets.load_digits()
first_image = digits.images[0]

print('Digit: %s' % digits.target[0])
print(first_image)
# Flatten the first image into a single row of 64 values
print('Feature vector: \n %s' % first_image.reshape(-1, 64))

Digit: 0
[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]
Feature vector: 
 [[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
  15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
   0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
   0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]]


In [None]:
# code4.18
# 这个caffe有毒，深度学习先放一放