## 4.1　从类别变量中提取特征

代码4.1

In [1]:
from sklearn.feature_extraction import DictVectorizer
# One categorical feature ('city') observed on three samples.
X = [{'city': name} for name in ('New York', 'San Francisco', 'Chapel Hill')]

# DictVectorizer gives every distinct city its own binary column; the printed
# matrix therefore has one row per sample and a single 1 per row.
onehot_encoder = DictVectorizer()
encoded = onehot_encoder.fit_transform(X)
print(encoded.toarray())

[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]


## 4.2　特征标准化

代码4.2

In [2]:
from sklearn import preprocessing
import numpy as np
# Three samples (rows) described by six numeric features (columns).
X = np.array(
    [
        [0, 0, 5, 13, 9, 1],
        [0, 0, 13, 15, 10, 15],
        [0, 3, 15, 2, 0, 11],
    ],
    dtype=float,
)

# Standardize the features and show the rescaled matrix.
X_scaled = preprocessing.scale(X)
print(X_scaled)

[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]


## 4.3　从文本中提取特征

### 4.3.1　词袋模型

代码4.3

In [3]:
# Two short documents that share the words "Duke" and "basketball".
first_document = 'UNC played Duke in basketball'
second_document = 'Duke lost the basketball game'
corpus = [first_document, second_document]

代码4.4

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the bag-of-words representation: one row per document, one column per
# vocabulary term, each cell holding the term count.
vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(corpus)
print(bag_of_words.todense())
# vocabulary_ maps each token to its column index in the matrix above.
print(vectorizer.vocabulary_)

[[1 1 0 1 0 1 0 1]
 [1 1 1 0 1 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 1, 'in': 3, 'basketball': 0, 'lost': 4, 'the': 6, 'game': 2}


代码4.5

In [5]:
# Extend the corpus with a third document that shares no words with the
# first two. The membership check keeps this cell idempotent: re-running it
# does not append the sentence again, so the matrices printed here and in the
# following cells stay the same on every run.
new_document = 'I ate a sandwich'
if new_document not in corpus:
    corpus.append(new_document)

# Refit on the extended corpus; the vocabulary (and the column count) grows.
print(vectorizer.fit_transform(corpus).todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{'unc': 9, 'played': 6, 'duke': 2, 'in': 4, 'basketball': 1, 'lost': 5, 'the': 8, 'game': 3, 'ate': 0, 'sandwich': 7}


代码4.6

In [6]:
from sklearn.metrics.pairwise import euclidean_distances
# Dense count matrix as a plain ndarray (one row per document).
X = vectorizer.fit_transform(corpus).toarray()

# Compare every pair of documents; X[[k]] keeps the row two-dimensional, which
# is the input shape euclidean_distances expects.
document_pairs = [
    ('Distance between 1st and 2nd documents:', 0, 1),
    ('Distance between 1st and 3rd documents:', 0, 2),
    ('Distance between 2nd and 3rd documents:', 1, 2),
]
for label, left, right in document_pairs:
    print(label, euclidean_distances(X[[left]], X[[right]]))

Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 2nd and 3rd documents: [[2.64575131]]


### 4.3.2　停用词过滤

代码4.7

In [7]:
# Same bag-of-words model, but with the built-in English stop-word list
# removed from the vocabulary before counting.
vectorizer = CountVectorizer(stop_words='english')
filtered_counts = vectorizer.fit_transform(corpus)
print(filtered_counts.todense())
print(vectorizer.vocabulary_)

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{'unc': 7, 'played': 5, 'duke': 2, 'basketball': 1, 'lost': 4, 'game': 3, 'ate': 0, 'sandwich': 6}


### 4.3.3　词干提取和词形还原

代码4.8

In [8]:
# Two sentences with the same meaning but different inflected word forms.
corpus = ['He ate the sandwiches', 'Every sandwich was eaten by him']

# binary=True records presence/absence instead of counts. Without stemming or
# lemmatization the inflected forms end up in separate columns.
vectorizer = CountVectorizer(binary=True, stop_words='english')
presence = vectorizer.fit_transform(corpus)
print(presence.todense())
print(vectorizer.vocabulary_)

[[1 0 0 1]
 [0 1 1 0]]
{'ate': 0, 'sandwiches': 3, 'sandwich': 2, 'eaten': 1}


代码4.9

In [9]:
# "gathering" appears in both sentences: once in "am gathering" and once in
# "at the gathering".
verb_sentence = 'I am gathering ingredients for the sandwich.'
noun_sentence = 'There were many wizards at the gathering.'
corpus = [verb_sentence, noun_sentence]

代码4.10

In [10]:
import nltk
# Fetch the WordNet corpus used by the lemmatizer below. The call's return
# value is the cell output.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\liye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
from nltk.stem.wordnet import WordNetLemmatizer
# Lemmatize the same surface form twice, first as a verb ('v') and then as a
# noun ('n'), to show that the lemma depends on the part of speech.
lemmatizer = WordNetLemmatizer()
for part_of_speech in ('v', 'n'):
    print(lemmatizer.lemmatize('gathering', part_of_speech))

gather
gathering


代码4.11

In [12]:
from nltk.stem import PorterStemmer
# Stemming takes only the token; no part-of-speech argument is passed.
stemmer = PorterStemmer()
stemmed_token = stemmer.stem('gathering')
print(stemmed_token)

gather


代码4.12

In [13]:
import nltk
# Resources for the next cell: presumably 'punkt' backs word_tokenize and
# 'averaged_perceptron_tagger' backs pos_tag. The last call's return value is
# the cell output.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\liye\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [14]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

# Part-of-speech codes passed on to the lemmatizer: noun and verb. A tag from
# pos_tag is matched by its lower-cased first letter.
wordnet_tags = ['n', 'v']
corpus = [
    'He ate the sandwiches',
    'Every sandwich was eaten by him'
]

# Stemming: reduce every token of every document to its stem.
stemmer = PorterStemmer()
print('Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)]
                   for document in corpus])

# Created before lemmatize() so the helper never refers to an undefined name.
lemmatizer = WordNetLemmatizer()


def lemmatize(token, tag):
    """Return the lemma of ``token`` when its POS tag is a noun or a verb.

    Args:
        token: The word to lemmatize.
        tag: The part-of-speech tag paired with ``token`` by ``pos_tag``.

    Returns:
        The lemma from the module-level ``lemmatizer`` when the lower-cased
        first letter of ``tag`` is listed in ``wordnet_tags``; otherwise
        ``token`` unchanged.
    """
    # tag[:1] instead of tag[0]: an empty tag falls through to the unchanged
    # token rather than raising IndexError.
    pos = tag[:1].lower()
    if pos in wordnet_tags:
        return lemmatizer.lemmatize(token, pos)
    return token


# Lemmatization needs the part of speech, so tag each tokenized document first.
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]
print('Lemmatized:', [[lemmatize(token, tag) for token, tag in document]
                      for document in tagged_corpus])

Stemmed: [['he', 'ate', 'the', 'sandwich'], ['everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]
Lemmatized: [['He', 'eat', 'the', 'sandwich'], ['Every', 'sandwich', 'be', 'eat', 'by', 'him']]


### 4.3.4　tf-idf权重扩展词包

代码4.13

In [15]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# A single document in which "sandwich" occurs three times and "ate" twice.
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich,and I ate a sandwich']

vectorizer = CountVectorizer(stop_words='english')
# Only one document was fitted, so row 0 holds all raw term frequencies.
frequencies = vectorizer.fit_transform(corpus).toarray()[0]
print(frequencies)

vocabulary = vectorizer.vocabulary_
print('Token indices %s' % vocabulary)
# Look up each token's count through its column index.
for term, column in vocabulary.items():
    print('The token "%s" appears %s times' % (term, frequencies[column]))

[2 1 3 1 1]
Token indices {'dog': 1, 'ate': 0, 'sandwich': 2, 'wizard': 4, 'transfigured': 3}
The token "dog" appears 1 times
The token "ate" appears 2 times
The token "sandwich" appears 3 times
The token "wizard" appears 1 times
The token "transfigured" appears 1 times


代码4.14

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# "sandwich" occurs in both documents; every other non-stop word occurs in
# only one of them.
corpus = ['The dog ate a sandwich and I ate a sandwich', 'The wizard transfigured a sandwich']

# Weight the term counts by tf-idf and print the resulting dense matrix.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_weights = vectorizer.fit_transform(corpus)
print(tfidf_weights.todense())

[[0.75458397 0.37729199 0.53689271 0.         0.        ]
 [0.         0.         0.44943642 0.6316672  0.6316672 ]]


### 4.3.5　空间有效特征向量化与哈希技巧

代码4.15

In [17]:
from sklearn.feature_extraction.text import HashingVectorizer

# Four one-word documents, hashed into a fixed 6-column feature space.
corpus = ['the', 'ate', 'bacon', 'cat']

# No fit step is used here: transform() is called directly on the documents.
vectorizer = HashingVectorizer(n_features=6)
hashed_features = vectorizer.transform(corpus)
print(hashed_features.todense())

[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]


### 4.3.6　词向量

代码4.16

In [18]:
# See https://radimrehurek.com/gensim/install.html for gensim
# installation instructions.
# Download and gunzip the word2vec embeddings from
# https://github.com/mmihaltz/word2vec-GoogleNews-vectors/blob/master/GoogleNews-vectors-negative300.bin.gz
# The 1.5GB compressed file decompresses to 3.4GB.
import gensim

# The model is large; >= 8GB of RAM is required

model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

# Let's inspect the embedding for "cat".
# Index the KeyedVectors object directly instead of calling word_vec(), which
# is deprecated and emits a warning.
embedding = model['cat']
# shape is a tuple; format its first element explicitly rather than relying
# on the % operator unpacking a one-element tuple.
print("Dimensions: %s" % embedding.shape[0])
print(embedding)

Dimensions: 300
[ 0.0123291   0.20410156 -0.28515625  0.21679688  0.11816406  0.08300781
  0.04980469 -0.00952148  0.22070312 -0.12597656  0.08056641 -0.5859375
 -0.00445557 -0.296875   -0.01312256 -0.08349609  0.05053711  0.15136719
 -0.44921875 -0.0135498   0.21484375 -0.14746094  0.22460938 -0.125
 -0.09716797  0.24902344 -0.2890625   0.36523438  0.41210938 -0.0859375
 -0.07861328 -0.19726562 -0.09082031 -0.14160156 -0.10253906  0.13085938
 -0.00346375  0.07226562  0.04418945  0.34570312  0.07470703 -0.11230469
  0.06738281  0.11230469  0.01977539 -0.12353516  0.20996094 -0.07226562
 -0.02783203  0.05541992 -0.33398438  0.08544922  0.34375     0.13964844
  0.04931641 -0.13476562  0.16308594 -0.37304688  0.39648438  0.10693359
  0.22167969  0.21289062 -0.08984375  0.20703125  0.08935547 -0.08251953
  0.05957031  0.10205078 -0.19238281 -0.09082031  0.4921875   0.03955078
 -0.07080078 -0.0019989  -0.23046875  0.25585938  0.08984375 -0.10644531
  0.00105286 -0.05883789  0.05102539 -0.02

  embedding = model.word_vec('cat')


In [19]:
# Compare "cat" with a semantically close word and then with an unrelated one;
# the first score printed should be the higher of the two.
for other_word in ('dog', 'sandwich'):
    print(model.similarity('cat', other_word))

0.76094574
0.17211203


In [20]:
# Kitten is to cat as puppy is to...
# ('cat' and 'puppy' are the positive terms, 'kitten' the negative one.)
print(model.most_similar(positive=['puppy', 'cat'], negative=['kitten'],topn=1))

[('dog', 0.7762665748596191)]


In [21]:
# Palette is to painter as saddle is to...
# Show the three best candidates, one (word, score) pair per line.
candidates = model.most_similar(
    positive=['saddle', 'painter'],
    negative=['palette'],
    topn=3,
)
print(*candidates, sep='\n')

('saddles', 0.5282258987426758)
('horseman', 0.5179382562637329)
('jockey', 0.48861294984817505)


## 4.4　从图像中提取特征

### 4.4.1　从像素强度中提取特征

代码4.17

In [22]:
from sklearn import datasets

# Load the bundled handwritten-digit images and look at the first sample.
digits = datasets.load_digits()
first_image = digits.images[0]
first_label = digits.target[0]

print('Digit: %s' % first_label)
print(first_image)
# Flatten the image grid into one row of 64 pixel-intensity features.
print('Feature vector:\n %s' % first_image.reshape(-1, 64))

Digit: 0
[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]
Feature vector:
 [[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
  15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
   0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
   0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]]


### 4.4.2　使用卷积神经网络激活项作为特征

代码4.18