In [19]:
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import scale
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

In [20]:
onehot_encoder = DictVectorizer()

In [21]:
x = [
    {'city':'New York'},
    {'city':"San Francisco"},
    {'city':'Chapel Hill'}
]

In [22]:
onehot_encoder.fit_transform(x).toarray()

array([[ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.]])

In [23]:
x = np.array([
    [0,0,5,13,9,1],
    [0,0,13,15,10,15],
    [0,3,15,2,0,11]
],dtype = 'float')

In [24]:
arr2 = scale(x,axis = 1)
arr2

array([[-0.94819965, -0.94819965,  0.06772855,  1.69321365,  0.8804711 ,
        -0.74501401],
       [-1.36617904, -1.36617904,  0.64442408,  0.95374764,  0.18043874,
         0.95374764],
       [-0.8975145 , -0.37637705,  1.70817275, -0.55008953, -0.8975145 ,
         1.01332282]])

In [25]:
arr2[1,:].std()

0.99999999999999989

In [26]:
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game'
]

In [33]:
vectorizer = CountVectorizer()

In [34]:
x3 = vectorizer.fit_transform(corpus).todense()

In [35]:
vectorizer.vocabulary_

{'ate': 0,
 'basketball': 1,
 'duke': 2,
 'game': 3,
 'in': 4,
 'lost': 5,
 'played': 6,
 'sandwich': 7,
 'the': 8,
 'unc': 9}

In [30]:
corpus.append('I ate a sandwich')

In [31]:
euclidean_distances(x3[0],x3[1])

array([[ 2.44948974]])

In [36]:
euclidean_distances(x3[0], x3[2])

array([[ 2.64575131]])

In [37]:
euclidean_distances(x3[1], x3[2])

array([[ 2.64575131]])

In [38]:
vectorizer2 = CountVectorizer(stop_words = 'english')

In [39]:
vectorizer2.fit_transform(corpus).todense()

matrix([[0, 1, 1, 0, 0, 1, 0, 1],
        [0, 1, 1, 1, 1, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 1, 0]], dtype=int64)

In [40]:
vectorizer2.vocabulary_

{'ate': 0,
 'basketball': 1,
 'duke': 2,
 'game': 3,
 'lost': 4,
 'played': 5,
 'sandwich': 6,
 'unc': 7}

In [None]:
# 上下对比能够发现the、in不在了，其它的没有发生特别大的变化。

In [43]:
from nltk.stem.wordnet import WordNetLemmatizer

In [44]:
lemmatizer = WordNetLemmatizer()

In [46]:
lemmatizer.lemmatize('gathering','v')

'gather'

In [47]:
lemmatizer.lemmatize('gathering','n')

'gathering'

In [48]:
from nltk.stem import PorterStemmer

In [49]:
stemmer = PorterStemmer()

In [50]:
stemmer.stem('gathering')

'gather'

In [51]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag

In [52]:
wordnet_tags = ['n','v']
corpus = [
    'He ate the sandwiches',
    'Every sandwiches was eaten by him'
]

In [56]:
stemmer = PorterStemmer()
[ [stemmer.stem(token) for token in word_tokenize(document)] for document in corpus ]

[['He', 'ate', 'the', 'sandwich'],
 ['everi', 'sandwich', 'wa', 'eaten', 'by', 'him']]

In [57]:
def lemmatize(token, tag):
    if tag[0].lower in ['n', 'v']:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token

In [59]:
lemmatizer = WordNetLemmatizer()
tagged_corpus = [ pos_tag(word_tokenize(document)) for document in corpus]
tagged_corpus

[[('He', 'PRP'), ('ate', 'VBD'), ('the', 'DT'), ('sandwiches', 'NNS')],
 [('Every', 'DT'),
  ('sandwiches', 'NN'),
  ('was', 'VBD'),
  ('eaten', 'VBN'),
  ('by', 'IN'),
  ('him', 'PRP')]]

In [60]:
[ [ lemmatize(token, tag) for token, tag in document ] for document in tagged_corpus]

[['He', 'ate', 'the', 'sandwiches'],
 ['Every', 'sandwiches', 'was', 'eaten', 'by', 'him']]

### 用tf-idf扩展词包

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

In [71]:
corpus = [
    'The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich'
]

In [72]:
vectorizer = CountVectorizer(stop_words = 'english')

In [76]:
frequencies = vectorizer.fit_transform(corpus).toarray() [0]
frequencies

array([2, 1, 3, 1, 1], dtype=int64)

In [77]:
for token, index in vectorizer.vocabulary_.items():
    print( 'the token {} appears {} times'.format(token, frequencies[index]))

the token sandwich appears 3 times
the token wizard appears 1 times
the token dog appears 1 times
the token ate appears 2 times
the token transfigured appears 1 times


In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [80]:
corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich'
]

In [81]:
vectorizer = TfidfVectorizer(stop_words = 'english')

In [84]:
vectorizer.fit_transform(corpus).todense()

matrix([[ 0.75458397,  0.37729199,  0.53689271,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.44943642,  0.6316672 ,  0.6316672 ]])

In [85]:
vectorizer.vocabulary_

{'ate': 0, 'dog': 1, 'sandwich': 2, 'transfigured': 3, 'wizard': 4}

### 用哈希技巧进行空间有效特征向量化

In [86]:
from sklearn.feature_extraction.text import HashingVectorizer

In [87]:
corpus = ['the','ate','bacon','cat']

In [88]:
vectorizer = HashingVectorizer(n_features = 6)

In [89]:
vectorizer.transform(corpus).todense() # 奇怪的是这里为什么不和前面的一样：用fit_transfrom?

matrix([[-1.,  0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  0., -1.,  0.],
        [ 0.,  1.,  0.,  0.,  0.,  0.]])

### 单词嵌入

import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('',binary = True)
embedding = model.word_vec('cat')
model.similarity('cat','dog')
model.similarity('cat','sandwich')
model.most_similar(positive=['puppy','cat'],negative=['kitten'],topn=1)
for i in model.most_similar(positive=['saddle','painter'],negative = ['palette'],topn =3):
    print(i)

### 从图片中提取特征

In [90]:
from sklearn import datasets

In [91]:
digits = datasets.load_digits()

In [92]:
digits.target[0]

0

In [93]:
digits.images[0]

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

In [94]:
digits.images[0].reshape(1,-1)

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.,   0.,   0.,  13.,
         15.,  10.,  15.,   5.,   0.,   0.,   3.,  15.,   2.,   0.,  11.,
          8.,   0.,   0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.,   0.,
          5.,   8.,   0.,   0.,   9.,   8.,   0.,   0.,   4.,  11.,   0.,
          1.,  12.,   7.,   0.,   0.,   2.,  14.,   5.,  10.,  12.,   0.,
          0.,   0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

### 适用卷积神经网络活动作为特征

In [None]:
import os
import caffe
caffe_dir = ''
mean_path = os.path.join(caffe_dir, 'python/caffe/imagenet/ilsvrc_2012_mean.npy')
prototxt_path = os.path.join(caffe_dir, 'models/bvlc_reference_caffenet/deploy.prootxt')
caffemodel_path = os.path.join(caffe_dir, 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
image_path = 'data/zipper-1.jpg'

In [None]:
net = caffe.Net(prototxt_path, caffemodel_path, caffe.TEST)
transformer = caffe.io.Transformer({ 'data': net.blobs['data'].data.shape })
transformer.set_transpose( 'data', ( 2, 0, 1) )
transformer.set_mean( 'data', np.load( mean_path).mean(1).mean(1) )
transformer.set_raw_scale( 'data', 255 )
transformer.set_channel_swap( 'data', ( 2, 2, 0) )

In [None]:
net.blobs['data'].reshape(1, 3, 227, 227)
net.blobs['data'].data[0] = transformer.preprocess( 'data', caffe.io.load_image(image_path) ) 

In [None]:
net.forward()
features = net.blobs['fc7'].data.reshape(-1, 1)
print(features.shape)
features