什么是embedding?

1. 从数学上看, 是映射
2. 从神经网络的角度看, 是层与层之间的矩阵
3. 从特征的角度看, 是从一套特征映射到另一种表示方式

本节课, 我们从非监督学习和监督学习两个角度, 来讨论embedding的使用.

## Gensim

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE

In [None]:
# Load the dataset from a hard-coded absolute CSV path into a DataFrame.
df = pd.read_csv('/usr/local/codeData/RS-J2/3--data.csv')
# Last expression of the cell: preview the top rows of the frame.
df.head()

In [None]:
# Add a combined "<Make> <Model>" label column to the source frame.
df['Maker_Model'] = df['Make'] + ' ' + df['Model']

# Keep only the columns whose values become the tokens of each "sentence".
df1 = df[[
    'Engine Fuel Type',
    'Transmission Type',
    'Driven_Wheels',
    'Market Category',
    'Vehicle Size',
    'Vehicle Style',
    'Maker_Model',
]]
# Stringify every value of a row and glue the row into one comma-separated text.
df2 = df1.apply(lambda record: ','.join(record.astype(str)), axis=1)
# Keep the joined text in a single-column frame named 'clean'.
df_clean = pd.DataFrame({'clean': df2})
# Split each text back on commas to get the list-of-token-lists corpus
# that is passed to gensim.
sent = df_clean['clean'].str.split(',').tolist()
# Last expression of the cell: show the first two token lists.
sent[:2]

In [None]:
# Number of token lists ("sentences") in the corpus.
len(sent)

In [None]:
from gensim.models import Word2Vec
# The older `size=50` keyword is rejected by this gensim version
# ("__init__() got an unexpected keyword argument 'size'"), so the embedding
# dimension is passed as `vector_size` instead.
model = Word2Vec(
    sent,
    vector_size=50,
    window=3,
    min_count=1,
    sg=1,
    workers=3,
)

In [None]:
# Look up the learned embedding vector for the token 'MANUAL'.
model.wv['MANUAL']

In [None]:
# Similarity score between the embeddings of two tokens in the vocabulary.
model.wv.similarity('Porsche 718 Cayman', 'Nissan Van')

In [None]:
# Same similarity query against a different second token, for comparison.
model.wv.similarity('Porsche 718 Cayman', 'Mercedes-Benz SLK-Class')

In [None]:
# Five nearest neighbours of the token, requested directly via `topn`.
model.wv.most_similar('Mercedes-Benz SLK-Class', topn=5)

In [None]:
def display_closestwords_tsnescatterplot(model, word, size):
    """Scatter-plot `word` and its nearest neighbours in a 2-D t-SNE projection.

    Args:
        model: Object exposing keyed vectors as ``model.wv``; the code uses
            ``model.wv[key]`` and ``model.wv.similar_by_word(key)``.
        word: Key whose neighbourhood is plotted.
        size: Expected number of components in each embedding vector.

    Raises:
        ValueError: If the looked-up vectors do not have ``size`` components.
    """
    # Each neighbour entry is indexed at [0] to obtain its key.
    neighbours = model.wv.similar_by_word(word)
    word_labels = [word] + [entry[0] for entry in neighbours]

    # Build the matrix in one step rather than growing it with np.append
    # once per word (which re-copies the whole array every time).
    arr = np.array([model.wv[label] for label in word_labels], dtype='f')
    if arr.ndim != 2 or arr.shape[1] != size:
        raise ValueError(
            f"expected {size}-dimensional vectors, got array of shape {arr.shape}"
        )

    # t-SNE's default perplexity (30) is larger than the handful of points
    # plotted here; recent scikit-learn releases raise ValueError unless
    # perplexity < n_samples, so cap it by the number of points.
    perplexity = min(30.0, float(len(word_labels) - 1))
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    # Kept from the original: switches NumPy printing to non-scientific
    # notation. This is a process-wide setting, not local to this function.
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

    # Axis limits do not depend on the label, so set them once after the loop.
    # The lower bound subtracts the margin so the minimum point stays inside
    # the axes (the original added it on both sides).
    margin = 0.00005
    plt.xlim(x_coords.min() - margin, x_coords.max() + margin)
    plt.ylim(y_coords.min() - margin, y_coords.max() + margin)
    plt.show()

In [None]:
# Plot the neighbourhood of one token; 50 is the vector size passed to the
# plotting function (the Word2Vec model above was built with vector_size = 50).
display_closestwords_tsnescatterplot(model, 'Porsche 718 Cayman',50) 

In [None]:
# Run one more training epoch on a single extra sentence.
# NOTE(review): no build_vocab(..., update=True) call is visible before this,
# so "hello"/"world" are presumably not in the vocabulary and may be skipped
# by train() — confirm whether the vocabulary should be extended first.
model.train([["hello", "world"]], total_examples=1, epochs=1)

## Tensorflow

In [None]:
# 32 rows of 10 random integer ids drawn from [0, 1000).
input_array = np.random.randint(low=0, high=1000, size=(32, 10))

In [None]:
# Display the array's shape; it was created with size=(32, 10).
input_array.shape

In [None]:
import tensorflow as tf
import keras

In [None]:
# A one-layer model: an Embedding with 1000 rows (vocabulary size) and
# 64-dimensional vectors, declared for integer sequences of length 10.
# Input is an integer matrix of shape (batch, input_length); every id must be
# at most 999 so that it indexes a valid embedding row.
model = keras.models.Sequential(
    [keras.layers.Embedding(1000, 64, input_length=10)]
)

# A batch of 32 sequences, each holding 10 random ids in [0, 1000).
input_array = np.random.randint(1000, size=(32, 10))

# predict() is only used for a forward pass here; no training is run.
model.compile(optimizer='rmsprop', loss='mse')
output_array = model.predict(input_array)
# Each of the 32 x 10 ids is replaced by its 64-dimensional vector.
assert output_array.shape == (32, 10, 64)

## Embedding Lookup

In [None]:
# Demonstrate tf.nn.embedding_lookup: pick rows 0..3 out of a 10 x 1 table.
# The op is built inside an explicit tf.Graph so the v1 Session has something
# to run. With eager execution enabled (the TF2 default) the lookup would
# otherwise execute immediately, leaving the session's graph empty and making
# sess.run(b) fail.
with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
    c = np.random.random([10, 1])
    b = tf.nn.embedding_lookup(c, [0, 1, 2, 3])
    print(c)
    print(sess.run(b))