In [1]:
import gensim
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas
from keras.layers import Input, Dense, Embedding, Flatten, Dropout, Convolution1D, MaxPooling1D
from keras.layers.merge import Concatenate
from keras.models import Model
from sklearn.utils import shuffle

from person2vec import data_handler
from person2vec.generators import training_data_generator

Using TensorFlow backend.


In [2]:
# Model hyper-parameters, named once so the shapes below cannot drift apart.
NUM_CONTEXT_WORDS = 32      # first axis of word_input; presumably words per snippet -- TODO confirm
WORD_VEC_SIZE = 300         # length of each pre-computed word vector
NUM_COMPARE_ENTITIES = 4    # person ids per example; presumably also the softmax width -- TODO confirm
NUM_PEOPLE = 1693           # rows in the person embedding table
PERSON_EMBED_SIZE = 300     # width of each learned person embedding

input_tensor_words = Input(shape=(NUM_CONTEXT_WORDS, WORD_VEC_SIZE,), dtype='float32', name='word_input')
input_tensor_person = Input(shape=(NUM_COMPARE_ENTITIES,), dtype='int32', name='person_input')

# Word vectors arrive as float32 values, so this branch only flattens them
# (there is no trainable word embedding layer).
word_embedding_layer = Flatten()(input_tensor_words)
word_embedding_layer = Dropout(0.)(word_embedding_layer)  # rate 0.: currently a no-op

# Person ids are looked up in a trainable embedding table, then flattened.
person_embedding_layer = Embedding(NUM_PEOPLE, PERSON_EMBED_SIZE, input_length=NUM_COMPARE_ENTITIES,
                                   name='person_embedding')(input_tensor_person)
person_embedding_layer = Flatten()(person_embedding_layer)
person_embedding_layer = Dropout(0.)(person_embedding_layer)  # rate 0.: currently a no-op

# Join both branches into one feature vector.
joint_embeds = Concatenate(name='joint_embeds')([word_embedding_layer, person_embedding_layer])

# Classification head: one hidden ReLU layer followed by a softmax output.
nex = Dropout(0.)(joint_embeds)
nex = Dense(100, activation="relu", name='dense_consolidator')(nex)
nex = Dropout(0.)(nex)
full_out = Dense(NUM_COMPARE_ENTITIES, activation='softmax', name='final_output')(nex)

model = Model([input_tensor_words, input_tensor_person], full_out)
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
person_input (InputLayer)        (None, 4)             0                                            
____________________________________________________________________________________________________
word_input (InputLayer)          (None, 32, 300)       0                                            
____________________________________________________________________________________________________
person_embedding (Embedding)     (None, 4, 300)        507900      person_input[0][0]               
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 9600)          0           word_input[0][0]                 
___________________________________________________________________________________________

In [3]:
# Build the project's training-data generator with 300-wide word vectors
# and 4 compare entities per example.
embed_train_generator = training_data_generator.EmbeddingDataGenerator(
    word_vec_size=300,
    num_compare_entities=4,
)

In [4]:
# Create the batch source that the later model.fit_generator call consumes.
embed_gen = embed_train_generator.flow_from_db()

In [5]:
# Use the canonical `Adam` class: the lowercase `adam` alias only exists in
# older Keras releases, while `Adam` is available in every version.
opt = keras.optimizers.Adam()
# Categorical cross-entropy matches the softmax output of the model.
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Print the devices TensorFlow reports for this machine.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
# Instantiate the project's data handler and display what snippet_count() returns.
handler = data_handler.DataHandler()
handler.snippet_count()

In [7]:
# Train from the generator: 20 epochs of 1024 steps each, with progress output (verbose=1).
model.fit_generator(embed_gen, steps_per_epoch=1024, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f925714bbe0>

In [8]:
# Display the generator's entity_dict; the output maps person names to integers
# (presumably row indices into the person embedding -- TODO confirm).
embed_train_generator.entity_dict

{'Mark Amodei': 528,
 'DJ Shadow': 697,
 'Dana Rohrabacher': 405,
 'Ed Burns': 1446,
 'David M. Cote': 170,
 'Grace Napolitano': 453,
 'Jeb Bush': 345,
 'James Van Der Beek': 1512,
 'Lorenzo Lamas': 1600,
 'Salma Hayek': 1257,
 'Anne Bancroft': 1000,
 'Jason Weaver': 1522,
 'Ralph Lauren': 212,
 'Daymond John': 157,
 'Bridget Fonda': 1012,
 'Cybill Shepherd': 1042,
 'Duke Ellington': 663,
 'Dale Evans': 1044,
 'Marty Feldman': 1619,
 'James Taylor': 763,
 'Tracy Chapman': 808,
 'Reese Witherspoon': 1243,
 'Jamie Lee Curtis': 1102,
 'Joseph Gordon-Levitt': 1569,
 'Eddie Cibrian': 1448,
 'Dusty Springfield': 689,
 'Lykke Li': 837,
 'Mickey Rourke': 1651,
 'Willie Nelson': 738,
 'George Jones': 841,
 'Matt Gaetz': 538,
 'Eminem': 664,
 'Nusrat Fateh Ali Khan': 943,
 'Janet L. Robinson': 200,
 'Mark Ruffalo': 295,
 'Audrey Hepburn': 1004,
 'Norma Torres': 555,
 'Helen Slater': 1095,
 'Gram Parsons': 715,
 "Beto O'Rourke": 372,
 'Art Blakey': 754,
 'Chet Baker': 923,
 'Gene Vincent': 766,
 

In [16]:
# Fetch the embedding layer by the name it was given at construction time
# instead of by position: `model.layers[2]` silently points at a different
# layer as soon as the architecture is edited.
person_embed_weights = model.get_layer(name='person_embedding').get_weights()
person_embed_weights

[array([[-0.10181135,  0.33098361,  0.1455138 , ..., -0.14185701,
          0.18652987,  0.46095154],
        [-0.13405763,  0.24737066,  0.27919865, ...,  0.2292188 ,
         -0.09322832,  0.31825969],
        [ 0.04130258,  0.20430538, -0.04839449, ..., -0.09894004,
         -0.25822175,  0.4408938 ],
        ..., 
        [-0.07178044, -0.18190691, -0.04577689, ...,  0.14865156,
         -0.21562587, -0.42136288],
        [-0.39643967,  0.20099366,  0.02895815, ...,  0.32324314,
         -0.13149706, -0.13673708],
        [ 0.30256557, -0.20097786, -0.27479565, ..., -0.2977263 ,
          0.12228461,  0.14356583]], dtype=float32)]

In [26]:
# Write the current model's weights to an .h5 file; the path is relative to the
# directory the notebook is run from.
model.save_weights('../person2vec/data/weights/embed_weights_1.h5')

In [18]:
# Number of rows in the first (and, per the output above, only) weight array.
len(person_embed_weights[0])

1693

In [22]:
# Turn the name -> integer mapping into a frame indexed by name, with the
# integer in a single column labelled 0.
# NOTE(review): this cell needs `pandas` in scope, so the import has to run
# before it on a fresh kernel.
name_and_number = pandas.DataFrame.from_dict(embed_train_generator.entity_dict, orient='index')
name_and_number.head()

Unnamed: 0,0
Mark Amodei,528
DJ Shadow,697
Dana Rohrabacher,405
Ed Burns,1446
David M. Cote,170


In [19]:
import pandas
from sklearn.utils import shuffle
import gensim
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the people attributes table and index it by name while keeping 'name'
# as a regular column too (drop=False).
# `DataFrame.from_csv` is deprecated and removed in pandas 1.0; `read_csv`
# with its default index gives the same frame as `from_csv(index_col=None)`.
raw_data = pandas.read_csv('../data/people_attributes.csv').set_index('name', drop=False)
raw_data.head()

In [None]:
# Encode gender as 0 (female) / 1 (male); any other value is left unchanged.
# The result is assigned back to the column: `inplace=True` on the selection
# `raw_data['gender']` is a chained in-place update that pandas does not
# guarantee will reach `raw_data`.
raw_data['gender'] = raw_data['gender'].replace({'female': 0, 'male': 1})
raw_data.head()

In [None]:
# Discard the columns that the gender experiment does not use.
unused_columns = ['occupation', 'description']
name_and_gender = raw_data.drop(unused_columns, axis=1)
name_and_gender.head()

In [None]:
# Shuffle the rows with a fixed seed so that the positional train/test split
# made later is the same on every run.
name_and_gender = shuffle(name_and_gender, random_state=42)
name_and_gender.head()

In [None]:
# Load the word2vec vectors from the binary-format GoogleNews file.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    '../data/GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [None]:
# Show the frame's index values as an array.
name_and_gender.index.values

In [None]:
def _name_not_has_vec(name):
    """Return True when `name` cannot be looked up in the loaded word2vec vectors.

    Spaces are replaced by underscores before the lookup, so 'Jeb Bush' is
    looked up as 'Jeb_Bush'.

    Args:
        name: an index value of the people frame; expected to be a string.

    Returns:
        bool: False if the lookup succeeds, True if the token is missing or
        `name` is not a string.
    """
    # A non-string value (e.g. NaN) has no `.replace(' ', '_')` form to look
    # up. The original bare `except` reported such values as missing; keep
    # that outcome, but explicitly.
    if not isinstance(name, str):
        return True
    try:
        word2vec.word_vec(name.replace(' ', '_'))
    except KeyError:
        # Out-of-vocabulary token. Anything else (e.g. KeyboardInterrupt or a
        # genuine bug) is no longer swallowed.
        return True
    return False

In [None]:
# Collect the index labels that have no word2vec entry, then drop those rows.
names_without_vector = [
    name
    for name in name_and_gender.index.values
    if _name_not_has_vec(name)
]
truncated_name_and_gender = name_and_gender.drop(names_without_vector)

In [None]:
# Summary statistics of the rows that remain after the vector filter.
truncated_name_and_gender.describe()

In [None]:
# Preview the filtered frame.
truncated_name_and_gender.head()

In [None]:
# Same rows as the filtered frame, minus the 'gender' label column.
name_and_vector = truncated_name_and_gender.drop(labels=['gender'], axis=1)
name_and_vector.head()

In [None]:
def _get_vector(row):
    """Look up the word2vec vector for one name and return it flattened.

    Despite the parameter name, `row` is a single string: spaces are swapped
    for underscores to build the lookup token.
    """
    token = row.replace(' ', '_')
    embedding = word2vec.word_vec(token)
    return embedding.flatten()

In [None]:
# Replace every cell of the frame with the word2vec vector for its value.
# NOTE(review): not idempotent -- a second run would pass the vectors, not the
# names, to _get_vector.
name_and_vector = name_and_vector.applymap(_get_vector)
name_and_vector.head()

In [None]:
# Relabel the frame's single column as 'vector' (fails if there is more than one column).
name_and_vector.columns = ['vector']
name_and_vector.head()

In [None]:
# Expand each stored vector into a row of scalar columns, one column per component.
vectors = name_and_vector.vector.apply(pandas.Series)
vectors.head()

In [None]:
# Show the expanded vectors as a plain array.
vectors.values

In [None]:
# Pull the 'gender' column of the filtered frame out as a numpy array of labels.
genders = pandas.Series(truncated_name_and_gender['gender'])
just_binary_genders = np.array(genders)
just_binary_genders

In [None]:
# Positional split: the first TRAIN_SIZE rows train the classifier and the
# rest are held out. The size is named once so data and labels cannot be
# sliced at different offsets by accident.
TRAIN_SIZE = 200
train_data = vectors[:TRAIN_SIZE].values
train_labels = just_binary_genders[:TRAIN_SIZE]
test_data = vectors[TRAIN_SIZE:].values
test_labels = just_binary_genders[TRAIN_SIZE:]
train_data.shape

In [None]:
# Single sigmoid unit over the 300-dimensional input vector.
# NOTE(review): this rebinds `model`, replacing the embedding model built
# earlier in the notebook.
model = keras.models.Sequential()
model.add(keras.layers.Dense(1, input_shape=(300,), activation='sigmoid'))
model.summary()

In [None]:
# Compile with Adam, mean-squared-error loss and accuracy reporting.
# NOTE(review): 'binary_crossentropy' is the conventional loss for a sigmoid
# output with 0/1 labels -- consider switching.
compile_options = dict(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model.compile(**compile_options)

In [None]:
# Train for 100 epochs; the held-out rows are passed as validation data, so
# they are evaluated after every epoch.
model.fit(train_data, train_labels, verbose=2, epochs=100, validation_data=(test_data, test_labels))

In [None]:
# Preview the name -> vector frame again.
name_and_vector.head()

In [None]:
def predict_one(vec):
    """Run the current `model` on a single vector.

    The vector is wrapped in a one-element array before the call, and the
    result of `model.predict` is returned as-is.
    """
    single_item_batch = np.array([vec])
    return model.predict(single_item_batch)

In [None]:
# Predict for every row from position 200 onward, one model.predict call per row.
test_predict = name_and_vector[200:].vector.apply(predict_one)
test_predict.head()

In [None]:
# List the layers of the current model.
model.layers

In [None]:
# Weight arrays of the model's first layer.
weights = model.layers[0].get_weights()

In [None]:
# The first weight array is the one treated as "not bias"; transpose it and
# keep the first row to get a flat vector of weights.
weights_not_bias = weights[0]
weights_usable = np.transpose(weights_not_bias)[0]
weights_usable

In [None]:
# Number of weights in the flattened vector.
len(weights_usable)

In [None]:
# Clear the current matplotlib figure before plotting.
plt.clf()

In [None]:
# Scatter every weight against its position. The x-range is derived from the
# data instead of a hard-coded 300, so it cannot go out of step with the
# vector length.
plt.scatter(range(len(weights_usable)), weights_usable)
plt.xlabel('weight index')
plt.ylabel('weight value')
plt.title('First-layer weights by index')
plt.show()

In [None]:
# Distribution of the weight values, labelled so the figure stands on its own.
plt.hist(weights_usable)
plt.xlabel('weight value')
plt.ylabel('count')
plt.title('Distribution of first-layer weights')
plt.show()

In [None]:
# Shape of the weight vector once wrapped in a one-element outer array.
np.array([weights_usable]).shape

In [None]:
import matplotlib as mpl

# Stand-alone colorbar demo: a wide, short figure with one manually placed axes.
fig = plt.figure(figsize=(8, 3))
ax = fig.add_axes([0.05, 0.475, 0.9, 0.15])

cmap = mpl.cm.hot

# Interval edges for the discrete colour bins; they must be monotonically
# increasing. BoundaryNorm maps each interval onto the colormap's cmap.N colours.
# (The earlier Normalize(vmin=5, vmax=10) was overwritten before use and has
# been removed.)
bounds = [1, 2, 4, 7, 8]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
cb2 = mpl.colorbar.ColorbarBase(ax, cmap=cmap,
                                norm=norm,
                                ticks=bounds,  # put a tick on every interval edge
                                spacing='proportional',
                                orientation='horizontal')
cb2.set_label('Discrete intervals, some other units')
plt.show()