In [1]:
import json
import yaml
import os
import operator
import copy
from array import array
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.models import load_model
from sklearn.model_selection import train_test_split
import pandas as pd

Using TensorFlow backend.


In [2]:
def import_bios(path):
    """Collect profile ids and bio texts from every ``.json`` file in a directory.

    Each file must contain a JSON object with a ``results`` list of profile
    dicts. Profiles whose ``bio`` is missing or empty are skipped.

    Parameters
    ----------
    path : str
        Directory holding the JSON files. A trailing separator is optional.

    Returns
    -------
    tuple of (list, list)
        ``(ids, bios)`` as parallel lists: ``ids[k]`` is the ``_id`` of the
        profile whose bio text is ``bios[k]``.

    Raises
    ------
    KeyError
        If a file has no ``results`` key, or a kept profile has no ``_id``.
    """
    ids = []
    bios = []
    # os.listdir returns names in arbitrary order; sort so that the row order
    # (and therefore any seeded split made from it) is reproducible.
    json_files = sorted(name for name in os.listdir(path) if name.endswith('.json'))
    for file_name in json_files:
        # JSON is UTF-8; do not depend on the platform's default encoding,
        # which can fail on the emoji these bios contain.
        with open(os.path.join(path, file_name), encoding='utf-8') as fp:
            payload = json.load(fp)
        for profile in payload['results']:
            bio = profile.get('bio')
            if bio:
                ids.append(profile['_id'])
                bios.append(bio)
    return ids, bios

In [5]:
# Load the profile ids and bio texts from the JSON dumps under `path`.
path = '../../../output/tinder/Seatle/'
ids,bios = import_bios(path)

In [6]:
# Pair each profile id with its bio text and tabulate them for inspection.
rows = list(zip(ids, bios))
df = pd.DataFrame(rows, columns=['ID', 'Bio'])
df.head()

Unnamed: 0,ID,Bio
0,5cec23f47ea01e1600e103f1,follow my insta🤩 \n@maddie_hosford
1,5cb554e9b4bddb1500513c1e,Witty sarcasm and an Amateur in a whole lot of...
2,5b0f8377abca8d5f5851c905,i’ll fill this out later
3,57048ac20c7642621b0fd3af,Vote for Summer✌️\nTell me your favorite Vine 👀
4,5ba5f8ab81da12b41ee0b81c,looking for someone with a truck so I can reen...


In [13]:
# Lowercase every bio so the character vocabulary only needs a-z and digits.
train_texts = [bio.lower() for bio in df['Bio'].values]
# Length of the longest raw bio; used as the padded sequence length.
max_len = max(len(bio) for bio in bios)

In [14]:
# Peek at the first two lowercased bios.
train_texts[:2]

['follow my insta\U0001f929 \n@maddie_hosford',
 'witty sarcasm and an amateur in a whole lot of trades\n\naspiring musician who loves good music']

In [65]:
# Character-level tokenizer; characters missing from the vocabulary map to 'UNK'.
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
# NOTE(review): vocab_size (37) is not used in the visible cells and is one less
# than the one-hot width printed later (38 = padding 0 + 36 chars + UNK).
vocab_size = len(alphabet)+1

# Replace the fitted vocabulary with a fixed one: alphabet chars -> 1..36,
# then the out-of-vocabulary token right after the last character index.
char_dict = {char: idx for idx, char in enumerate(alphabet, start=1)}
tk.word_index = dict(char_dict)
tk.word_index[tk.oov_token] = len(char_dict) + 1

# Encode every bio as integer indices and right-pad with zeros to max_len.
max_len = max(len(bio) for bio in bios)
sequences = tk.texts_to_sequences(train_texts)
train_seq = pad_sequences(sequences, maxlen=max_len, padding='post')

In [66]:
# Inputs are the padded index sequences (copied so train_seq stays untouched).
X = train_seq.copy()
# Targets are the same sequences shifted left by one character, then padded
# back to max_len, so position t of y holds the character at t+1 of X.
y = pad_sequences(
    train_seq[:, 1:],
    maxlen=max_len,
    padding='post',
)

In [67]:
# One-hot encode inputs and targets with an explicit, shared depth.
# Without num_classes, to_categorical infers the depth from each array's own
# maximum value, so X and y could end up with different widths.
# Depth = every vocabulary index (chars + UNK) plus index 0 used for padding.
# NOTE: this cell overwrites X and y in place; re-run the previous cell first
# before re-running this one, or the arrays get encoded twice.
num_classes = len(tk.word_index) + 1
X = to_categorical(X, num_classes=num_classes)
y = to_categorical(y, num_classes=num_classes)
print(X.shape)
print(y.shape)

(1635, 500, 38)
(1635, 500, 38)


In [68]:
# Hold out 33% of the samples as the test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [69]:
# Carve 20% of the remaining training data off as a validation set.
X_training, X_val, y_training, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [70]:
# LSTM next-character model: one-hot characters in, per-timestep softmax out.
hidden_size = 100
# Width of the one-hot encoding (38 in the recorded run). Derive it from the
# data instead of hard-coding it so the layers cannot drift from the encoding.
n_chars = X.shape[2]
assert y.shape[2] == n_chars, "X and y must share the same one-hot width"

model = Sequential()
# First layer keeps the one-hot width, as in the original (38 units).
model.add(LSTM(n_chars, batch_size=None, input_shape=(X.shape[1], n_chars), return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))
model.add(LSTM(hidden_size, return_sequences=True))
# One probability per character class at every timestep.
model.add(Dense(n_chars, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print()
# added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 500, 38)           11704     
_________________________________________________________________
lstm_5 (LSTM)                (None, 500, 100)          55600     
_________________________________________________________________
lstm_6 (LSTM)                (None, 500, 100)          80400     
_________________________________________________________________
dense_2 (Dense)              (None, 500, 38)           3838      
Total params: 151,542
Trainable params: 151,542
Non-trainable params: 0
_________________________________________________________________
None


In [71]:
# Configure the optimiser, loss and reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Train for 100 epochs, reporting one line per epoch (verbose=2) and scoring
# the validation split after each epoch.
model.fit(
    X_training,
    y_training,
    epochs=100,
    verbose=2,
    validation_data=(X_val, y_val),
)

Train on 876 samples, validate on 219 samples
Epoch 1/100
 - 25s - loss: 1.7918 - acc: 0.7232 - val_loss: 0.9567 - val_acc: 0.7574
Epoch 2/100
 - 23s - loss: 0.9693 - acc: 0.7776 - val_loss: 0.8749 - val_acc: 0.8034
Epoch 3/100
 - 23s - loss: 0.9127 - acc: 0.7766 - val_loss: 0.7826 - val_acc: 0.7999
Epoch 4/100
 - 23s - loss: 0.7906 - acc: 0.7989 - val_loss: 0.7155 - val_acc: 0.8131
Epoch 5/100
 - 23s - loss: 0.7592 - acc: 0.8010 - val_loss: 0.7042 - val_acc: 0.8142
Epoch 6/100
 - 24s - loss: 0.7507 - acc: 0.8024 - val_loss: 0.6987 - val_acc: 0.8152
Epoch 7/100
 - 28s - loss: 0.7453 - acc: 0.8034 - val_loss: 0.6945 - val_acc: 0.8160
Epoch 8/100
 - 23s - loss: 0.7410 - acc: 0.8044 - val_loss: 0.6909 - val_acc: 0.8168
Epoch 9/100
 - 24s - loss: 0.7379 - acc: 0.8051 - val_loss: 0.6876 - val_acc: 0.8176
Epoch 10/100
 - 28s - loss: 0.7345 - acc: 0.8057 - val_loss: 0.6839 - val_acc: 0.8181
Epoch 11/100
 - 30s - loss: 0.7315 - acc: 0.8063 - val_loss: 0.6817 - val_acc: 0.8189
Epoch 12/100
 - 2

Epoch 96/100
 - 26s - loss: 0.4848 - acc: 0.8604 - val_loss: 0.4609 - val_acc: 0.8680
Epoch 97/100
 - 28s - loss: 0.4831 - acc: 0.8607 - val_loss: 0.4587 - val_acc: 0.8688
Epoch 98/100
 - 27s - loss: 0.4810 - acc: 0.8615 - val_loss: 0.4577 - val_acc: 0.8696
Epoch 99/100
 - 24s - loss: 0.4793 - acc: 0.8620 - val_loss: 0.4565 - val_acc: 0.8696
Epoch 100/100
 - 24s - loss: 0.4776 - acc: 0.8626 - val_loss: 0.4554 - val_acc: 0.8701


<keras.callbacks.History at 0x7f264fb8e4a8>

In [72]:
# Save the trained model to disk; a later cell reloads it with load_model.
model.save('../../../models/char_model.h5')

In [73]:
# Score the held-out test split; with metrics=['accuracy'] at compile time,
# evaluate returns the loss followed by the accuracy.
# NOTE(review): targets are zero-padded to max_len and no masking is applied,
# so this per-timestep accuracy presumably counts padded positions too, which
# would inflate it. TODO confirm.
score, acc = model.evaluate(X_test, y_test)
acc



0.8681851810879178

In [26]:
# Build an encoder from the trained model: its first two LSTM layers, with the
# second one returning only its final state as a fixed-size vector per bio.
model = load_model('../../../models/char_model.h5')
first_lstm, second_lstm = model.layers[0], model.layers[1]

model2 = Sequential()
# The first layer must reuse the trained weights as well. Previously it was
# left randomly initialised, so the trained second layer received inputs it
# was never trained on.
model2.add(LSTM(first_lstm.units,
                batch_size=None,
                input_shape=model.input_shape[1:],  # (timesteps, one-hot width)
                return_sequences=True,
                weights=first_lstm.get_weights()))
model2.add(LSTM(second_lstm.units, weights=second_lstm.get_weights()))

In [11]:
def process_data(test, maxlen=None):
    """One-hot encode raw texts with the fixed character vocabulary.

    Texts are lowercased, mapped character by character to integer indices
    (a-z and 0-9 -> 1..36, anything else -> UNK = 37), right-padded with zeros
    to ``maxlen`` and one-hot encoded.

    Parameters
    ----------
    test : iterable of str
        Raw texts to encode.
    maxlen : int, optional
        Padded sequence length. Defaults to the notebook-level ``max_len``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(test), maxlen, 38)``.
    """
    if maxlen is None:
        maxlen = max_len  # notebook-level value computed from the training bios
    texts = [s.lower() for s in test]
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
    # The vocabulary is fixed, so fitting the tokenizer on the input is not
    # needed: its fitted word_index was always overwritten here anyway.
    tk.word_index = {char: i for i, char in enumerate(alphabet, start=1)}
    tk.word_index[tk.oov_token] = len(alphabet) + 1
    # Tokenize the lowercased copy; the original passed the raw `test`,
    # leaving `texts` unused.
    sequences = tk.texts_to_sequences(texts)
    seq = pad_sequences(sequences, maxlen=maxlen, padding='post')
    # Fix the one-hot width (vocabulary + padding index 0). Otherwise it is
    # inferred from the largest index present and varies with the input.
    return to_categorical(seq, num_classes=len(tk.word_index) + 1)

In [53]:
# Reuse the first two training bios as a small sample input.
temp_test = train_texts[:2]
temp_test

['follow my insta\U0001f929 \n@maddie_hosford',
 'witty sarcasm and an amateur in a whole lot of trades\n\naspiring musician who loves good music']

In [54]:
# One-hot encode the sample bios so they can be fed to the encoder.
temp = process_data(temp_test)

In [55]:
# Encode both sample bios and compare the two resulting vectors.
f = model2.predict(temp)
vec_a, vec_b = f[0], f[1]
# NOTE(review): a similarity of ~1.0 for two unrelated bios looks suspicious.
# Inputs are post-padded to max_len, so the final LSTM state is read after a
# long run of padding steps; presumably masking or pre-padding is needed.
# TODO confirm.
np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))  # cos theta

0.99999994