In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
import tensorflow.keras.layers as layers
from tensorflow.keras import Input
import tensorflow_hub as hub
from nltk.corpus import stopwords

In [5]:
# TF-Hub handle of the Universal Sentence Encoder (version 4, per the URL).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Loaded once when this cell runs; every embed() call reuses this object.
embedding_model = hub.load(module_url)


def embed(input):
    """Run the loaded sentence encoder on ``input``.

    Args:
        input: Passed unchanged to ``embedding_model``. Elsewhere in this
            notebook it is a list of post strings. The name shadows the
            builtin ``input`` but is kept so keyword callers keep working.

    Returns:
        Whatever ``embedding_model`` returns for ``input``.
    """
    sentence_vectors = embedding_model(input)
    return sentence_vectors

In [7]:
data = pd.read_csv("../data/mbti_1.csv")

# Parallel per-post lists: sen_posts[i] is the embedding of one post, and
# sen_extra[i] / sen_nature[i] are the labels of the user who wrote it.
sen_extra = []
sen_posts = []
sen_nature = []
# Number of posts taken from each user; users with fewer posts are skipped.
number_posts = 10
types = pd.unique(data['type'])
code = {tp: i for i, tp in enumerate(types)}
rev_code = {i: tp for i, tp in enumerate(types)}

# Only the 'type' and 'posts' columns are needed, so iterate them directly
# instead of materialising a Series per row with iterrows().
for mbti_type, raw_posts in zip(data['type'], data['posts']):
    # All posts of one user live in a single string separated by '|||'.
    user_posts = raw_posts.split('|||')
    # Compare against number_posts (not a literal 10) so the label lists built
    # below always have exactly one entry per embedded post.
    if len(user_posts) < number_posts:
        continue
    embedded_posts = embed(user_posts[:number_posts])
    # Every post inherits its author's labels: 0 when the letter occurs in the
    # type string, 1 otherwise.
    labels_extra = [(0 if 'E' in mbti_type else 1)] * number_posts
    labels_nature = [(0 if 'T' in mbti_type else 1)] * number_posts
    sen_extra.extend(labels_extra)
    sen_nature.extend(labels_nature)
    # One NumPy vector per embedded post.
    sen_posts.extend(vector.numpy() for vector in embedded_posts)

In [8]:
embedding_dim = 512


def _dense_tower(features, units=(512, 256, 64), dropout_rate=.2):
    """Stack ``Dense(relu)`` + ``Dropout`` pairs on top of ``features``.

    Args:
        features: Keras tensor the first Dense layer is applied to.
        units: Width of each Dense layer, in the order they are stacked.
        dropout_rate: Rate passed to the Dropout layer that follows each
            Dense layer.

    Returns:
        The output tensor of the last Dropout layer.
    """
    hidden = features
    for width in units:
        hidden = layers.Dense(width, activation="relu")(hidden)
        hidden = layers.Dropout(dropout_rate)(hidden)
    return hidden


inputs = tf.keras.Input(shape=(embedding_dim,))

# Two towers with the same architecture but separate layers (no weight
# sharing), both reading the same input tensor. Each feeds one output head.
# NOTE(review): the towers are created before the heads on purpose; Keras
# presumably auto-names layers in creation order, so changing this order may
# change the layer names stored in saved JSON/weights -- confirm before moving.
drop1_3 = _dense_tower(inputs)
drop2_3 = _dense_tower(inputs)

# One sigmoid unit per head; the training call supplies one binary target
# array for each of them, in this order.
output1 = layers.Dense(1, activation='sigmoid')(drop1_3)
output2 = layers.Dense(1, activation='sigmoid')(drop2_3)

model = tf.keras.Model(inputs, outputs=[output1, output2])


# One binary cross-entropy loss per output head; the single 'accuracy' entry
# is given as a plain list, exactly as before.
binary_losses = ['binary_crossentropy', 'binary_crossentropy']
model.compile(optimizer='rmsprop', loss=binary_losses, metrics=['accuracy'])

# Features: one row per post embedding. Targets: one label array per head,
# in the same order as the model outputs.
train_features = np.stack(sen_posts)
train_targets = [np.array(sen_extra), np.array(sen_nature)]

# Checkpointing is not enabled. A ModelCheckpoint callback writing
# './sentence_multi_out_model.h5' (save_best_only=True, save_weights_only=True)
# was sketched here earlier but is not passed to fit().
history = model.fit(
    train_features,
    train_targets,
    epochs=10,
    batch_size=32,
    validation_split=.1,
)

# Persist the architecture as JSON and the learned weights as HDF5, in two
# separate files.
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("jsweights.h5")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
