In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../')

In [2]:
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
from personflow.loader import PersonalityPostLoader
from personflow.preparator import PersonContainer, TrainTestSplitter

In [4]:
loader = PersonalityPostLoader()
persons = loader.run()

container = PersonContainer(persons)
container.split_posts(10)
container.replace_personality_codes()
container.replace_links()
container.replace_digits()
container.evenly_distribute()
container.post_joiner()

Ignored 2339 texts because of too small chunks


In [5]:
ttsplitter = TrainTestSplitter(container)
train_x, train_y, test_x, test_y = ttsplitter.run()

Train size: Counter({'Introvert': 13574, 'Extrovert': 13573})
Test size: Counter({'Extrovert': 5818, 'Introvert': 5817})


In [6]:
train_x[1]

'doll i love all the movies you listed, and this makes me think about why your tritype is one of my faves. the healing ability often attributed to the $digit might somehow benefit the $digit   there... abuse/abandonment can cause $digit it basically freezes us emotionally. thus the black and white simplicity, the overwhelming emotions and the controlling tendencies, it´s all symptoms of deeper... swordsman of mana i know it´s been a million years since this thread got started, but if you´re still interested, could you tell me what you think about the $digit tritype? i love reading your comments... hi guys!  fionn whitehead stars in nolan´s dunkirk as the lead. he´s a newcomer and not too well media trained, so his reactions seem pretty genuine and i think i got his type just from watching... enneagram stacking might play a bigger role than the type itself at times, like during the first-impression phase writers go to when introducing someone new.  i´d look into the sp/sx forum for...'

In [7]:
tokenizer = Tokenizer(oov_token='$oov')
tokenizer.fit_on_texts(train_x)
word_index = tokenizer.word_index

print(f"Training data contains {len(word_index)} different words.")

Training data contains 60797 different words.


In [8]:
import attr

@attr.s
class Sequencer:
    
    num_tokens = attr.ib()
    seq_len = attr.ib()
    oov = attr.ib(default='$oov')
    pad_type = attr.ib(default='post')
    trunc_type = attr.ib(default='post')
        
    def fit(self, X):    
        self.tokenizer = Tokenizer(num_words=self.num_tokens, oov_token=self.oov)
        self.tokenizer.fit_on_texts(X)
        self.word_index = self.tokenizer.word_index
    
    def transform(self, X):
        X_seq = self.tokenizer.texts_to_sequences(X)
        X_seq_pad = pad_sequences(
            X_seq, maxlen=self.seq_len, padding=self.pad_type, truncating=self.trunc_type)
        return X_seq_pad
    
    def fit_transform(self, X):
        self.fit(X)
        X_seq_pad = self.transform(X)
        return X_seq_pad

In [9]:
vocab_size = 40000
seq_len = 400

In [10]:
seq = Sequencer(num_tokens=vocab_size, seq_len=seq_len)
train_x_sp = seq.fit_transform(train_x)
test_x_sp = seq.transform(test_x)

In [11]:
training_padded = np.array(train_x_sp)
training_labels = np.array(train_y)
print(training_labels[:5])
_, training_labels = np.unique(training_labels, return_inverse=True)
print(training_labels[:5])

testing_padded = np.array(test_x_sp)
testing_labels = np.array(test_y)
_, testing_labels = np.unique(testing_labels, return_inverse=True)

['Introvert' 'Extrovert' 'Introvert' 'Extrovert' 'Extrovert']
[1 0 1 0 0]


In [12]:
embedding_dim = 16
epochs = 30

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=seq_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           640000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 640,433
Trainable params: 640,433
Non-trainable params: 0
_________________________________________________________________


In [14]:
num_epochs = 30
history = model.fit(
    training_padded, training_labels, epochs=num_epochs, 
    validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/30
849/849 - 7s - loss: 0.6842 - accuracy: 0.5549 - val_loss: 0.6671 - val_accuracy: 0.5960
Epoch 2/30
849/849 - 7s - loss: 0.6165 - accuracy: 0.6658 - val_loss: 0.6666 - val_accuracy: 0.6003
Epoch 3/30
849/849 - 7s - loss: 0.5291 - accuracy: 0.7411 - val_loss: 0.7036 - val_accuracy: 0.6003
Epoch 4/30
849/849 - 7s - loss: 0.4504 - accuracy: 0.7904 - val_loss: 0.7700 - val_accuracy: 0.5863
Epoch 5/30
849/849 - 7s - loss: 0.3847 - accuracy: 0.8293 - val_loss: 0.8633 - val_accuracy: 0.5791
Epoch 6/30
849/849 - 7s - loss: 0.3286 - accuracy: 0.8594 - val_loss: 0.9457 - val_accuracy: 0.5795
Epoch 7/30
849/849 - 8s - loss: 0.2819 - accuracy: 0.8847 - val_loss: 1.0533 - val_accuracy: 0.5753
Epoch 8/30
849/849 - 8s - loss: 0.2464 - accuracy: 0.8993 - val_loss: 1.1752 - val_accuracy: 0.5738
Epoch 9/30
849/849 - 8s - loss: 0.2128 - accuracy: 0.9176 - val_loss: 1.2931 - val_accuracy: 0.5716
Epoch 10/30
849/849 - 8s - loss: 0.1844 - accuracy: 0.9298 - val_loss: 1.4170 - val_accuracy: 0.5674

In [16]:
sentences = [
    'This is so crazy you have to look at this haha. I can not believe it. This is so awesome. Nice lol.',
    'Had a nice day at the beach. link.',
]

sentences = np.array(seq.transform(sentences))
model.predict(sentences)

array([[0.00380972],
       [0.21285512]], dtype=float32)