In [1]:
import re
import string

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [2]:
df = pd.read_csv("/home/jason/mbti_model/mbti_1.csv")
df.head()
df.shape

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


(8675, 2)

In [3]:
types = np.unique(df.type.values)
def get_type_index(string):
    return list(types).index(string)

df['type_index'] = df['type'].apply(get_type_index)

In [4]:
def clean_text(text):
    regex = re.compile("[%s]" % re.escape("|"))
    text = regex.sub(" ", text)
    # Remove url links
    text = re.sub(
        "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        " ",
        text,
    )
    # Remove Non-words - keep only words
    text = re.sub("[^a-zA-Z]", " ", text)

    # Remove spaces > 1
    text = re.sub(" +", " ", text).lower()

    # Remove multiple letter repeating words
    text = re.sub(r"([a-z])\1{2,}[\s|\w]*", "", text)

    words = str(text).split()
    words = [i for i in words if not "http" in i]
    words = " ".join(words)
    words = words.translate(words.maketrans("", "", string.punctuation))
    return words


df["cleaned_text"] = df["posts"].apply(clean_text)

In [5]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df)
train, val = train_test_split(train)

In [6]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

vocab_size = 10000
trunc_type = "post"
pad_type = "post"
oov_tok = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df.cleaned_text.values)

2021-07-23 16:02:05.301223: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-23 16:02:05.301262: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [7]:
maxlen = 1500
train_sequences = tokenizer.texts_to_sequences(train.cleaned_text.values)
train_padded = pad_sequences(
    train_sequences, maxlen=maxlen, truncating=trunc_type, padding=pad_type
)

val_sequences = tokenizer.texts_to_sequences(val.cleaned_text.values)
val_padded = pad_sequences(
    val_sequences, maxlen=maxlen, truncating=trunc_type, padding=pad_type
)

In [8]:
train_padded

array([[   9,   68,   65, ...,    0,    0,    0],
       [  28,  185,   18, ...,    0,    0,    0],
       [ 130,    8,   15, ...,    0,    0,    0],
       ...,
       [3850, 1681,  898, ...,    0,    0,    0],
       [ 581,    1,  890, ...,    0,    0,    0],
       [  66,    2,  316, ...,  515,  178,  875]], dtype=int32)

In [9]:
one_hot_labels = tf.keras.utils.to_categorical(train.type_index.values, num_classes=16)
val_labels= tf.keras.utils.to_categorical(val.type_index.values, num_classes=16)

In [10]:
import transformers
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-large-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [11]:
maxlen = 1500

train_input_ids = [
    tokenizer.encode(str(i), max_length=maxlen, pad_to_max_length=True)
    for i in train.cleaned_text.values
]
val_input_ids = [
    tokenizer.encode(str(i), max_length=maxlen, pad_to_max_length=True)
    for i in val.cleaned_text.values
]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
def create_model():
    input_word_ids = tf.keras.layers.Input(
        shape=(maxlen,), dtype=tf.int32, name="input_word_ids"
    )
    bert_layer = transformers.TFBertModel.from_pretrained("bert-large-uncased")
    bert_outputs = bert_layer(input_word_ids)[0]
    pred = tf.keras.layers.Dense(16, activation="softmax")(bert_outputs[:, 0, :])

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        metrics=["accuracy"],
    )
    return model

In [13]:
use_tpu = False
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()

model.summary()

Downloading:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

2021-07-23 16:04:28.827214: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-07-23 16:04:28.827310: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-07-23 16:04:28.827358: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ubuntu-s-4vcpu-8gb-nyc1-01): /proc/driver/nvidia/version does not exist
2021-07-23 16:04:28.828013: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBe

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 1500)]            0         
_________________________________________________________________
tf_bert_model (TFBertModel)  

In [14]:
batch_size = 16

model.fit(
    np.array(train_input_ids),
    one_hot_labels,
    validation_data=(np.array(val_input_ids), val_labels),
    verbose=1,
    epochs=20,
    batch_size=batch_size,
    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)],
)

Epoch 1/20


2021-07-23 16:05:13.572166: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-07-23 16:05:13.572845: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2494135000 Hz




InvalidArgumentError:  indices[0,658] = 658 is not in [0, 512)
	 [[node model/tf_bert_model/bert/embeddings/Gather_1 (defined at home/jason/.local/lib/python3.8/site-packages/transformers/models/bert/modeling_tf_bert.py:198) ]] [Op:__inference_train_function_53315]

Errors may have originated from an input operation.
Input Source operations connected to node model/tf_bert_model/bert/embeddings/Gather_1:
 model/tf_bert_model/bert/embeddings/ExpandDims (defined at home/jason/.local/lib/python3.8/site-packages/transformers/models/bert/modeling_tf_bert.py:196)

Function call stack:
train_function
