Google Drive Auth

In [None]:
# Mount Google Drive at /content/gdrive (requires the google.colab runtime);
# the dataset is read from this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Data Pipeline**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 46.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.8 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installati

In [None]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import regex as re
import transformers
from keras import backend as K
import plotly.express as px

# Read the MBTI dataset from the mounted Drive, keeping only the first 4000 rows.
data = pd.read_csv('/content/gdrive/My Drive/dataset/mbti_1.csv').iloc[:4000]
data.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [None]:
# Number of rows per personality type in the loaded subset (shows the class imbalance).
data['type'].value_counts()

INFP    841
INFJ    693
INTP    583
INTJ    508
ENTP    331
ENFP    291
ISTP    170
ISFP    115
ENFJ     95
ENTJ     95
ISTJ     88
ISFJ     81
ESTP     49
ESFP     27
ESTJ     18
ESFJ     15
Name: type, dtype: int64



**This dataset contains quite a lot of URLs and symbols, so let's go ahead and remove those.**

In [None]:
def clean_text(data):
    """Normalise the raw ``posts`` column into lower-case, punctuation-free text.

    Each entry is lower-cased, its ``|||`` post separators are turned into
    spaces, URLs are stripped, punctuation is removed (or turned into a space
    for sentence punctuation and newlines) and runs of whitespace are collapsed.

    Args:
        data: object exposing a ``posts`` attribute that iterates over strings
            (here the MBTI DataFrame).

    Returns:
        list[str]: one cleaned string per entry of ``data.posts``, in order.
    """
    # Compiled once per call instead of once per row; the pattern itself is unchanged.
    url_pattern = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', flags=re.MULTILINE)
    whitespace_pattern = re.compile(r'\s+')

    # Symbols that are deleted outright vs. punctuation that separates words.
    removed_chars = '@#$%^&*()-_+={}[]|\\"\';:<>/'
    spaced_chars = ',.?!\n'
    punctuation_table = str.maketrans(
        {**dict.fromkeys(removed_chars), **dict.fromkeys(spaced_chars, ' ')}
    )

    cleaned_text = []
    for sentence in tqdm(data.posts):
        sentence = sentence.lower()

        # '|||' separates individual posts. Replacing it with a space (rather
        # than nothing) keeps the last word of one post from being glued to
        # the first word, or URL, of the next one.
        sentence = sentence.replace('|||', ' ')

        # Remove URLs, links etc.
        sentence = url_pattern.sub('', sentence)

        # Remove punctuation.
        sentence = sentence.translate(punctuation_table)

        # Remove extra white space.
        sentence = whitespace_pattern.sub(' ', sentence).strip()

        cleaned_text.append(sentence)

    return cleaned_text

In [None]:
# Overwrite the raw posts with their cleaned counterparts, then display the frame.
data['posts'] = clean_text(data)
data

  0%|          | 0/4000 [00:00<?, ?it/s]

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top ten play...
1,ENTP,im finding the lack of me in these posts very ...
2,INTP,good one course to which i say i know thats my...
3,INTJ,dear intp i enjoyed our conversation the other...
4,ENTJ,youre fired thats another silly misconception ...
...,...,...
3995,ISTP,i guess i should leave a bye or something i wo...
3996,INTJ,you have the sexiest mortons toe ive ever seen...
3997,ISTP,such pretty bras too bad they dont fit me the ...
3998,INTJ,hmm in terms of enhancing your mature i would ...


In [None]:
# Sorted array of the distinct personality-type labels present in the data.
types = np.unique(data.type.values)


def get_type_index(string):
    """Return the position of ``string`` within the sorted ``types`` array.

    Raises ValueError when ``string`` is not one of the known types.
    """
    return types.tolist().index(string)

In [None]:
# Encode every label as its integer class index and display the result.
data['type_index'] = data['type'].map(get_type_index)
data

Unnamed: 0,type,posts,type_index
0,INFJ,and intj moments sportscenter not top ten play...,8
1,ENTP,im finding the lack of me in these posts very ...,3
2,INTP,good one course to which i say i know thats my...,11
3,INTJ,dear intp i enjoyed our conversation the other...,10
4,ENTJ,youre fired thats another silly misconception ...,2
...,...,...,...
3995,ISTP,i guess i should leave a bye or something i wo...,15
3996,INTJ,you have the sexiest mortons toe ive ever seen...,10
3997,ISTP,such pretty bras too bad they dont fit me the ...,15
3998,INTJ,hmm in terms of enhancing your mature i would ...,10


**Initialize the BERT tokenizer and attention masks**

In [None]:
# Split dataset
from sklearn.model_selection import train_test_split

labels = data['type'].values
posts = data['posts'].values

# 80/20 split of the full frame with a fixed seed.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

train_size, test_size = len(train_data), len(test_data)
train_size, test_size

(3200, 800)

In [None]:
# Initialize the BERT tokenizer used to turn posts into token ids.
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences

# Checkpoint name whose vocabulary the tokenizer loads.
bert_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
# Sequence length every post is truncated/padded to.
MAX_LEN = 64
# A short maximum length was chosen because it helped minimise compute time.
def tokenize_sentences(sentences, tokenizer, max_seq_len = 64):
    """Encode every sentence into a list of token ids.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: object with an ``encode`` method accepting
            ``add_special_tokens``, ``max_length`` and ``truncation``
            (here a ``BertTokenizer``).
        max_seq_len: maximum number of ids per sentence, special tokens
            included; longer sentences are truncated.

    Returns:
        list[list[int]]: one id list per sentence, not padded.
    """
    tokenized_sentences = []

    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
            sentence,                   # Sentence to encode.
            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'.
            max_length=max_seq_len,     # Upper bound on the encoded length.
            # Request truncation explicitly: passing only `max_length` makes
            # the tokenizer warn and fall back to this same strategy.
            truncation=True,
        )
        tokenized_sentences.append(tokenized_sentence)

    return tokenized_sentences

def create_attention_masks(tokenized_and_padded_sentences):
    """Build a 0/1 mask per sequence: 1 where the token id is > 0, else 0.

    Args:
        tokenized_and_padded_sentences: iterable of equally long id sequences.

    Returns:
        np.ndarray holding one mask row per input sequence.
    """
    mask_rows = [
        [int(token_id > 0) for token_id in id_row]
        for id_row in tokenized_and_padded_sentences
    ]
    return np.asarray(mask_rows)

# Training set: encode the posts, pad/truncate to MAX_LEN with id 0, then derive the masks.
train_input_ids = tokenize_sentences(train_data['posts'], tokenizer, MAX_LEN)
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post" )
train_attention_masks = create_attention_masks(train_input_ids)

# Test set: same steps as for the training set.
# NOTE(review): the attention masks built here are not passed to the model
# anywhere in this notebook; only the input ids are.
test_input_ids = tokenize_sentences(test_data['posts'], tokenizer, MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

  0%|          | 0/3200 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/800 [00:00<?, ?it/s]

In [None]:
# Training hyper-parameters consumed by model.fit below.
BATCH_SIZE=10 # kept small for computation reasons
NR_EPOCHS= 10 # maximum number of training epochs

# BERT Model
* Load the pretrained BERT model (`bert-large-uncased`) from the Transformers library
* Take the first hidden state of the BERT output (the one corresponding to the [CLS] token) and feed it into a Dense layer with 16 neurons and softmax activation

In [None]:
#Define f1 functions for evaluation
def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    product = p * r
    total = p + r + K.epsilon()
    return 2 * (product / total)

def recall_m(y_true, y_pred):
    """Recall: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before summing;
    ``K.epsilon()`` guards against a zero denominator.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before summing;
    ``K.epsilon()`` guards against a zero denominator.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (predicted_count + K.epsilon())

In [None]:
# Reset Keras state and install a TF1-compat session whose graph rewriter has
# arithmetic optimization switched off.
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.compat.v1.keras.backend import set_session
tf.keras.backend.clear_session()  # For easy reset of notebook state.

config_proto = tf.compat.v1.ConfigProto()
off = rewriter_config_pb2.RewriterConfig.OFF
# Disable only the arithmetic-optimization rewrite pass.
config_proto.graph_options.rewrite_options.arithmetic_optimization = off
session = tf.compat.v1.Session(config=config_proto)
# Register the configured session with the compat Keras backend.
set_session(session)

In [None]:
def create_model(): 
    """Build and compile the BERT-based 16-way personality-type classifier.

    The model takes a ``(MAX_LEN,)`` int32 vector of token ids, runs it through
    the pretrained ``bert-large-uncased`` encoder, takes the hidden state at
    sequence position 0 (the [CLS] token) and feeds it to a 16-unit softmax
    layer.

    Returns:
        A compiled ``tf.keras`` model that expects one-hot labels and reports
        accuracy, F1, precision and recall.
    """
    input_word_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32,
                                           name="input_word_ids")
    bert_layer = transformers.TFBertModel.from_pretrained('bert-large-uncased')
    # First element of the BERT output: the per-token hidden states.
    bert_outputs = bert_layer(input_word_ids)[0]
    # Slice out position 0 of every sequence and classify it.
    pred = tf.keras.layers.Dense(16, activation='softmax')(bert_outputs[:,0,:])

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    # The head already applies softmax and the labels are one-hot, so plain
    # categorical cross-entropy is the matching loss. The previously defined,
    # never-used SparseCategoricalCrossentropy(from_logits=True) object was
    # removed because it matched neither.
    model.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00002),
        metrics=['accuracy', f1_m, precision_m, recall_m],
    )
    return model

In [None]:
# Instantiate the classifier (fetches the pretrained weights if they are not
# cached yet) and print its layer summary.
model = create_model()    
model.summary()

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 64)]              0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 335141888 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 16)                16400     
Total params: 335,158,288
Trainable params: 335,158,288
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Same definitions as in the data-pipeline section earlier in this notebook,
# repeated here.
types = np.unique(data.type.values)


def get_type_index(string):
    """Map a personality-type label to its index in the sorted ``types`` array."""
    return types.tolist().index(string)

In [None]:
# `train_data` is a slice produced by the split, so writing a column into it
# in place triggers pandas' SettingWithCopyWarning. `assign` returns a fresh
# frame with the same values and no warning.
train_data = train_data.assign(type_index=train_data['type'].apply(get_type_index))
train_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,type,posts,type_index
1161,ENTJ,im so hesitant to be completely open with how ...,2
2355,INTJ,bookshelf pornas a nonamerican please excuse m...,10
1831,INTJ,time to bump and see what the great estp presi...,10
156,INTJ,it would seem that i can relate to bits of bot...,10
195,INTJ,you guys in general are speaking nonsense in p...,10
...,...,...,...
835,INTJ,or the adams family butler think his names lur...,10
3264,ENFJ,hi entp and welcome to the forum winkfrom what...,0
1653,INFP,o i didnt think it was trolling i thought it w...,9
2607,INFP,hello all ive recently retaken the jungian per...,9


In [None]:
# One-hot encode the integer class indices of the training rows (16 classes).
one_hot_labels = tf.keras.utils.to_categorical(train_data.type_index.values, num_classes=16)

In [None]:
# No validation data is passed to fit, so EarlyStopping's default monitor
# ('val_loss') is never logged and the callback would silently do nothing.
# Monitor the training loss instead so the patience setting takes effect.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)
model.fit(np.array(train_input_ids), one_hot_labels, verbose=1, epochs=NR_EPOCHS,
          batch_size=BATCH_SIZE, callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f7ee34caf10>



**Run test and evaluate accuracy**

In [None]:
# `test_data` is a slice produced by the split, so writing a column into it
# in place triggers pandas' SettingWithCopyWarning. `assign` returns a fresh
# frame with the same values and no warning.
test_data = test_data.assign(type_index=test_data['type'].apply(get_type_index))
test_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,type,posts,type_index
2230,INFJ,no in my case i cant do that either there is n...,8
668,INFP,so my boyfriend died of an od three days ago i...,9
3616,INTJ,omg why we are old258274 nope the earth totall...,10
2363,INFP,i know i know i thought about it then felt bad...,9
142,ENFP,same story hereim feeling it a bit right now i...,1
...,...,...,...
1118,ISTP,for me its less about type and more about inte...,15
3572,INFJ,haha yes that is definitely a principle that i...,8
2482,INFJ,cant believe how old this thread is but i thou...,8
643,ENFP,haha to some extent i mean i get along with ev...,1


In [None]:
# One-hot encode the integer class indices of the test rows (16 classes).
test_labels = tf.keras.utils.to_categorical(test_data.type_index.values, num_classes=16)

In [None]:
# Evaluate on the held-out posts; the result lists the loss followed by the
# metrics in the order given to model.compile (accuracy, f1_m, precision_m, recall_m).
model.evaluate(np.array(test_input_ids), test_labels)



[4.308982849121094,
 0.27000001072883606,
 0.2626461088657379,
 0.28534477949142456,
 0.24375000596046448]

**The accuracy is not great because the dataset is very imbalanced, which leads to an overfitted model; hence the gap between the training score and the testing score.**

In [None]:
# Column i of the model output is the class whose `type_index` is i, i.e. the
# i-th label in sorted order. `data['type'].unique()` returns labels in order
# of first appearance, which attached the probabilities to the wrong types.
cols = np.unique(data['type'].values).tolist()

colnames = ['sentence'] + cols


In [None]:

# Empty frame with a 'sentence' column plus one column per personality type.
df_predict = pd.DataFrame(columns = colnames)
sentence = "Time to debate on it. Strike at the weakest point and make others cry with facts"

# Store the example sentence in row 0; the type columns are filled in after prediction.
df_predict.loc[0, 'sentence'] = sentence

In [None]:
# Encode and pad the example sentence the same way as the training posts.
# NOTE(review): unlike the training posts, the sentence is not passed through
# clean_text first - confirm whether that is intended.
sentence_inputs = tokenize_sentences(df_predict['sentence'], tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
prediction = model.predict(np.array(sentence_inputs))
# Write the predicted class probabilities into the per-type columns of row 0.
df_predict.loc[0, cols] = prediction

df_predict

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,sentence,INFJ,ENTP,INTP,INTJ,ENTJ,ENFJ,INFP,ENFP,ISFP,ISTP,ISFJ,ISTJ,ESTP,ESFP,ESTJ,ESFJ
0,Time to debate on it. Strike at the weakest po...,0.0429121,0.0953053,0.0127935,0.0211361,0.0389309,0.0203568,0.0520737,0.00667667,0.00354101,0.260125,0.0295566,0.0114677,0.112819,0.0088805,0.036219,0.247206
