In [157]:
import os
import re
import numpy as np 
import pandas as pd
import tensorflow_text # must import this
import tensorflow as tf
import tensorflow_hub as hub
import preprocessor

In [158]:
from tqdm import tqdm
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.losses import BinaryCrossentropy
from keras import backend as K
# from transformers import TFAutoModel, AutoTokenizer 
from sklearn.model_selection import train_test_split
# from official.nlp import optimization

tqdm.pandas()

In [159]:

# load dataset
D = pd.read_csv('../inputs/go-emotions-google-emotions-dataset/go_emotions_dataset.csv')
D.head(3)

Unnamed: 0,id,text,example_very_unclear,admiration,amusement,anger,annoyance,approval,caring,confusion,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,eew5j0j,That game hurt.,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,eemcysk,>sexuality shouldn’t be a grouping category I...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ed2mah1,"You do right, if you don't care then fuck 'em!",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [160]:
# common spelling mistakes. 
S = pd.read_csv('../inputs/spelling/aspell.txt', sep=":",names=["correction","misspell"])
S['misspell'] = S['misspell'].str.strip()
S['misspell'] = S['misspell'].str.split(' ')
S = S.explode('misspell').reset_index(drop=True)
S = S.drop_duplicates('misspell')
S.head(3)

Unnamed: 0,correction,misspell
0,Nevada,nevade
1,Presbyterian,presbyterian
2,RSX,rsx


In [161]:
# replacing constraction i.e. shoudn't => shoud not
C = pd.read_csv('../inputs/contractions/contractions.csv')
C.columns = ['contraction', 'meaning']
C = C[['meaning', 'contraction']]
C.head(3)

Unnamed: 0,meaning,contraction
0,alright,'aight
1,is not,ain't
2,am not,amn't


In [162]:
mapChar = {
    '’': '\''
}
mapC = dict(zip(C['contraction'], C['meaning']))
mapS = dict(zip(S['misspell'], S['correction']))
punctuations = '''()-[]{};:'"\,<>./@#$%^&_~'''
mapP = dict(zip(list(punctuations), [""] * len(punctuations)))

def word_mapper(text, mapper):
    for word in text.split(' '):
        if word in mapper:
            text = text.replace(word, mapper[word])
    return text

def char_mapper(text, mapper):
    for k, v  in mapper.items():
        text = text.replace(k, v)
    return text

In [163]:
re_number = re.compile('[0-9]+')
re_url = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
re_tag = re.compile('\[[A-Z]+\]')
re_char = re.compile('[^0-9a-zA-Z\s?!.,:\'\"//]+')
re_char_clean = re.compile('[^0-9a-zA-Z\s?!.,\[\]]')
re_punc = re.compile('[?!,.\'\"]')


def clean_text(text):
    text = re.sub(re_char, "", text) # Remove unknown character
    text = char_mapper(text, mapChar) # Similar characters mapping
    text = word_mapper(text, mapC) # Remove contraction
    text = word_mapper(text, mapS) # Remove spelling mistakes

    text = re.sub(re_number, ' [number] ', text) # Replace number with tag
    text = re.sub(re_url, ' [url] ', text) # Replace URL with number

    text = re.sub(re_punc, lambda a: f" {a.group(0)} ", text) # Add space between punctuation
    text = preprocessor.clean(text) # Remove tweet clean

    text = re.sub(re_char_clean, "", text) # Only alphanumeric and punctuations.
    text = text.lower() # Lower text
    text = " ".join([w for w in text.split(' ') if w != " "]) # Remove whitespace

    return text

In [164]:
D['clean_text'] = D['text'].progress_apply(clean_text)
D.head(3)

 33%|███▎      | 70370/211225 [00:06<00:13, 10234.80it/s]


KeyboardInterrupt: 

In [None]:
emotions = [
  'admiration', 'amusement',
  'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity',
  'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment',
  'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love',
  'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse',
  'sadness', 'surprise', 'neutral',           
]

D = D[D['clean_text'] != '']
len(D)

211208

In [None]:
X = D['clean_text'].to_numpy()
y = D[emotions].to_numpy()

Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0, 
                                      test_size=0.2, shuffle=True)
Xtr.shape, Xte.shape, ytr.shape, yte.shape

((168966,), (42242,), (168966, 28), (42242, 28))

In [None]:
bert_model_name = 'small_bert/bert_en_uncased_L-2_H-128_A-2' 
bert_url = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1"
process_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

bert_process = hub.KerasLayer(process_url)
bert_layer = hub.KerasLayer(bert_url)

In [None]:
bert_layer(
    bert_process(['hello! this is great!'])
)

{'default': <tf.Tensor: shape=(1, 128), dtype=float32, numpy=
 array([[-0.9999857 ,  0.12610345, -0.99892527,  0.59368974, -0.99759835,
         -0.29068032, -0.99827355, -0.7794839 ,  0.21302669, -0.0515669 ,
         -0.89546645, -0.12966546, -0.05810805,  1.        , -0.8678081 ,
         -0.99703246,  0.91280806,  0.01160046, -0.88476884,  0.99767053,
          0.9285923 ,  0.06675816,  0.99494106,  0.8786861 , -0.99999475,
         -0.04982174, -0.99985886,  0.8758371 ,  0.99790245,  0.09025789,
         -0.03717682,  0.08605155, -0.9977154 , -0.71280843,  0.67639875,
          0.9999511 , -0.9197299 , -0.03752451,  0.93575305, -0.99979514,
          0.9801863 ,  0.9964594 , -0.9997192 ,  0.9532534 , -0.9999794 ,
         -0.09961445, -0.9998438 ,  0.9987609 ,  0.9503094 ,  0.9928909 ,
          0.9533225 , -0.8913298 , -0.08563032,  0.8767007 ,  0.99974805,
          0.9992685 , -0.9814322 , -0.29180354,  0.90239376, -0.89829767,
         -0.00844584,  0.5710738 , -0.82969713,  0

In [None]:
def build_classifier(bert_url, 
process_url, 
    output_size=28
):
    input_layer = Input(shape=(), dtype=tf.string, name='text')
    process_layer = hub.KerasLayer(process_url, name='process')
    bert_layer = hub.KerasLayer(bert_url, name='bert')

    output = bert_layer(process_layer(input_layer))
    pooled = output['pooled_output']
    
    net = Dropout(0.1)(pooled)
    net = Dense(output_size, activation=None, name='classifer')(net)
    
    return Model(input_layer, net)

In [None]:
model = build_classifier(
    bert_url, 
    process_url, 
    output_size=28
)

logit = model(tf.constant(['hello my name is jeongwon']))
tf.sigmoid(logit)

<tf.Tensor: shape=(1, 28), dtype=float32, numpy=
array([[0.8270866 , 0.16245994, 0.16185725, 0.80439925, 0.17765903,
        0.57906646, 0.58625025, 0.79262316, 0.29069227, 0.31320122,
        0.7350453 , 0.71015435, 0.32183486, 0.6838058 , 0.761464  ,
        0.51617026, 0.7275727 , 0.29083794, 0.6690375 , 0.15876535,
        0.84571725, 0.7418865 , 0.5963786 , 0.6508641 , 0.2652053 ,
        0.37027314, 0.29956365, 0.60552126]], dtype=float32)>

In [None]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 process (KerasLayer)           {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                

In [None]:
loss = BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

epochs = 5
batch_size = 32 

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy', 
    metrics=['binary_accuracy']
)



In [64]:
prior = y.mean(axis=0)
prior

array([0.08110962, 0.04377202, 0.03827507, 0.06447672, 0.08342487,
       0.02840328, 0.03484243, 0.04588841, 0.01807223, 0.04009791,
       0.05408886, 0.02509848, 0.01172304, 0.02665145, 0.01513674,
       0.05504053, 0.00318643, 0.03779686, 0.03878167, 0.00856975,
       0.04126264, 0.00616454, 0.04159407, 0.00610299, 0.01195504,
       0.03199689, 0.02610697, 0.26177512])

In [125]:
def average_f1_binary(y_true, logit):
    proba = tf.sigmoid(logit)
    y_pred = tf.cast((proba >= prior), tf.float32)
    
    tp = K.sum(K.round(K.clip(y_pred * y_true, 0, 1)), axis=1)
    
    possible_positive = K.sum(K.round(K.clip(y_true, 0, 1)), axis=1)
    predicted_positive = K.sum(K.round(K.clip(y_true, 0, 1)), axis=1)

    precision = tp / (possible_positive + K.epsilon())
    recall = tp / (predicted_positive + K.epsilon())

    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    
    return K.mean(f1_val)

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy', 
    metrics=['binary_accuracy', average_f1_binary]
)


In [146]:
def average_f1_binary(y_true, logit):
    proba = tf.sigmoid(logit)
    y_pred = tf.cast((proba >= prior), tf.float32)
    
    tp = K.sum(K.round(K.clip(y_pred * y_true, 0, 1)), axis=1)
    
    possible_positive = K.sum(K.round(K.clip(y_true, 0, 1)), axis=1)
    predicted_positive = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=1)

    precision = tp / (possible_positive + K.epsilon())
    recall = tp / (predicted_positive + K.epsilon())

    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    
    return K.mean(f1_val)




In [154]:
tf.test.is_gpu_available(cuda_only=True)

False

In [155]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [156]:
from tensorflow.python.platform import build_info as tf_build_info
print(tf_build_info.cuda_version_number)
# 9.0 in v1.10.0
print(tf_build_info.cudnn_version_number)

AttributeError: module 'tensorflow.python.platform.build_info' has no attribute 'cuda_version_number'

In [147]:

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy', 
    metrics=['binary_accuracy', average_f1_binary]
)

history = model.fit(Xtr,ytr, validation_split=0.2, epochs=epochs, batch_size=batch_size)
history

Epoch 1/5
 241/4225 [>.............................] - ETA: 7:15 - loss: 0.5640 - binary_accuracy: 0.9531 - average_f1_binary: 0.0773

KeyboardInterrupt: 