## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import json
import os
import keras
from keras_bert import extract_embeddings
from keras_bert import load_trained_model_from_checkpoint
import codecs
from keras_bert import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed,Dense
from keras_contrib.layers import CRF
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

## GPU Setting

In [2]:
import tensorflow as tf
def select_gpu(N):
    """Enable memory growth on every GPU and expose only GPU number ``N``.

    Prints the list of physical GPUs first. When at least one GPU is present,
    memory growth is switched on for each of them, the ``N``-th one is made the
    only visible device, and the physical/logical GPU counts are printed.
    A ``RuntimeError`` raised during that configuration is printed instead of
    being propagated.

    Args:
        N: position of the GPU to use within the physical GPU list.
    """
    physical = tf.config.experimental.list_physical_devices('GPU')
    print(physical)
    if not physical:
        return
    try:
        for device in physical:
            tf.config.experimental.set_memory_growth(device, True)
        # restrict TensorFlow to the chosen physical GPU
        tf.config.experimental.set_visible_devices(physical[N], 'GPU')
        logical = tf.config.experimental.list_logical_devices('GPU')
        print(len(physical), "Physical GPUs,", len(logical), "Logical GPUs")
    except RuntimeError as e:
        print(e)


select_gpu(0)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]
3 Physical GPUs, 1 Logical GPUs


## Load Data

In [3]:
# Parse the tweet dump: every non-empty line is decoded as one JSON document.
json_list = list()
# Read as UTF-8 explicitly: the tweets contain emoji and other non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere.
with open('tweets_DM.json', 'r', encoding='utf-8') as file:
    for line in file:
        if line.strip():  # tolerate blank lines such as a trailing newline
            json_list.append(json.loads(line))

In [4]:
# Pull the tweet id and the tweet text out of each parsed record.
# The iteration variable is deliberately not called `json`: that name would
# rebind the imported `json` module to a dict and break every later
# `json.loads` call (for example when the loading cell is re-run).
tweet_list = [
    [record['_source']['tweet']['tweet_id'], record['_source']['tweet']['text']]
    for record in json_list
]

In [5]:
# Tabulate the extracted [tweet_id, text] pairs as a two-column DataFrame.
text_df = pd.DataFrame(tweet_list, columns=['tweet_id', 'text'])

In [6]:
# Emotion labels; joined to the tweet text on `tweet_id` further down.
emotion_df = pd.read_csv('emotion.csv')

In [7]:
# Per-tweet split marker; its `identification` column is used to select the rows tagged 'test'.
identification_df = pd.read_csv('data_identification.csv')

In [8]:
# Keep only the ids flagged as 'test' and attach their tweet text.
public_test_df = (
    identification_df
    .loc[identification_df['identification'] == 'test']
    .merge(text_df, on='tweet_id')
)

In [9]:
# Join every tweet's text with its emotion label on the shared tweet id.
train_df = pd.merge(text_df, emotion_df, on='tweet_id')

In [10]:
train_df

Unnamed: 0,tweet_id,text,emotion
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear
3,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,joy
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation
...,...,...,...
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy
1455560,0x2cbca6,there's currently two girls walking around the...,joy
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy


In [11]:
# Hold out 20% of the labelled tweets as a local test set.
# A fixed random_state makes the split reproducible across kernel restarts;
# without it every run draws a different hold-out sample.
test_df = train_df.sample(frac=0.2, random_state=42)
test_df

Unnamed: 0,tweet_id,text,emotion
371652,0x2ba271,@ladyzee70 I can't wrap my head around how thi...,sadness
1215559,0x252997,Going to counselling today despite the cold an...,trust
141657,0x37dc9e,#Routines are #never quite the #same #all the ...,surprise
1012673,0x33db3c,What the funk <LH> no sex scenes in the first ...,disgust
1004982,0x286709,@WFDirect popped into @follieskent #Hythe for ...,anticipation
...,...,...,...
333621,0x212664,I want a fucking Fresca. #Fresca <LH>,anticipation
149726,0x38028d,"And through my worries and fears, my prayers s...",joy
967835,0x36be02,"Natural disasters, shootings and family death....",anticipation
1350815,0x1f79aa,#Gm world thanking my #Heavenly <LH> 4another ...,anticipation


In [12]:
# Training split = every labelled row that was not sampled into test_df.
# Rows are removed by index label. This avoids DataFrame.append (removed in
# pandas 2.0) and the drop_duplicates(keep=False) trick, which would also
# discard rows that are exact duplicates within train_df itself.
train_df1 = train_df.drop(index=test_df.index)
train_df1

Unnamed: 0,tweet_id,text,emotion
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",anticipation
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",sadness
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,fear
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,anticipation
5,0x368e95,Love knows no gender. 😢😭 <LH>,joy
...,...,...,...
1455557,0x30cc9c,Waking up with only a slight headache after bl...,joy
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,joy
1455559,0x38959e,In every circumtance I'd like to be thankful t...,joy
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",joy


In [13]:
public_test_df.head()

Unnamed: 0,tweet_id,identification,text
0,0x28cc61,test,@Habbo I've seen two separate colours of the e...
1,0x2db41f,test,@FoxNews @KellyannePolls No serious self respe...
2,0x2466f6,test,"Looking for a new car, and it says 1 lady owne..."
3,0x23f9e9,test,@cineworld “only the brave” just out and fount...
4,0x1fb4e1,test,Felt like total dog 💩 going into open gym and ...


## Data preprocessing

In [14]:
from sklearn.preprocessing import LabelEncoder

# Learn the emotion-name -> integer-id mapping from the training split, then
# overwrite the 'emotion' column with those integer ids.
labelencoder = LabelEncoder()
labelencoder.fit(train_df1['emotion'])
train_df1['emotion'] = labelencoder.transform(train_df1['emotion'])
train_df1

Unnamed: 0,tweet_id,text,emotion
0,0x376b20,"People who post ""add me on #Snapchat"" must be ...",1
1,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...",5
2,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,3
4,0x2c91a8,Still waiting on those supplies Liscus. <LH>,1
5,0x368e95,Love knows no gender. 😢😭 <LH>,4
...,...,...,...
1455557,0x30cc9c,Waking up with only a slight headache after bl...,4
1455558,0x321566,I'm SO HAPPY!!! #NoWonder the name of this sho...,4
1455559,0x38959e,In every circumtance I'd like to be thankful t...,4
1455561,0x24faed,"Ah, corporate life, where you can date <LH> us...",4


In [15]:
# Encode the hold-out labels with the mapping already fitted on the training
# split. Calling fit_transform here would refit `labelencoder` on test_df, so
# the ids could stop matching the training ids (e.g. when a class is missing
# from the sample), and the refitted encoder would also be the one used to
# decode predictions later.
test_df['emotion'] = labelencoder.transform(test_df['emotion'])
test_df

Unnamed: 0,tweet_id,text,emotion
371652,0x2ba271,@ladyzee70 I can't wrap my head around how thi...,5
1215559,0x252997,Going to counselling today despite the cold an...,7
141657,0x37dc9e,#Routines are #never quite the #same #all the ...,6
1012673,0x33db3c,What the funk <LH> no sex scenes in the first ...,2
1004982,0x286709,@WFDirect popped into @follieskent #Hythe for ...,1
...,...,...,...
333621,0x212664,I want a fucking Fresca. #Fresca <LH>,1
149726,0x38028d,"And through my worries and fears, my prayers s...",4
967835,0x36be02,"Natural disasters, shootings and family death....",1
1350815,0x1f79aa,#Gm world thanking my #Heavenly <LH> 4another ...,1


## Load Bert Dictionary

In [16]:
# Download the BERT `tokenization.py` helper so it can be imported as a local module.
# NOTE(review): the URL tracks the `master` branch, so the fetched file is not pinned to a fixed revision.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
import tensorflow_hub as hub
import tokenization

In [None]:
# Load the TF Hub BERT module as a trainable Keras layer (its weights are fine-tuned during fit).
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)
# Build the tokenizer from the vocabulary file and casing flag exposed by the loaded module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

## Encode and Decode Process

In [18]:
def label_decode(le, one_hot_label):
    """Turn per-class scores back into the encoder's original labels.

    Args:
        le: fitted encoder exposing ``inverse_transform``.
        one_hot_label: 2-D scores, one row per sample and one column per class.

    Returns:
        Whatever ``le.inverse_transform`` returns for the index of the largest
        value in each row.
    """
    return le.inverse_transform(np.argmax(one_hot_label, axis=1))

In [19]:
def bert_encode(texts, tokenizer, max_len=50):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with id 0 up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: total sequence length including the two special tokens;
            must be at least 2.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(len(texts), max_len)``. ``masks`` is 1 on real tokens and 0 on
        padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: if ``max_len`` is smaller than 2, because there would be no
            room for ``[CLS]`` and ``[SEP]`` and rows would exceed ``max_len``.
    """
    if max_len < 2:
        raise ValueError("max_len must be >= 2 to fit [CLS] and [SEP], got %r" % (max_len,))

    body_len = max_len - 2  # tokens left once [CLS] and [SEP] are accounted for
    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        pieces = tokenizer.tokenize(text)[:body_len]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # The reshape keeps the (n, max_len) layout even when `texts` is empty;
        # a bare np.array([]) would collapse to shape (0,).
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

In [21]:
# Sequence length per tweet, counting the [CLS] and [SEP] markers that bert_encode adds.
max_len = 50
# Each *_input is the (token ids, masks, segment ids) tuple returned by bert_encode.
train_input = bert_encode(train_df1.text.values, tokenizer, max_len=max_len)
test_input  = bert_encode(test_df.text.values, tokenizer, max_len=max_len)
public_test_input = bert_encode(public_test_df.text.values, tokenizer, max_len=max_len)


In [22]:
# One-hot encode the integer emotion ids to match the 8-way softmax output
# trained with categorical cross-entropy.
# Plain single assignments: a chained `x = tf.onclick = ...` would also set a
# meaningless `onclick` attribute on the tensorflow module.
train_labels = tf.keras.utils.to_categorical(train_df1.emotion.values, num_classes=8)
test_labels = tf.keras.utils.to_categorical(test_df.emotion.values, num_classes=8)

## Build BERT Model

In [20]:
def build_model(bert_layer, max_len=512, num_classes=8, learning_rate=1e-5):
    """Build and compile a BERT classifier with a small dense head.

    The model takes three int32 inputs of length ``max_len`` (word ids, input
    mask, segment ids), feeds them through ``bert_layer``, takes the sequence
    output at position 0 and passes it through Dense(64) -> Dropout(0.2) ->
    Dense(32) -> Dropout(0.2) -> Dense(num_classes, softmax).

    Args:
        bert_layer: callable layer that accepts
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: length of each input sequence.
        num_classes: number of output classes (size of the softmax layer).
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model using categorical cross-entropy loss and
        the accuracy metric.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token sequence output is used; the pooled output is ignored.
    _pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Representation at position 0 (the [CLS] slot when inputs come from bert_encode).
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [23]:
# Build the classifier with the same sequence length that was used to encode the inputs.
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 50)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 50)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 50)]         0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]      

In [24]:
# The checkpoint is written into 'model/'; create the directory up front so the
# first save cannot fail because it is missing.
os.makedirs('model', exist_ok=True)

# Save the model each time validation accuracy reaches a new best.
checkpoint = tf.keras.callbacks.ModelCheckpoint('model/model_3.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
# NOTE(review): patience=5 is larger than epochs=3 below, so early stopping cannot trigger in this run.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# 20% of the training arrays is set aside for validation by Keras itself.
train_history = model.fit(
    train_input,
    train_labels,
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint, earlystopping],
    batch_size=32)

Epoch 1/4
Epoch 00001: val_accuracy improved from -inf to 0.63994, saving model to model/model_4.h5
Epoch 2/4
Epoch 00002: val_accuracy improved from 0.63994 to 0.65543, saving model to model/model_4.h5
Epoch 3/4
  536/29112 [..............................] - ETA: 1:06:41 - loss: 0.9211 - accuracy: 0.6767

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 00003: val_accuracy improved from 0.65543 to 0.66207, saving model to model/model_4.h5
Epoch 4/4
Epoch 00004: val_accuracy improved from 0.66207 to 0.66268, saving model to model/model_4.h5


## Evaluate Model

In [26]:
# Serialise the model architecture to JSON and write it next to the .h5 checkpoint.
model_json = model.to_json()
with open("model/model_3.json", "w") as json_file:
    json_file.write(model_json)

In [27]:
# Restore the checkpoint file written during training, then score the public test tweets.
model.load_weights('model/model_3.h5')
# One row of class probabilities (softmax output) per public test tweet.
test_pred = model.predict(public_test_input,batch_size=32)

In [28]:
# decode prediction results into labels
y_pred = label_decode(labelencoder, test_pred)

In [29]:
public_test_df['predict'] = y_pred

In [30]:
public_test_df

Unnamed: 0,tweet_id,identification,text,predict
0,0x28cc61,test,@Habbo I've seen two separate colours of the e...,joy
1,0x2db41f,test,@FoxNews @KellyannePolls No serious self respe...,sadness
2,0x2466f6,test,"Looking for a new car, and it says 1 lady owne...",disgust
3,0x23f9e9,test,@cineworld “only the brave” just out and fount...,disgust
4,0x1fb4e1,test,Felt like total dog 💩 going into open gym and ...,joy
...,...,...,...,...
411967,0x2c4dc2,test,6 year old walks in astounded. Mum! Look how b...,surprise
411968,0x31be7c,test,Only one week to go until the #inspiringvolunt...,anticipation
411969,0x1ca58e,test,"I just got caught up with the manga for ""My He...",sadness
411970,0x35c8ba,test,Speak only when spoken to and make hot ass mus...,joy


In [31]:
# Two-column output frame: the tweet id as 'id' and the predicted label as 'emotion'.
output_df = (
    public_test_df[['tweet_id', 'predict']]
    .rename(columns={'tweet_id': 'id', 'predict': 'emotion'})
)

In [32]:
# Write the predictions to CSV with a header row and without the index column.
output_df.to_csv('Bert_epoch_3.csv', index=False, header=True)