In [2]:
import pandas as pd
import numpy as np 
import torch 
from sklearn.metrics import *
import nltk 
from Marbert.Preprocessing import clean_text
from sklearn.ensemble import RandomForestClassifier
from Marbert import BertClassifier
from scipy.special import softmax
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import os
from keras.preprocessing.text import tokenizer_from_json
import keras 
import json
from sklearn.model_selection import GridSearchCV as gs

In [4]:
def tokenize_data(text):
    word_list = nltk.word_tokenize(text)
    return ' '.join(word_list) 

In [9]:
data = pd.read_csv('Emotional-Tone-Dataset.csv')
data.dropna(inplace=True)
data.rename(columns={' TWEET':'tweet',' LABEL':'label'},inplace=True)
data.drop(columns=['ID'],inplace=True)
data['tweet'] = data['tweet'].astype(str)
data['tweet'] = data['tweet'].apply(clean_text)
data['tweet'] = data.tweet.apply(tokenize_data)
data.head()

Unnamed: 0,tweet,label
0,الاوليمبياد الجايه هكون لسه الكليه .,none
1,عجز الموازنه وصل ل93.7 الناتج المحلي يعني لسه ...,anger
2,كتنا نيله حظنا الهباب xD,sadness
3,جميعنا نريد تحقيق اهدافنا تونس تالقت حراسه الم...,joy
4,الاوليمبياد نظامها مختلف . ومواعيد المونديال م...,none


<h1>Encoding Labels</h1>
<span>same as MARBERT encoding</span>

In [10]:
Encoding_dict = {'none':0, 'anger':1, 'joy':2, 'sadness':3, 'love':4, 'sympathy':5, 'surprise':6, 'fear':7}
data['label'] = data['label'].map(Encoding_dict) 

In [11]:
data.head()

Unnamed: 0,tweet,label
0,الاوليمبياد الجايه هكون لسه الكليه .,0
1,عجز الموازنه وصل ل93.7 الناتج المحلي يعني لسه ...,1
2,كتنا نيله حظنا الهباب xD,3
3,جميعنا نريد تحقيق اهدافنا تونس تالقت حراسه الم...,2
4,الاوليمبياد نظامها مختلف . ومواعيد المونديال م...,0


<h1>Loading the Base Models<h1>

In [12]:
current_path = os.getcwd()
BertClassifier.initializeModel(os.path.join(current_path,'ModelsFiles/bert_classifier_model.pth'),os.path.join(current_path,'ModelsFiles/tokenizer.pkl'))
bertModel = BertClassifier.bertModel
bertTokenizer = BertClassifier.berTokenizer
biLstmModel = load_model('ModelsFiles/BiGRU_model.h5')
biGruModel = load_model('ModelsFiles/BiLSTM_model.h5')
tokenizer_config_file = "ModelsFiles/tokenizer_config.json"
with open(tokenizer_config_file, 'r') as f:
    tokenizer_config = json.load(f)
tokenizer_json = json.dumps(tokenizer_config)
bilstm_bigru_tokenizer = tokenizer_from_json(tokenizer_json)

No GPU available, using the CPU instead.



In [13]:
#HERE'S HOW TO REARRANGE ARRAY 
LSTM_GRU_MAP = [4,0,2,5,3,7,6,1]

In [14]:
x_train , x_test , y_train , y_test =  train_test_split(data['tweet'],data['label'],random_state=23,test_size=0.2)

In [15]:
X_LSTM_GRU_TRAIN = bilstm_bigru_tokenizer.texts_to_sequences(x_train.astype('str'))
X_LSTM_GRU_TRAIN = keras.preprocessing.sequence.pad_sequences(X_LSTM_GRU_TRAIN, padding='post', maxlen=300)
X_MARBERT_TRAIN = x_train
LABELS_TRAIN = y_train

In [27]:
lstm_predictions = biLstmModel.predict(X_LSTM_GRU_TRAIN)
gru_predictions = biGruModel.predict(X_LSTM_GRU_TRAIN)
marbert_predictions = [BertClassifier.predict(tweet).detach().numpy() for tweet in X_MARBERT_TRAIN]



<span><i>in case if you need to save them 
</i></span>

In [None]:
np.save("lstmpreds.npy",lstm_predictions)
np.save("grupreds.npy",gru_predictions)
np.save("bertpreds.npy",marbert_predictions)

let's inspect the marbert predictions


In [18]:
marbert_predictions[0]

array([-0.2781606 , -0.34234795,  0.14533697, -0.20055684, -0.83126587,
       -0.86984384,  2.4719045 , -0.39631644], dtype=float32)

<span>Marbert Outputs are obviously not a softmax layer output, it's just the final layer of marbert, so we need to apply softmax to it in order to ensure consistency among all outputs,<br> and also the lstm and gru labels positions in the softmax output are not the same  as the marbert model's labels' positions so we need to reorder the indicies or the positions of both lstm and gru to ensure that every thing is coherent and consistent</span>

In [23]:
marbert_predictions = np.squeeze(marbert_predictions)
marbert_predictions = softmax(marbert_predictions,axis=1)
for i in range(x_train.size):
    lstm_predictions[i][:] = lstm_predictions[i][LSTM_GRU_MAP]
    gru_predictions[i][:] = gru_predictions[i][LSTM_GRU_MAP]
    

<h3>now everything is consistent with each other and all the outputs have the same format, let's go on to build the meta model</h3>

In [24]:
x_RF_TRAIN = np.concatenate([marbert_predictions,lstm_predictions,gru_predictions],axis=1)
random_forest_classifier = RandomForestClassifier(n_estimators=50,random_state=23)
random_forest_classifier.fit(x_RF_TRAIN,LABELS_TRAIN)

now we're done, prepare the test data 

In [25]:
X_LSTM_GRU_TEST = bilstm_bigru_tokenizer.texts_to_sequences(x_test.astype('str'))
X_LSTM_GRU_TEST = keras.preprocessing.sequence.pad_sequences(X_LSTM_GRU_TEST, padding='post', maxlen=300)
X_MARBERT_TEST = x_test
LABELS_TEST = y_test

In [26]:
lstm_predictions = biLstmModel.predict(X_LSTM_GRU_TEST)
gru_predictions = biGruModel.predict(X_LSTM_GRU_TEST)
marbert_predictions = [BertClassifier.predict(tweet).detach().numpy() for tweet in X_MARBERT_TEST]



In [27]:
marbert_predictions = np.squeeze(marbert_predictions)
marbert_predictions = softmax(marbert_predictions,axis=1)
for i in range(x_test.size):
    lstm_predictions[i][:] = lstm_predictions[i][LSTM_GRU_MAP]
    gru_predictions[i][:] = gru_predictions[i][LSTM_GRU_MAP]

In [52]:
X_RF_TEST = np.concatenate([marbert_predictions,lstm_predictions,gru_predictions],axis=1)
rf_predections = random_forest_classifier.predict(X_RF_TEST)

In [29]:
print(rf_predections)

[4 2 5 ... 3 7 3]


In [53]:
accuracy = accuracy_score(rf_predections,LABELS_TEST)
print("ENSEMBLE MODELING ACCURACY : " , accuracy*100 ,"%")

ENSEMBLE MODELING ACCURACY :  88.37555886736214 %


In [54]:
rf_predections

array([4, 2, 5, ..., 3, 7, 3], dtype=int64)

In [37]:
LABELS_TEST = np.array(LABELS_TEST)

In [51]:
grf_predections

array([4, 2, 5, ..., 3, 7, 3], dtype=int64)

In [56]:
from sklearn.metrics._classification import classification_report
labels = ['none', 'anger', 'joy', 'sadness', 'love', 'sympathy', 'surprise', 'fear']
print(classification_report(y_true= LABELS_TEST,y_pred=rf_predections,target_names=labels,zero_division=1))

              precision    recall  f1-score   support

        none       0.84      0.87      0.85       321
       anger       0.89      0.90      0.90       269
         joy       0.89      0.81      0.85       266
     sadness       0.80      0.87      0.84       243
        love       0.88      0.91      0.89       249
    sympathy       0.97      0.95      0.96       214
    surprise       0.86      0.79      0.82       209
        fear       0.98      0.97      0.98       242

    accuracy                           0.88      2013
   macro avg       0.89      0.88      0.89      2013
weighted avg       0.89      0.88      0.88      2013

