In [1]:
import os, zipfile
import tensorflow as tf
import pandas as pd
import demoji
import csv
import re
import string
import pickle

2024-04-23 16:09:47.052137: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# extracting files
zip_path = 'archive.zip'
extract_dir = './datasets'

with zipfile.ZipFile(zip_path, 'r') as zip_file:
    zip_file.extractall(extract_dir)


ds_path = './datasets/'


#check data
df_train_ds = pd.read_csv(ds_path+'twitter_training.csv', names=['ID','user','SC','Comment'])
df_test_ds = pd.read_csv(ds_path+'twitter_validation.csv', names=['ID','user','SC','Comment'])
set(df_train_ds['SC']) # irrelevant, neutral, negative, positive
df_train_ds.iloc[11594]

ID                              13192
user                    Xbox(Xseries)
SC                           Positive
Comment    Lol. Nice! Now I want one.
Name: 11594, dtype: object

Previous line "Lol. Nice! Now I want one" shows that the pandas it is not taking the actually emoji. However, the data.TextLineDataset is it actually. However it is not taking the emoji itself but the USCII code instead. What should I do in this case. I prefer interfering and generating a data replacement for adding more context to the sentences based on the emoji expression (sentiment analysis). Something called [MASK] in tensorflow terms, not equal, but relating, creating a "mask" or a replacement over some objects in the data in this case for proportioning more information about the emotions from the prompter.

In [5]:
import json

# import the json file
with open('emoji-unicode.json') as json_file:
    emoji_dict= json.load(json_file)

# remove the unnecessary keys and values from the dictionary 
trash_keys = ['no',
              'emoji',
              'flagged',
              'keywords']

for trash_key in trash_keys:
    for _ in range(len(emoji_dict['emojis'])):
        emoji_dict['emojis'][_].pop(trash_key,None)

In [6]:
# processing the format to unicode
def format_to_unicode(s):
    code_point = int(s[2:], 16)
    return chr(code_point)

list_emoji_dict = emoji_dict['emojis']
len_list_emoji_dict = len(list_emoji_dict)
for _ in range(len_list_emoji_dict):
    for code, description in list_emoji_dict[_].items():
        emoji_uni = list_emoji_dict[_]['code']
        emoji_desc = list_emoji_dict[_]['description']
        df_train_ds['Comment'] = df_train_ds['Comment'].str.replace(format_to_unicode(emoji_uni), emoji_desc)
        df_test_ds['Comment'] = df_test_ds['Comment'].str.replace(format_to_unicode(emoji_uni), emoji_desc)

In [7]:
df_test_ds

Unnamed: 0,ID,user,SC,Comment
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


Once the dataset emoji mask is set, we can export the csv for including it into the general model. Indeed, if we want to, we can do the preprocessing job first including removing stoping words punctuation that might improve the performance in our model in this case.

In [63]:
#exporting the processed dataset into a csv file
df_train_ds.to_csv('processed_twitter_train.csv', index=False, header=False) 
df_test_ds.to_csv('processed_twitter_test.csv', index=False, header=False) 

In [14]:
def filter_dataset(line):
    def lower_fn(line):
        return tf.strings.lower(line)

    lower_line = tf.py_function(lower_fn, [line], tf.string)
    split_line = tf.strings.split(lower_line,',', maxsplit=3) # just three items in the tensor
    sentiment_category = split_line[3] # irrelevant, neutral, negative, positive

    return (True if sentiment_category != 'Irrelevant' else False)

# converting the data into tensor data using the filter
train_ds = tf.data.TextLineDataset('processed_twitter_train.csv').filter(filter_dataset)
test_ds = tf.data.TextLineDataset('processed_twitter_test.csv').filter(filter_dataset)

In [15]:
for e in train_ds.skip(1).take(2):
    print(e)

2024-04-23 11:37:13.000771: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2024-04-23 11:37:13.047795: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: input must be a vector, got shape: []
	 [[{{node StringSplit/StringSplitV2}}]]
2024-04-23 11:37:13.052795: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: input must be a vector, got shape: []
	 [[{{node StringSplit/StringSplitV2}}]]


InvalidArgumentError: {{function_node __wrapped__IteratorGetNext_output_types_1_device_/job:localhost/replica:0/task:0/device:CPU:0}} input must be a vector, got shape: []
	 [[{{node StringSplit/StringSplitV2}}]] [Op:IteratorGetNext]

In [69]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [76]:
max_features = 10000
sequence_length = 250

vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [80]:
# build vocabulary and save it to vocabulary.obj
vocabulary = vectorize_layer.adapt(train_ds)
vocab_file = open('vocabulary.obj','wb')
pickle.dump(vocabulary,vocab_file)

In [None]:
# # for loading the vocabulary
# vocab_file = open('vocabulary.obj','rb')
# vocabulary = pickle.load(vocab_file)


In [81]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label