In [62]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the data from 'trn_cleaned.csv' into a DataFrame.
df_simple = pd.read_csv('trn_cleaned.csv')

In [4]:
# Discard rows that have no offensiveYN value.
df_simple = df_simple.dropna(subset=['offensiveYN'])

In [5]:
# One row per distinct clean_post, labelled with the first entry that
# Series.mode returns for that post's offensiveYN values.
df_agg = (
    df_simple
    .groupby(by=["clean_post"])['offensiveYN']
    .agg(lambda votes: votes.mode()[0])
    .reset_index()
)

In [6]:
# Display the per-post aggregated frame.
df_agg

Unnamed: 0,clean_post,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",1.0
1,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,0.0
2,\nCharlie Kirk‏\n\nJohnny Depp calls for death...,1.0
3,\nDavid Knight‏ \n\nNotice how quickly things ...,0.0
4,\nFinland fireball: Time-lapse video shows nig...,0.0
...,...,...
35414,👉 Illegally in the country after 5 deportation...,0.0
35415,💥Breaking💥\nJulian Assange is the gate keeper ...,0.0
35416,📖 2Kings 22:19 because your heart was peniten...,0.0
35417,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,0.0


### Explore emoji representation by translating emoji to English words, using the `emot` package and an externally sourced emoji-to-word dictionary

In [7]:
!pip install emot

Collecting emot
  Downloading emot-3.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 17 kB/s  eta 0:00:011
[?25hInstalling collected packages: emot
Successfully installed emot-3.1
You should consider upgrading via the '/home/hwu24/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [9]:
import re
import pickle
from emot.emo_unicode import EMOJI_UNICODE, UNICODE_EMOJI

In [10]:
# Load the pickled emoji mapping from 'Emoji_Dict.p'.
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# a file from a trusted source here.
with open('Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
# Swap keys and values of the loaded mapping. convert_emojis_to_word iterates
# the keys of the result as emoji and reads the values as their names.
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text, emoji_dict=None):
    """Replace each emoji found in ``text`` with a word form of its name.

    The name is cleaned by removing commas and colons and joining the
    remaining whitespace-separated words with underscores.

    Emoji are matched literally (plain substring replacement), so keys that
    contain regular-expression metacharacters such as ``*`` or ``(`` are
    handled safely, and no pattern has to be compiled per key.

    Parameters
    ----------
    text : str
        Text that may contain emoji.
    emoji_dict : dict, optional
        Mapping from emoji string to its name. When omitted, the
        notebook-level ``Emoji_Dict`` is used.

    Returns
    -------
    str
        ``text`` with every occurrence of each key replaced by its cleaned name.
    """
    if emoji_dict is None:
        emoji_dict = Emoji_Dict
    for emot, name in emoji_dict.items():
        # An empty key would match between every character; skip it.
        # Skipping absent keys also avoids building a replacement needlessly.
        if not emot or emot not in text:
            continue
        word = "_".join(name.replace(",", "").replace(":", "").split())
        text = text.replace(emot, word)
    return text

In [17]:
def convert_emojis(text):
    """Replace each emoji listed in ``UNICODE_EMOJI`` with a word form of its name.

    The name has commas and colons removed and its whitespace-separated words
    joined with underscores before it is substituted into ``text``.
    """
    for symbol, description in UNICODE_EMOJI.items():
        words = description.replace(",", "").replace(":", "").split()
        text = text.replace(symbol, "_".join(words))
    return text

In [14]:
# Sample post fragment containing an emoji, used to try out both converters.
text = '\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6'

In [18]:
# Try the emot-based converter on the sample text.
convert_emojis(text)

'\n\nRose\nroseTaylor\u200f @RealRoseTaylor 6h6'

In [16]:
# Try the Emoji_Dict-based converter on the same sample text.
convert_emojis_to_word(text)

'\n\nRose\nroseTaylor\u200f @RealRoseTaylor 6h6'

In [23]:
# Display the aggregated frame again before adding the emoji-replaced column.
df_agg

Unnamed: 0,clean_post,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",1.0
1,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,0.0
2,\nCharlie Kirk‏\n\nJohnny Depp calls for death...,1.0
3,\nDavid Knight‏ \n\nNotice how quickly things ...,0.0
4,\nFinland fireball: Time-lapse video shows nig...,0.0
...,...,...
35414,👉 Illegally in the country after 5 deportation...,0.0
35415,💥Breaking💥\nJulian Assange is the gate keeper ...,0.0
35416,📖 2Kings 22:19 because your heart was peniten...,0.0
35417,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,0.0


In [27]:
# Add a column holding each post with its emoji replaced by words.
df_agg['post_emo_replace'] = df_agg['clean_post'].map(convert_emojis_to_word)

In [29]:
#df_agg.to_csv('no_emoji.csv')

In [24]:
# start  = time.time()
# x = [convert_emojis_to_word(text) for text in df_agg['clean_post'][:1000]]
# end = time.time()
# end-start

106.68374061584473

In [20]:
import time

In [21]:
# Display the current time as seconds since the epoch.
time.time()

1637093986.8992023

In [30]:
# Model input text: the emoji-replaced posts as an array.
sentences = df_agg['post_emo_replace'].values

In [31]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
# Encode the offensiveYN values as integer class ids.
le = LabelEncoder()
y = le.fit_transform(df_agg['offensiveYN'].values)

In [32]:
# Hold out 20% of the posts for testing; random_state fixes the split.
sentences_train,sentences_test,y_train,y_test = train_test_split(
                                                sentences, y,  
                                                test_size=0.20,  
                                                random_state=7)

In [33]:
# One-hot encode the integer class ids into 3 columns.
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)

In [34]:
# Build the vocabulary from the training sentences only; num_words=5000 limits
# which words are kept when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

In [35]:
# Convert each sentence into a list of integer word indices.
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

In [36]:
# +1 because index 0 is reserved (see create_embedding_matrix).
vocab_size = len(tokenizer.word_index) + 1
# Fixed sequence length used when padding the tokenized posts.
maxlen = 100

In [37]:
# Bring every sequence to length maxlen; padding='post' adds the padding after
# the tokens.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [73]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a word-vector text file.

    Each non-blank line of ``filepath`` is expected to hold a word followed by
    its whitespace-separated vector components.

    Parameters
    ----------
    filepath : str
        Path to the word-vector text file (read as UTF-8).
    word_index : dict
        Mapping from word to its integer row index.
    embedding_dim : int
        Number of leading vector components to keep per word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row 0 is
        reserved, and rows for words absent from the file stay all zeros.

    Raises
    ------
    ValueError
        If a matched word has fewer than ``embedding_dim`` components.
    """
    # +1 because index 0 is reserved.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file).
                continue
            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue
            if len(vector) < embedding_dim:
                raise ValueError(
                    "vector for %r has %d components, expected at least %d"
                    % (word, len(vector), embedding_dim))
            embedding_matrix[word_index[word]] = np.asarray(
                vector[:embedding_dim], dtype=np.float32)

    return embedding_matrix

In [39]:
# Build a 50-dimensional embedding matrix for the tokenizer vocabulary from
# the vectors in 'glove.6B.50d.txt'.
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt' ,
                                            tokenizer.word_index,  
                                            embedding_dim)

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
# NOTE(review): this overrides the embedding_dim of 50 set when the GloVe
# matrix was built, and embedding_matrix is not passed to the Embedding layer
# below, so the embedding is learned from scratch at 100 dimensions. Confirm
# that this is intended.
embedding_dim = 100

# Text-only CNN: embedding -> two 1D convolutions -> global max pool -> dense.
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(128, activation='relu'))
# The targets are one-hot over 3 mutually exclusive classes and the loss is
# categorical cross-entropy, so the output must be a probability distribution:
# softmax rather than independent sigmoids.
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train, y_train,
                    epochs=5,
                    validation_data=(X_test, y_test),
                    batch_size=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Explore model structure by feeding demographic features
- Use text concatenation to combine the post with the categorical demographic data, then feed the concatenated text to the embedding layer for feature extraction and model learning

In [2]:
# Load the data from 'SBF_trn.csv' into a DataFrame.
df_trn = pd.read_csv('SBF_trn.csv')

In [3]:
# Keep the post text, the three annotator demographic columns and the label;
# copy so later column assignments do not touch df_trn.
df_full = df_trn[['post','annotatorGender','annotatorRace','annotatorAge','offensiveYN']].copy()

In [4]:
# Row and column count of the selected columns.
df_full.shape

(112900, 5)

In [5]:
# Matches text that starts with "RT" and runs up to the last ": " on that
# line (presumably a retweet header; confirm against the data).
pattern = '^RT.*: '
# Matches "&#", one or more non-letter characters and ";" at the very end of
# the text (presumably a trailing numeric HTML entity; confirm).
pattern_2 ='&#[^a-zA-Z]+;$'
#re.sub(pattern_2,'',re.sub(pattern,'',x))

In [6]:
# Strip whatever `pattern` matches first, then whatever `pattern_2` matches.
df_full['clean_post'] = df_full['post'].map(
    lambda raw: re.sub(pattern_2, '', re.sub(pattern, '', raw))
)

In [7]:
# Discard rows that have no offensiveYN value.
df_full = df_full.dropna(subset=['offensiveYN'])

In [8]:
# Copy offensiveYN into 'label', replacing the value 0.5 with 3.
# NOTE(review): 'label' is not referenced by any later cell visible here; the
# aggregations below select offensiveYN instead.
df_full['label']= [x if x!=0.5 else 3 for x in df_full['offensiveYN']]

In [9]:
# Row and column count after dropping unlabeled rows and adding columns.
df_full.shape

(110883, 7)

In [10]:
# Count rows per distinct (post, gender, race, age, offensiveYN) combination.
# NOTE(review): df_full_agg is reassigned by the mode aggregation in a later
# cell (In [19]), so this result is only used for the shape check that follows.
df_full_agg = df_full.groupby(by=["clean_post",'annotatorGender','annotatorRace','annotatorAge','offensiveYN']).count().reset_index()

In [11]:
# Number of distinct combinations found by the count aggregation.
df_full_agg.shape

(88822, 7)

In [19]:
# One row per (post, gender, race, age), labelled with the first entry that
# Series.mode returns for that group's offensiveYN values.
df_full_agg = (
    df_full
    .groupby(by=["clean_post", 'annotatorGender', 'annotatorRace', 'annotatorAge'])['offensiveYN']
    .agg(lambda votes: votes.mode()[0])
    .reset_index()
)

In [20]:
# Preview the first rows of the per-annotator-group aggregation.
df_full_agg.head()

Unnamed: 0,clean_post,annotatorGender,annotatorRace,annotatorAge,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",man,white,41.0,1.0
1,"\n\nBill Kristol and Ben Shaprio, two turds in...",man,white,42.0,1.0
2,"\n\nBill Kristol and Ben Shaprio, two turds in...",woman,white,39.0,1.0
3,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,man,white,25.0,0.0
4,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,woman,white,30.0,0.0


In [14]:
# Prefix each gender and race value with a space so they stay separate words
# when appended to the post text in the next cell.
# NOTE: these assignments overwrite the columns in place, so re-running this
# cell prepends another space each time.
df_full_agg['annotatorGender']= [' '+ x for x in df_full_agg['annotatorGender']]
df_full_agg['annotatorRace']= [' '+ x for x in df_full_agg['annotatorRace']]

In [15]:
# Append the (space-prefixed) gender and race to the end of each post.
df_full_agg['concate']= df_full_agg['clean_post']+df_full_agg['annotatorGender']+df_full_agg['annotatorRace']

In [16]:
# Keep only the concatenated text, the annotator age and the label.
df_full_agg =df_full_agg[['concate','annotatorAge','offensiveYN']]

In [17]:
# Row and column count of the frame used for the concatenated-text model.
df_full_agg.shape

(88465, 3)

In [18]:
# Preview the first rows of the concatenated-text frame.
df_full_agg.head()

Unnamed: 0,concate,annotatorAge,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",41.0,1.0
1,"\n\nBill Kristol and Ben Shaprio, two turds in...",42.0,1.0
2,"\n\nBill Kristol and Ben Shaprio, two turds in...",39.0,1.0
3,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,25.0,0.0
4,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,30.0,0.0


In [89]:
# Shuffle and hold out 20% of the rows for testing; random_state fixes the split.
train,test = train_test_split(df_full_agg, test_size=0.2, random_state=42, shuffle=True)

In [91]:
# offensiveYN holds the ratings 0.0 / 0.5 / 1.0. to_categorical casts its input
# to integers, which would truncate 0.5 to 0 and merge that class into 0.0, so
# map each rating to its own class index before one-hot encoding.
label_to_class = {0.0: 0, 0.5: 1, 1.0: 2}
y_train = to_categorical(train['offensiveYN'].map(label_to_class).values, 3)
y_test = to_categorical(test['offensiveYN'].map(label_to_class).values, 3)

In [92]:
# Training inputs: the concatenated text and the raw annotator age.
text_train = train['concate'].values
num_train = train['annotatorAge'].values

In [93]:
# Test inputs: the concatenated text and the raw annotator age.
text_test = test['concate'].values
num_test = test['annotatorAge'].values

In [94]:
# Fit a fresh tokenizer on this section's training text rather than reusing
# whichever tokenizer happens to be left in the kernel. That one was fitted on
# a different corpus and split, so its vocabulary would not match this data.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train)

# Convert each text into a list of integer word indices.
X_train_text = tokenizer.texts_to_sequences(text_train)
X_test_text = tokenizer.texts_to_sequences(text_test)

In [95]:
# +1 because index 0 is reserved (see create_embedding_matrix).
vocab_size = len(tokenizer.word_index) + 1
# Fixed sequence length used when padding the tokenized texts.
maxlen = 100

In [96]:
# Bring every sequence to length maxlen; padding='post' adds the padding after
# the tokens.
X_train_text = pad_sequences(X_train_text, padding='post', maxlen=maxlen)
X_test_text = pad_sequences(X_test_text, padding='post', maxlen=maxlen)

In [99]:
# (number of training rows, maxlen)
X_train_text.shape

(70772, 100)

In [97]:
# Build a 50-dimensional embedding matrix for the tokenizer vocabulary from
# the vectors in 'glove.6B.50d.txt'.
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt' ,
                                            tokenizer.word_index,  
                                            embedding_dim)

In [100]:
from tensorflow import keras

In [104]:
# Two-input model: token ids of the concatenated text plus the annotator age.
inp_cat_data = keras.layers.Input(shape=(X_train_text.shape[1],))
inp_num_data = keras.layers.Input(shape=(1,))
# NOTE(review): embedding_matrix built in the preceding cell is not passed to
# this layer, so the embedding is learned from scratch. Confirm that this is
# intended.
emb= keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)(inp_cat_data)
conv_1 = keras.layers.Conv1D(128, 5, activation='relu')(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
flatten = keras.layers.Flatten()(pool)
# Join the pooled text features with the numeric age before the dense layers.
conc = keras.layers.Concatenate()([flatten, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
# The targets are one-hot over 3 mutually exclusive classes and the loss is
# categorical cross-entropy, so the output must be a probability distribution:
# softmax rather than independent sigmoids.
out = keras.layers.Dense(3, activation='softmax')(Dense_1)

model = keras.Model(inputs=[inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit([X_train_text,num_train], y_train,
                    epochs=5,
                    validation_data=([X_test_text,num_test], y_test),
                    batch_size=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Explore model structure by feeding demographic features through two separate pipelines
- Created a categorical embedding as well as a text embedding for the existing features
- Flattened and concatenated the three types of data (text, categorical, and numeric demographic) and fed them to the dense layers for model learning

In [24]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [56]:
# Display the frame with the one-hot gender/race columns.
# NOTE(review): those columns are created by the get_dummies cell that appears
# below (In [55]); this cell was executed after it.
df_full_agg

Unnamed: 0,clean_post,annotatorAge,offensiveYN,annotatorGender_man,annotatorGender_na,annotatorGender_nonBinary,annotatorGender_transman,annotatorGender_woman,annotatorRace_asian,annotatorRace_black,annotatorRace_hisp,annotatorRace_na,annotatorRace_native,annotatorRace_other,annotatorRace_white
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",41.0,1.0,1,0,0,0,0,0,0,0,0,0,0,1
1,"\n\nBill Kristol and Ben Shaprio, two turds in...",42.0,1.0,1,0,0,0,0,0,0,0,0,0,0,1
2,"\n\nBill Kristol and Ben Shaprio, two turds in...",39.0,1.0,0,0,0,0,1,0,0,0,0,0,0,1
3,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,25.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1
4,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,30.0,0.0,0,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88460,📖 2Kings 22:19 because your heart was peniten...,35.0,0.0,0,0,0,0,1,0,0,0,0,0,0,1
88461,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,29.0,1.0,0,0,0,0,1,0,0,1,0,0,0,0
88462,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,34.0,0.5,0,0,0,0,1,0,0,0,0,0,0,1
88463,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,49.0,0.0,0,0,0,0,1,0,0,0,0,0,0,1


In [55]:
# One-hot encode the annotator gender and race columns.
# NOTE(review): this needs annotatorGender and annotatorRace to still be
# present on df_full_agg. An earlier cell (In [16]) narrows df_full_agg to
# concate/annotatorAge/offensiveYN, so this cell depends on execution order,
# and re-running it fails once the two columns have been replaced.
df_full_agg = pd.get_dummies(df_full_agg, columns = ['annotatorGender','annotatorRace'])

In [57]:
# Shuffle and hold out 20% of the rows for testing; random_state fixes the split.
train,test = train_test_split(df_full_agg, test_size=0.2, random_state=42, shuffle=True)

In [58]:
# offensiveYN holds the ratings 0.0 / 0.5 / 1.0. to_categorical casts its input
# to integers, which would truncate 0.5 to 0 and merge that class into 0.0, so
# map each rating to its own class index before one-hot encoding.
label_to_class = {0.0: 0, 0.5: 1, 1.0: 2}
y_train = to_categorical(train['offensiveYN'].map(label_to_class).values, 3)
y_test = to_categorical(test['offensiveYN'].map(label_to_class).values, 3)

In [59]:
# Training inputs: post text, raw annotator age, and every column from
# position 3 onward.
# NOTE: the positional slice relies on the column order clean_post,
# annotatorAge, offensiveYN followed by the one-hot columns.
text_train = train['clean_post'].values
num_train = train['annotatorAge'].values
cat_train = train.iloc[:,3:].to_numpy()

In [65]:
# Build the vocabulary from the training posts only; num_words=5000 limits
# which words are kept when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train)

In [60]:
# Test inputs, selected the same way as the training inputs.
text_test = test['clean_post'].values
num_test = test['annotatorAge'].values
cat_test = test.iloc[:,3:].to_numpy()

In [66]:
# Convert each post into a list of integer word indices.
X_train_text = tokenizer.texts_to_sequences(text_train)
X_test_text = tokenizer.texts_to_sequences(text_test)

In [67]:
# +1 because index 0 is reserved (see create_embedding_matrix).
vocab_size = len(tokenizer.word_index) + 1
# Fixed sequence length used when padding the tokenized posts.
maxlen = 100

In [68]:
# Bring every sequence to length maxlen; padding='post' adds the padding after
# the tokens.
X_train_text = pad_sequences(X_train_text, padding='post', maxlen=maxlen)
X_test_text = pad_sequences(X_test_text, padding='post', maxlen=maxlen)

In [74]:
# Build a 50-dimensional embedding matrix for the tokenizer vocabulary from
# the vectors in 'glove.6B.50d.txt'.
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt' ,
                                            tokenizer.word_index,  
                                            embedding_dim)

In [69]:
# (number of training rows, maxlen)
X_train_text.shape

(70772, 100)

In [70]:
# (number of training rows, number of one-hot demographic columns)
cat_train.shape

(70772, 12)

In [52]:
# Embedding width for the one-hot demographic columns: half the number of
# columns, rounded up and capped at 50.
# Uses cat_train; the name `cat` is not defined anywhere in this notebook.
cat_embedding = min(np.ceil((cat_train.shape[1])/2), 50)
cat_embedding_size = int(cat_embedding)

In [53]:
from tensorflow import keras

In [76]:
# Three-input model: token ids of the post, the one-hot demographic columns,
# and the annotator age.
inp_text_data = keras.layers.Input(shape=(X_train_text.shape[1],))
# Uses cat_train; the name `cat` is not defined anywhere in this notebook.
inp_cat_data = keras.layers.Input(shape=(cat_train.shape[1],))
inp_num_data = keras.layers.Input(shape=(1,))
# NOTE(review): embedding_matrix built in an earlier cell is not passed to this
# layer, so the text embedding is learned from scratch. Confirm that this is
# intended.
emb= keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)(inp_text_data)
# Each demographic column value is looked up as an index into this embedding.
emb_2 = keras.layers.Embedding(input_dim=cat_train.shape[1], output_dim=cat_embedding_size)(inp_cat_data)
conv_1 = keras.layers.Conv1D(128, 5, activation='relu')(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
flatten = keras.layers.Flatten()(pool)
flatten_2 = keras.layers.Flatten()(emb_2)
# Join the text features, the demographic embeddings and the numeric age.
conc = keras.layers.Concatenate()([flatten,flatten_2, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
# The targets are one-hot over 3 mutually exclusive classes and the loss is
# categorical cross-entropy, so the output must be a probability distribution:
# softmax rather than independent sigmoids.
out = keras.layers.Dense(3, activation='softmax')(Dense_1)

model = keras.Model(inputs=[inp_text_data,inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit([X_train_text,cat_train,num_train], y_train,
                    epochs=5,
                    validation_data=([X_test_text,cat_test,num_test], y_test),
                    batch_size=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
