In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

from tensorflow import keras

In [2]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a GloVe-style text file.

    Every non-blank line of ``filepath`` is read as a token followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    filepath : str or path-like
        Text file with one ``word v1 v2 ...`` entry per line.
    word_index : dict
        Maps each word to its integer row in the returned matrix
        (index 0 is treated as reserved and is never filled from the file).
    embedding_dim : int
        Number of leading vector components to keep for each word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Row 0 and the
        rows of words that do not occur in the file stay all-zero.
    """
    # One extra row because index 0 is reserved.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which can fail on non-ASCII tokens.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file): nothing to read.
                continue
            word, vector = parts[0], parts[1:]
            idx = word_index.get(word)
            if idx is None:
                continue
            # Slice before converting so only the kept components are parsed.
            embedding_matrix[idx] = np.asarray(
                vector[:embedding_dim], dtype=np.float32)

    return embedding_matrix

### Earlier experiments showed that validation performance was still improving when training stopped after a limited number of epochs, so here we rerun those experiments with more epochs.

## Data loading and simple cleaning

In [3]:
# Load the annotation table from a CSV file resolved relative to the current
# working directory (presumably the training split, going by the file name).
df_trn = pd.read_csv('SBF_trn.csv')

In [4]:
# Keep only the post text, the three annotator attributes and the offensiveness
# rating, as an independent copy so later column assignments leave df_trn untouched.
df_full = df_trn.loc[
    :, ['post', 'annotatorGender', 'annotatorRace', 'annotatorAge', 'offensiveYN']
].copy()

In [5]:
df_full.shape

(112900, 5)

In [6]:
# Matches a leading "RT" and everything after it up to and including the LAST
# ": " on that first line (".*" is greedy and does not cross newlines).
pattern = '^RT.*: '
# Matches "&#", then one or more non-letter characters, then ";" -- but only
# when that sequence sits at the very end of the string.
pattern_2 ='&#[^a-zA-Z]+;$'

In [7]:
# Clean every raw post: remove the `pattern` match first, then the `pattern_2`
# match from what is left.
df_full['clean_post'] = df_full['post'].map(
    lambda raw_post: re.sub(pattern_2, '', re.sub(pattern, '', raw_post))
)

In [8]:
# Discard the rows that carry no offensiveness rating.
df_full = df_full[~df_full['offensiveYN'].isna()]

In [9]:
# Copy the rating into `label`, recoding the 0.5 rating as 3.
# NOTE(review): no later cell visible here reads `label`, and an id of 3 would
# fall outside a 3-class one-hot encoding -- confirm whether this column is
# still needed or whether 2 was intended.
df_full['label'] = df_full['offensiveYN'].where(df_full['offensiveYN'] != 0.5, 3)

In [10]:
df_full.shape

(110883, 7)

In [11]:
# Collapse repeated ratings: one row per (cleaned post, annotator gender, race,
# age) combination, carrying the most frequent offensiveYN value of that group.
# Series.mode() returns its values sorted, so ties resolve to the smallest rating.
df_full_agg = (
    df_full
    .groupby(by=["clean_post", 'annotatorGender', 'annotatorRace', 'annotatorAge'])['offensiveYN']
    .agg(lambda ratings: ratings.mode().iloc[0])
    .reset_index()
)

In [12]:
df_full_agg.shape

(88465, 5)

## Construct data for training: combine categorical features with text for embedding and convolution processing

In [15]:
# Independent copy of the aggregated frame, so the column edits made for this
# first model variant do not alter df_full_agg.
df_embedding_one =df_full_agg.copy()

In [16]:
# Prefix both demographic strings with a space so they stay separate words once
# they are appended to the post text.
df_embedding_one['annotatorGender'] = ' ' + df_embedding_one['annotatorGender']
df_embedding_one['annotatorRace'] = ' ' + df_embedding_one['annotatorRace']

In [17]:
# Single text feature: cleaned post followed by the (space-prefixed) annotator
# gender and race.
df_embedding_one['concate'] = df_embedding_one['clean_post'].str.cat(
    [df_embedding_one['annotatorGender'], df_embedding_one['annotatorRace']]
)

In [19]:
# Keep just the model inputs (combined text, annotator age) and the target.
df_embedding_one = df_embedding_one.loc[:, ['concate', 'annotatorAge', 'offensiveYN']]
df_embedding_one.shape

(88465, 3)

In [21]:
df_embedding_one.head()

Unnamed: 0,concate,annotatorAge,offensiveYN
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",41.0,1.0
1,"\n\nBill Kristol and Ben Shaprio, two turds in...",42.0,1.0
2,"\n\nBill Kristol and Ben Shaprio, two turds in...",39.0,1.0
3,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,25.0,0.0
4,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,30.0,0.0


### Set up the train/test split and the text feature embedding

In [22]:
# Reproducible 80/20 row-level shuffle split.
# NOTE(review): rows are split individually, so the same post text rated by
# different annotators can end up on both sides of the split -- confirm this is
# acceptable for the validation numbers reported below.
train, test = train_test_split(
    df_embedding_one,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [25]:
# offensiveYN holds the ratings 0, 0.5 and 1. to_categorical expects integer
# class ids in [0, num_classes) and casts its input to int, which would silently
# turn 0.5 into class 0 and leave the third class empty. Doubling the rating
# first yields the distinct ids 0.0 -> 0, 0.5 -> 1, 1.0 -> 2.
y_train = to_categorical((train['offensiveYN'] * 2).round().astype(int).values, 3)
y_test = to_categorical((test['offensiveYN'] * 2).round().astype(int).values, 3)

# Text input (post + appended gender/race words) and numeric input (annotator age).
text_train = train['concate'].values
num_train = train['annotatorAge'].values

text_test = test['concate'].values
num_test = test['annotatorAge'].values

In [27]:
# Word-level tokenizer capped at num_words=5000; its vocabulary is fitted on the
# training texts only, so the test texts do not influence the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train)

In [28]:
# Turn each text into its list of integer word ids using the fitted tokenizer.
X_train_text, X_test_text = (
    tokenizer.texts_to_sequences(texts) for texts in (text_train, text_test)
)

In [29]:
# Number of embedding rows: every word the tokenizer saw, plus one for the
# reserved index 0.
# NOTE(review): this counts the full word_index even though the tokenizer was
# created with num_words=5000 -- presumably only ids below 5000 ever reach the
# model; confirm whether the smaller size was intended.
vocab_size = len(tokenizer.word_index) + 1
# Fixed sequence length used for padding below.
maxlen = 100

In [30]:
# Bring every id sequence to exactly `maxlen` entries, padding at the end.
X_train_text, X_test_text = (
    pad_sequences(token_ids, maxlen=maxlen, padding='post')
    for token_ids in (X_train_text, X_test_text)
)

In [31]:
X_train_text.shape

(70772, 100)

In [32]:
# Width of the word vectors; matches the 50-dimensional GloVe file read below.
embedding_dim = 50
# One row per tokenizer word index, filled from the GloVe file where the word
# is present and left at zero otherwise.
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt' ,
                                            tokenizer.word_index,  
                                            embedding_dim)

### Feed the training data into the model and monitor validation-set performance

In [34]:
# Two-input network: a 1D CNN over the padded token ids, joined with the
# annotator age before the dense classifier head.
inp_cat_data = keras.layers.Input(shape=(X_train_text.shape[1],))  # padded token ids
inp_num_data = keras.layers.Input(shape=(1,))                      # annotatorAge

# Start the word embeddings from the GloVe-based `embedding_matrix`; it was
# computed above but previously never handed to the layer, so the embeddings
# were randomly initialised. The layer stays trainable. `input_length` is
# dropped: it is deprecated and the Input shape already fixes the length.
emb = keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
)(inp_cat_data)
conv_1 = keras.layers.Conv1D(128, 5, activation='relu')(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
# The pooled tensor is already 2-D, so this Flatten does not change its shape.
flatten = keras.layers.Flatten()(pool)
conc = keras.layers.Concatenate()([flatten, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
# The three classes are mutually exclusive one-hot targets trained with
# categorical cross-entropy, so the output must be a softmax distribution
# (independent sigmoids do not sum to 1).
out = keras.layers.Dense(3, activation='softmax')(Dense_1)

model = keras.Model(inputs=[inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The held-out split is passed as validation data to monitor it after each epoch.
history = model.fit([X_train_text, num_train], y_train,
                    epochs=15,
                    validation_data=([X_test_text, num_test], y_test),
                    batch_size=5)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


### Model with separate embeddings

In [13]:
# One-hot encode annotator gender and race; the remaining columns
# (clean_post, annotatorAge, offensiveYN) are carried over unchanged and the
# indicator columns are appended after them.
df_embedding_second  = pd.get_dummies(df_full_agg, columns = ['annotatorGender','annotatorRace'])

In [14]:
df_embedding_second.head()

Unnamed: 0,clean_post,annotatorAge,offensiveYN,annotatorGender_man,annotatorGender_na,annotatorGender_nonBinary,annotatorGender_transman,annotatorGender_woman,annotatorRace_asian,annotatorRace_black,annotatorRace_hisp,annotatorRace_na,annotatorRace_native,annotatorRace_other,annotatorRace_white
0,"\n\nBill Kristol and Ben Shaprio, two turds in...",41.0,1.0,1,0,0,0,0,0,0,0,0,0,0,1
1,"\n\nBill Kristol and Ben Shaprio, two turds in...",42.0,1.0,1,0,0,0,0,0,0,0,0,0,0,1
2,"\n\nBill Kristol and Ben Shaprio, two turds in...",39.0,1.0,0,0,0,0,1,0,0,0,0,0,0,1
3,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,25.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1
4,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,30.0,0.0,0,0,0,0,1,0,0,0,0,0,0,1


### Prepare data for model consumption

In [15]:
# Reproducible 80/20 row-level shuffle split of the one-hot encoded frame.
# NOTE(review): rows are split individually, so the same post text rated by
# different annotators can end up on both sides of the split -- confirm this is
# acceptable for the validation numbers reported below.
train, test = train_test_split(
    df_embedding_second,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [16]:
# offensiveYN holds the ratings 0, 0.5 and 1. to_categorical expects integer
# class ids in [0, num_classes) and casts its input to int, which would silently
# turn 0.5 into class 0 and leave the third class empty. Doubling the rating
# first yields the distinct ids 0.0 -> 0, 0.5 -> 1, 1.0 -> 2.
y_train = to_categorical((train['offensiveYN'] * 2).round().astype(int).values, 3)
y_test = to_categorical((test['offensiveYN'] * 2).round().astype(int).values, 3)

In [17]:
# Training inputs: post text, annotator age, and the one-hot gender/race
# indicators (every column from position 3 onwards).
text_train, num_train = train['clean_post'].values, train['annotatorAge'].values
cat_train = train[train.columns[3:]].to_numpy()

In [18]:
# Word-level tokenizer capped at num_words=5000; its vocabulary is fitted on the
# training posts only, so the test posts do not influence the word index.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train)

In [19]:
# Test inputs, built the same way as the training inputs: post text, annotator
# age, and the one-hot gender/race indicators (columns from position 3 onwards).
text_test, num_test = test['clean_post'].values, test['annotatorAge'].values
cat_test = test[test.columns[3:]].to_numpy()

In [20]:
# Turn each post into its list of integer word ids using the fitted tokenizer.
X_train_text, X_test_text = (
    tokenizer.texts_to_sequences(posts) for posts in (text_train, text_test)
)

In [21]:
# Number of embedding rows: every word the tokenizer saw, plus one for the
# reserved index 0.
# NOTE(review): this counts the full word_index even though the tokenizer was
# created with num_words=5000 -- presumably only ids below 5000 ever reach the
# model; confirm whether the smaller size was intended.
vocab_size = len(tokenizer.word_index) + 1
# Fixed sequence length used for padding below.
maxlen = 100

In [22]:
# Bring every id sequence to exactly `maxlen` entries, padding at the end.
X_train_text, X_test_text = (
    pad_sequences(token_ids, maxlen=maxlen, padding='post')
    for token_ids in (X_train_text, X_test_text)
)

In [23]:
# Width of the word vectors; matches the 50-dimensional GloVe file read below.
embedding_dim = 50
# One row per tokenizer word index, filled from the GloVe file where the word
# is present and left at zero otherwise.
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt' ,
                                            tokenizer.word_index,  
                                            embedding_dim)

In [24]:
X_train_text.shape

(70772, 100)

In [25]:
cat_train.shape

(70772, 12)

In [26]:
# Embedding width for the categorical input: half the number of one-hot columns
# (rounded up), capped at 50. With the 12 indicator columns shown above this
# evaluates to 6.
cat_embedding = min(np.ceil((cat_train.shape[1])/2), 50 ) ## determine categorical embedding size using conventional method
# np.ceil returns a float; the Embedding layer needs an integer output_dim.
cat_embedding_size = int(cat_embedding)

### Feed the training data into the model and monitor validation-set performance

In [27]:
# Three-input network: a 1D CNN over the padded token ids, an embedding of the
# one-hot annotator gender/race indicators, and the annotator age.
inp_text_data = keras.layers.Input(shape=(X_train_text.shape[1],))  # padded token ids
inp_cat_data = keras.layers.Input(shape=(cat_train.shape[1],))      # one-hot gender/race
inp_num_data = keras.layers.Input(shape=(1,))                       # annotatorAge

# Start the word embeddings from the GloVe-based `embedding_matrix`; it was
# computed above but previously never handed to the layer, so the embeddings
# were randomly initialised. The layer stays trainable. `input_length` is
# dropped: it is deprecated and the Input shape already fixes the length.
emb = keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
)(inp_text_data)
# NOTE(review): the values fed here are 0/1 indicators, so only rows 0 and 1 of
# this embedding table are ever looked up -- confirm whether integer category
# ids (or a Dense projection of the one-hot vector) was intended instead.
emb_2 = keras.layers.Embedding(input_dim=cat_train.shape[1], output_dim=cat_embedding_size)(inp_cat_data)
conv_1 = keras.layers.Conv1D(128, 5, activation='relu')(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
# The pooled tensor is already 2-D, so this Flatten does not change its shape.
flatten = keras.layers.Flatten()(pool)
# Collapse the per-column embeddings into one feature vector per sample.
flatten_2 = keras.layers.Flatten()(emb_2)
conc = keras.layers.Concatenate()([flatten, flatten_2, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
# The three classes are mutually exclusive one-hot targets trained with
# categorical cross-entropy, so the output must be a softmax distribution
# (independent sigmoids do not sum to 1).
out = keras.layers.Dense(3, activation='softmax')(Dense_1)

model = keras.Model(inputs=[inp_text_data, inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The held-out split is passed as validation data to monitor it after each epoch.
history = model.fit([X_train_text, cat_train, num_train], y_train,
                    epochs=15,
                    validation_data=([X_test_text, cat_test, num_test], y_test),
                    batch_size=5)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
