In [79]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras import layers

In [46]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a GloVe-style text file.

    Each non-empty line of the file is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        filepath: Path to the embedding text file (read as UTF-8).
        word_index: Mapping from word to integer row index.
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        A ``(len(word_index) + 1, embedding_dim)`` array. Rows for words that
        do not occur in the file (and row 0) stay all-zero.
    """
    # +1 because index 0 is reserved and never assigned to a word.
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding, which fails on non-ASCII tokens on some systems.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word, vector = parts[0], parts[1:]
            idx = word_index.get(word)
            if idx is None:
                continue
            embedding_matrix[idx] = np.array(
                vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

## Data read-in and simple cleaning

In [47]:
# Load the training split; the CSV is expected in the notebook's working directory.
# NOTE(review): "SBF" presumably stands for Social Bias Frames — confirm the data source.
df_trn = pd.read_csv('SBF_trn.csv')

In [48]:
# Keep only the post text, the annotator attributes and the offensiveness rating.
# .copy() detaches the result from df_trn so later column assignments do not touch it.
df_full = df_trn.loc[
    :,
    ['post', 'annotatorGender', 'annotatorRace', 'annotatorAge', 'offensiveYN'],
].copy()

In [49]:
# Row/column count after selecting the five modelling columns.
df_full.shape

(112900, 5)

In [50]:
# Retweet prefix: "RT" at the start of the text, then everything up to the
# last ": " before the first newline (the match is greedy).
pattern = r'^RT.*: '
# "&#...;" token containing no ASCII letters, matched only at the end of the text.
pattern_2 = r'&#[^a-zA-Z]+;$'

In [51]:
# Strip the retweet prefix first, then the trailing "&#...;" token, from every post.
df_full['clean_post'] = df_full['post'].map(
    lambda post: re.sub(pattern_2, '', re.sub(pattern, '', post))
)

In [52]:
# Drop posts without an offensiveness rating.
# .copy() makes the filtered frame independent, so the column assignment in the
# following cell writes to a real frame rather than to a slice of the old one
# (which raises pandas' SettingWithCopyWarning).
df_full = df_full[df_full['offensiveYN'].notna()].copy()

In [53]:
# Copy of offensiveYN in which the value 0.5 is replaced by 3.
# NOTE(review): 'label' is not referenced by any later cell shown here; the
# aggregation and the models read 'offensiveYN' directly — confirm whether this
# column is still needed.
df_full['label'] = df_full['offensiveYN'].where(df_full['offensiveYN'] != 0.5, 3)

In [54]:
# Row/column count after dropping unrated posts and adding the derived columns.
df_full.shape

(110883, 7)

In [55]:
# Collapse duplicate (post, gender, race, age) rows to a single rating: the most
# frequent offensiveYN value in the group (mode()[0], i.e. the first modal value).
df_full_agg = (
    df_full
    .groupby(by=["clean_post", 'annotatorGender', 'annotatorRace', 'annotatorAge'])['offensiveYN']
    .agg(lambda ratings: ratings.mode()[0])
    .reset_index()
)

In [56]:
# Row/column count after aggregating to one rating per (post, annotator attributes) group.
df_full_agg.shape

(88465, 5)

### Model with smaller data set to compare with baseline

In [64]:
# Down-sample to 35,419 rows for the comparison with the baseline.
# NOTE(review): 35419 is presumably the size of the baseline data set — confirm.
# random_state is fixed (same seed as the train/test split) so that a fresh
# kernel run draws the same subset and the results are reproducible.
sample_df = df_full_agg.sample(n=35419, random_state=42)

In [65]:
# Confirm the sample has the requested number of rows.
sample_df.shape

(35419, 5)

In [66]:
# One-hot encode the categorical annotator attributes. The resulting column order
# is: clean_post, annotatorAge, offensiveYN, then the dummy columns.
df_embedding_sample  = pd.get_dummies(sample_df, columns = ['annotatorGender','annotatorRace'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train,test = train_test_split(df_embedding_sample, test_size=0.2, random_state=42, shuffle=True)

# offensiveYN is rated 0, 0.5 or 1. to_categorical casts its input to integers,
# so passing the raw values truncated 0.5 to 0 and left the third class empty.
# Map the ratings to class ids 0 / 1 / 2 before one-hot encoding.
y_train = to_categorical((train['offensiveYN'].values * 2).astype(int), 3)
y_test = to_categorical((test['offensiveYN'].values * 2).astype(int), 3)

# Model inputs: raw text, numeric age, and the one-hot columns (position 3 onwards).
text_train = train['clean_post'].values
num_train = train['annotatorAge'].values
cat_train = train.iloc[:,3:].to_numpy()

text_test = test['clean_post'].values
num_test = test['annotatorAge'].values
cat_test = test.iloc[:,3:].to_numpy()

In [67]:
# Sequence length used when padding in the next cell.
maxlen = 100

# num_words=5000 limits the ids produced by texts_to_sequences by word frequency.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train)  # vocabulary is learned from the training posts only

X_train_text, X_test_text = (
    tokenizer.texts_to_sequences(texts) for texts in (text_train, text_test)
)

# +1 for the reserved index 0.
vocab_size = 1 + len(tokenizer.word_index)

In [68]:
# Bring every id sequence to length `maxlen`, padding at the end (padding='post').
X_train_text, X_test_text = (
    pad_sequences(seqs, padding='post', maxlen=maxlen)
    for seqs in (X_train_text, X_test_text)
)

In [69]:
# 50-dimensional GloVe vectors, one row per word of the tokenizer vocabulary.
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [70]:
# Rule of thumb for the categorical embedding width: half the number of
# one-hot columns, rounded up, capped at 50.
cat_embedding = min(
    np.ceil(cat_train.shape[1] / 2),
    50,
)
cat_embedding_size = int(cat_embedding)

In [71]:
# Three inputs: padded token ids, one-hot annotator attributes, annotator age.
inp_text_data = keras.layers.Input(shape=(X_train_text.shape[1],))
inp_cat_data = keras.layers.Input(shape=(cat_train.shape[1],))
inp_num_data = keras.layers.Input(shape=(1,))

# Fix: the GloVe matrix built in the previous cell was never handed to the layer,
# so the pre-trained vectors were silently unused. It now initialises the
# embedding weights; the layer stays trainable, so the vectors are fine-tuned.
emb = keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=maxlen,
)(inp_text_data)
# NOTE(review): this embedding is indexed by the 0/1 values of the one-hot
# columns, so only rows 0 and 1 are ever looked up — confirm this is intended.
emb_2 = keras.layers.Embedding(input_dim=cat_train.shape[1], output_dim=cat_embedding_size)(inp_cat_data)

# Text branch: two stacked 1-D convolutions followed by global max pooling.
conv_1 = keras.layers.Conv1D(128, 5, activation='relu')(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
flatten = keras.layers.Flatten()(pool)
flatten_2 = keras.layers.Flatten()(emb_2)

# Merge text features, annotator-attribute features and age.
conc = keras.layers.Concatenate()([flatten,flatten_2, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
# Fix: the three classes are mutually exclusive and the loss is categorical
# cross-entropy, which expects a probability distribution over the classes;
# softmax provides that, independent sigmoids do not.
out = keras.layers.Dense(3, activation='softmax')(Dense_1)

model = keras.Model(inputs=[inp_text_data,inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The held-out split is used as validation data to monitor each epoch.
history = model.fit([X_train_text,cat_train,num_train], y_train,
                    epochs=7,
                    validation_data=([X_test_text,cat_test,num_test], y_test),
                    batch_size=5)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


### Model with regularization parameters

In [72]:
# One-hot encode the two categorical annotator attributes of the full aggregated
# data set; the remaining columns pass through unchanged.
df_embedding_second = pd.get_dummies(
    df_full_agg,
    columns=['annotatorGender', 'annotatorRace'],
)


In [73]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train,test = train_test_split(df_embedding_second, test_size=0.2, random_state=42, shuffle=True)

# offensiveYN is rated 0, 0.5 or 1. to_categorical casts its input to integers,
# so passing the raw values truncated 0.5 to 0 and left the third class empty.
# Map the ratings to class ids 0 / 1 / 2 before one-hot encoding.
y_train = to_categorical((train['offensiveYN'].values * 2).astype(int), 3)
y_test = to_categorical((test['offensiveYN'].values * 2).astype(int), 3)

# Model inputs: raw text, numeric age, and the one-hot columns (position 3 onwards).
text_train = train['clean_post'].values
num_train = train['annotatorAge'].values
cat_train = train.iloc[:,3:].to_numpy()

text_test = test['clean_post'].values
num_test = test['annotatorAge'].values
cat_test = test.iloc[:,3:].to_numpy()

In [74]:
# Sequence length used when padding in the next cell.
maxlen = 100

# num_words=5000 limits the ids produced by texts_to_sequences by word frequency.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(text_train)  # vocabulary is learned from the training posts only

X_train_text, X_test_text = (
    tokenizer.texts_to_sequences(texts) for texts in (text_train, text_test)
)

# +1 for the reserved index 0.
vocab_size = 1 + len(tokenizer.word_index)

In [75]:
# Bring every id sequence to length `maxlen`, padding at the end (padding='post').
X_train_text, X_test_text = (
    pad_sequences(seqs, padding='post', maxlen=maxlen)
    for seqs in (X_train_text, X_test_text)
)

In [76]:
# 50-dimensional GloVe vectors, one row per word of the tokenizer vocabulary.
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [77]:
# Rule of thumb for the categorical embedding width: half the number of
# one-hot columns, rounded up, capped at 50.
cat_embedding = min(
    np.ceil(cat_train.shape[1] / 2),
    50,
)
cat_embedding_size = int(cat_embedding)

In [82]:
# Three inputs: padded token ids, one-hot annotator attributes, annotator age.
inp_text_data = keras.layers.Input(shape=(X_train_text.shape[1],))
inp_cat_data = keras.layers.Input(shape=(cat_train.shape[1],))
inp_num_data = keras.layers.Input(shape=(1,))

# Fix: the GloVe matrix built in the previous cell was never handed to the layer,
# so the pre-trained vectors were silently unused. It now initialises the
# embedding weights; the layer stays trainable, so the vectors are fine-tuned.
emb = keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=maxlen,
)(inp_text_data)
# NOTE(review): this embedding is indexed by the 0/1 values of the one-hot
# columns, so only rows 0 and 1 are ever looked up — confirm this is intended.
emb_2 = keras.layers.Embedding(input_dim=cat_train.shape[1], output_dim=cat_embedding_size)(inp_cat_data)

# Text branch. The first convolution carries L1/L2 weight penalties and an L2
# activity penalty; the second convolution is unregularised.
conv_1 = keras.layers.Conv1D(128, 5, activation='relu',kernel_regularizer = regularizers.l1_l2(l1=1e-5,l2=1e-4)
                             ,activity_regularizer = regularizers.l2(1e-5))(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
flatten = keras.layers.Flatten()(pool)
flatten_2 = keras.layers.Flatten()(emb_2)

# Merge text features, annotator-attribute features and age.
conc = keras.layers.Concatenate()([flatten,flatten_2, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
dropout_1 = keras.layers.Dropout(0.2)(Dense_1)
Dense_2 = keras.layers.Dense(128, activation='relu')(dropout_1)
# Fix 1: the output used to be attached to Dense_1, which left dropout_1 and
# Dense_2 outside the model graph; it is now attached to Dense_2.
# Fix 2: softmax instead of sigmoid — the classes are mutually exclusive and
# categorical cross-entropy expects a probability distribution over them.
out = keras.layers.Dense(3, activation='softmax')(Dense_2)

model = keras.Model(inputs=[inp_text_data,inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The held-out split is used as validation data to monitor each epoch.
history = model.fit([X_train_text,cat_train,num_train], y_train,
                    epochs=10,
                    validation_data=([X_test_text,cat_test,num_test], y_test),
                    batch_size=5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Model with separate embedding and different max tokens

In [57]:
# One-hot encode the two categorical annotator attributes of the full aggregated
# data set; the remaining columns pass through unchanged.
df_embedding_second = pd.get_dummies(
    df_full_agg,
    columns=['annotatorGender', 'annotatorRace'],
)

In [58]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train,test = train_test_split(df_embedding_second, test_size=0.2, random_state=42, shuffle=True)

# offensiveYN is rated 0, 0.5 or 1. to_categorical casts its input to integers,
# so passing the raw values truncated 0.5 to 0 and left the third class empty.
# Map the ratings to class ids 0 / 1 / 2 before one-hot encoding.
y_train = to_categorical((train['offensiveYN'].values * 2).astype(int), 3)
y_test = to_categorical((test['offensiveYN'].values * 2).astype(int), 3)

# Model inputs: raw text, numeric age, and the one-hot columns (position 3 onwards).
text_train = train['clean_post'].values
num_train = train['annotatorAge'].values
cat_train = train.iloc[:,3:].to_numpy()

text_test = test['clean_post'].values
num_test = test['annotatorAge'].values
cat_test = test.iloc[:,3:].to_numpy()

In [59]:
# Sequence length used when padding in the next cell.
maxlen = 100

# This variant raises the frequency-based word limit to 10000.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(text_train)  # vocabulary is learned from the training posts only

X_train_text, X_test_text = (
    tokenizer.texts_to_sequences(texts) for texts in (text_train, text_test)
)

# +1 for the reserved index 0.
vocab_size = 1 + len(tokenizer.word_index)


In [60]:
# Bring every id sequence to length `maxlen`, padding at the end (padding='post').
X_train_text, X_test_text = (
    pad_sequences(seqs, padding='post', maxlen=maxlen)
    for seqs in (X_train_text, X_test_text)
)

In [61]:
# 50-dimensional GloVe vectors, one row per word of the tokenizer vocabulary.
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [62]:
# Rule of thumb for the categorical embedding width: half the number of
# one-hot columns, rounded up, capped at 50.
cat_embedding = min(
    np.ceil(cat_train.shape[1] / 2),
    50,
)
cat_embedding_size = int(cat_embedding)

In [63]:
# Three inputs: padded token ids, one-hot annotator attributes, annotator age.
inp_text_data = keras.layers.Input(shape=(X_train_text.shape[1],))
inp_cat_data = keras.layers.Input(shape=(cat_train.shape[1],))
inp_num_data = keras.layers.Input(shape=(1,))

# Fix: the GloVe matrix built in the previous cell was never handed to the layer,
# so the pre-trained vectors were silently unused. It now initialises the
# embedding weights; the layer stays trainable, so the vectors are fine-tuned.
emb = keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=maxlen,
)(inp_text_data)
# NOTE(review): this embedding is indexed by the 0/1 values of the one-hot
# columns, so only rows 0 and 1 are ever looked up — confirm this is intended.
emb_2 = keras.layers.Embedding(input_dim=cat_train.shape[1], output_dim=cat_embedding_size)(inp_cat_data)

# Text branch: two stacked 1-D convolutions followed by global max pooling.
conv_1 = keras.layers.Conv1D(128, 5, activation='relu')(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
flatten = keras.layers.Flatten()(pool)
flatten_2 = keras.layers.Flatten()(emb_2)

# Merge text features, annotator-attribute features and age.
conc = keras.layers.Concatenate()([flatten,flatten_2, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
# Fix: the three classes are mutually exclusive and the loss is categorical
# cross-entropy, which expects a probability distribution over the classes;
# softmax provides that, independent sigmoids do not.
out = keras.layers.Dense(3, activation='softmax')(Dense_1)

model = keras.Model(inputs=[inp_text_data,inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The held-out split is used as validation data to monitor each epoch.
history = model.fit([X_train_text,cat_train,num_train], y_train,
                    epochs=7,
                    validation_data=([X_test_text,cat_test,num_test], y_test),
                    batch_size=5)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


### Model with separate embedding and removed stopwords

In [20]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the
# import cell at the top so a fresh-kernel run shows all dependencies up front.
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus used by stopwords.words('english') below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/hwu24/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [21]:
# English stop words as a set.
# NOTE(review): stop_words is not referenced by the other cells shown here; the
# regex built in the next cell reads stopwords.words('english') directly.
stop_words = set(stopwords.words('english'))

In [22]:
# Regex matching any English stop word as a whole word, plus trailing whitespace.
# Usage: pattern.sub('', text)
#
# Improvements over the plain '|'.join(...) alternation:
# - IGNORECASE: capitalised stop words ("The", "And") are removed as well; the
#   Keras Tokenizer lower-cases text by default, so they would otherwise come
#   back as ordinary stop-word tokens.
# - Longest alternative first: "don't" is tried before "don", so a contraction
#   is removed as a whole instead of leaving a stray apostrophe fragment.
# - re.escape: each word is matched literally even if it contains a character
#   that is special in a regex.
#
# NOTE(review): this rebinds `pattern`, which the cleaning cells near the top of
# the notebook use for the retweet-prefix regex; re-running those cells after
# this one would use the stop-word regex instead.
pattern = re.compile(
    r'\b('
    + r'|'.join(
        re.escape(word)
        for word in sorted(stopwords.words('english'), key=len, reverse=True)
    )
    + r')\b\s*',
    flags=re.IGNORECASE,
)

In [28]:
# Remove every stop-word match from each post.
# NOTE(review): this overwrites df_full_agg['clean_post'] in place, so any cell
# that reads df_full_agg afterwards sees the stop-word-free text.
df_full_agg['clean_post'] = df_full_agg['clean_post'].apply(
    lambda text: pattern.sub('', text)
)

In [29]:
# Inspect the first post (row 0, column 0 = clean_post) after stop-word removal.
df_full_agg.iloc[0,0]

'\n\nBill Kristol Ben Shaprio, two turds toilet bowl.\n\n'

In [30]:
# One-hot encode the two categorical annotator attributes of the stop-word-free
# data set; the remaining columns pass through unchanged.
df_embedding_second = pd.get_dummies(
    df_full_agg,
    columns=['annotatorGender', 'annotatorRace'],
)

In [31]:
# Preview the encoded frame: text, age, rating, then the one-hot columns.
df_embedding_second.head()

Unnamed: 0,clean_post,annotatorAge,offensiveYN,annotatorGender_man,annotatorGender_na,annotatorGender_nonBinary,annotatorGender_transman,annotatorGender_woman,annotatorRace_asian,annotatorRace_black,annotatorRace_hisp,annotatorRace_na,annotatorRace_native,annotatorRace_other,annotatorRace_white
0,"\n\nBill Kristol Ben Shaprio, two turds toilet...",41.0,1.0,1,0,0,0,0,0,0,0,0,0,0,1
1,"\n\nBill Kristol Ben Shaprio, two turds toilet...",42.0,1.0,1,0,0,0,0,0,0,0,0,0,0,1
2,"\n\nBill Kristol Ben Shaprio, two turds toilet...",39.0,1.0,0,0,0,0,1,0,0,0,0,0,0,1
3,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,25.0,0.0,1,0,0,0,0,0,0,0,0,0,0,1
4,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,30.0,0.0,0,0,0,0,1,0,0,0,0,0,0,1


### Prepare data for model consumption

In [32]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    df_embedding_second,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [33]:
# offensiveYN is rated 0, 0.5 or 1. to_categorical casts its input to integers,
# so passing the raw values truncated 0.5 to 0 and left the third class empty.
# Map the ratings to class ids 0 / 1 / 2 before one-hot encoding.
y_train = to_categorical((train['offensiveYN'].values * 2).astype(int), 3)
y_test = to_categorical((test['offensiveYN'].values * 2).astype(int), 3)

In [34]:
# Training inputs: raw text, numeric age, and the one-hot columns (position 3 onwards).
text_train, num_train = (
    train[column].to_numpy() for column in ('clean_post', 'annotatorAge')
)
cat_train = train.iloc[:, 3:].values

In [35]:
# num_words=5000 limits the ids produced by texts_to_sequences by word frequency.
tokenizer = Tokenizer(num_words=5000)
# Learn the vocabulary from the training posts only.
tokenizer.fit_on_texts(texts=text_train)

In [36]:
# Held-out inputs, extracted the same way as the training inputs.
text_test, num_test = (
    test[column].to_numpy() for column in ('clean_post', 'annotatorAge')
)
cat_test = test.iloc[:, 3:].values

In [37]:
# Convert each post to a list of word ids using the fitted tokenizer.
X_train_text, X_test_text = (
    tokenizer.texts_to_sequences(texts) for texts in (text_train, text_test)
)

In [38]:
# Sequence length used when padding in the next cell.
maxlen = 100
# +1 for the reserved index 0.
vocab_size = 1 + len(tokenizer.word_index)

In [39]:
# Bring every id sequence to length `maxlen`, padding at the end (padding='post').
X_train_text, X_test_text = (
    pad_sequences(seqs, padding='post', maxlen=maxlen)
    for seqs in (X_train_text, X_test_text)
)

In [40]:
# 50-dimensional GloVe vectors, one row per word of the tokenizer vocabulary.
embedding_dim = 50
embedding_matrix = create_embedding_matrix(
    filepath='glove.6B.50d.txt',
    word_index=tokenizer.word_index,
    embedding_dim=embedding_dim,
)

In [41]:
# Sanity check: (number of training posts, maxlen).
X_train_text.shape

(70772, 100)

In [42]:
# Sanity check: (number of training posts, number of one-hot columns).
cat_train.shape

(70772, 12)

In [43]:
# Rule of thumb for the categorical embedding width: half the number of
# one-hot columns, rounded up, capped at 50.
cat_embedding = min(
    np.ceil(cat_train.shape[1] / 2),
    50,
)
cat_embedding_size = int(cat_embedding)

### Feed training data into the model and monitor validation-set performance

In [44]:
# Three inputs: padded token ids, one-hot annotator attributes, annotator age.
inp_text_data = keras.layers.Input(shape=(X_train_text.shape[1],))
inp_cat_data = keras.layers.Input(shape=(cat_train.shape[1],))
inp_num_data = keras.layers.Input(shape=(1,))

# Fix: the GloVe matrix built in an earlier cell was never handed to the layer,
# so the pre-trained vectors were silently unused. It now initialises the
# embedding weights; the layer stays trainable, so the vectors are fine-tuned.
emb = keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    input_length=maxlen,
)(inp_text_data)
# NOTE(review): this embedding is indexed by the 0/1 values of the one-hot
# columns, so only rows 0 and 1 are ever looked up — confirm this is intended.
emb_2 = keras.layers.Embedding(input_dim=cat_train.shape[1], output_dim=cat_embedding_size)(inp_cat_data)

# Text branch: two stacked 1-D convolutions followed by global max pooling.
conv_1 = keras.layers.Conv1D(128, 5, activation='relu')(emb)
conv_2 = keras.layers.Conv1D(128, 5, activation='relu')(conv_1)
pool = keras.layers.GlobalMaxPooling1D()(conv_2)
flatten = keras.layers.Flatten()(pool)
flatten_2 = keras.layers.Flatten()(emb_2)

# Merge text features, annotator-attribute features and age.
conc = keras.layers.Concatenate()([flatten,flatten_2, inp_num_data])
Dense_1 = keras.layers.Dense(128, activation='relu')(conc)
# Fix: the three classes are mutually exclusive and the loss is categorical
# cross-entropy, which expects a probability distribution over the classes;
# softmax provides that, independent sigmoids do not.
out = keras.layers.Dense(3, activation='softmax')(Dense_1)

model = keras.Model(inputs=[inp_text_data,inp_cat_data, inp_num_data], outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The held-out split is used as validation data to monitor each epoch.
history = model.fit([X_train_text,cat_train,num_train], y_train,
                    epochs=7,
                    validation_data=([X_test_text,cat_test,num_test], y_test),
                    batch_size=5)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
