In [1]:
import pandas as pd
import numpy as np
import pickle
import nltk
import random
from sklearn.model_selection import train_test_split
from pre_processing.textProcessing import TextPreProcessor
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization
from keras.utils import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import precision_score, recall_score, auc, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler

def evaluate_classifier(y_true, y_pred, print_result=False):
    """
    Score a set of predictions against the ground truth.

    Returns the tuple ``(pct_maj, accr, precision, recall)``. ``accr``,
    ``precision`` and ``recall`` come from the scikit-learn functions
    ``accuracy_score``, ``precision_score`` and ``recall_score``.
    ``pct_maj`` is ``max(mean(y_true), 1 - mean(y_true))``; for 0/1 labels
    that is the share of the majority class, i.e. the accuracy of always
    predicting the most common label.

    When ``print_result`` is true the four numbers are also printed on one line.
    """
    accr = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    positive_rate = np.mean(y_true)
    pct_maj = max(positive_rate, 1 - positive_rate)

    scores = (pct_maj, accr, precision, recall)
    if not print_result:
        return scores

    print(f"% majority class: {pct_maj}, Accuracy: {accr}, Precision: {precision}, Recall: {recall}")
    return scores

def undersample_data(X_train, y_train):
    """
    Balance a training set by random undersampling.

    Parameters
    ----------
    X_train : feature matrix to resample.
    y_train : labels for the rows of ``X_train``.

    Returns
    -------
    (X_resampled, y_resampled) as produced by
    ``RandomUnderSampler(random_state=0).fit_resample``.
    """
    rus = RandomUnderSampler(random_state=0)
    # Resample the arguments, not notebook globals: the label being modelled
    # must be the one that drives the resampling.
    X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

def get_model_output(model, X_train, Y_train, X_test, Y_test, model_name='', y_labels=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], undersample=False):
    """
    Fit ``model`` once per label and collect its test-set scores.

    Parameters
    ----------
    model : estimator exposing ``fit`` (returning the fitted estimator) and
        ``predict``. The same instance is refit for every label.
    X_train, X_test : feature matrices for the training and test rows.
    Y_train, Y_test : label tables indexable by the names in ``y_labels``.
    model_name : text stored in the ``model_name`` column of the result.
    y_labels : label columns to model, one row of output per entry.
    undersample : when true, the training data is passed through
        ``undersample_data`` separately for each label before fitting.

    Returns
    -------
    pandas.DataFrame with columns ``model_name``, ``label``, ``undersampled``,
    ``pct_maj_class``, ``accr``, ``precision`` and ``recall``.
    """
    columns = ['model_name', 'label','undersampled','pct_maj_class','accr','precision','recall']
    rows = []
    for label in y_labels:
        # Start every label from the full training set. Rebinding X_train here
        # would feed an already-resampled matrix into the next iteration.
        X_fit, y_fit = X_train, Y_train[label]
        y_test = Y_test[label]

        if undersample:
            X_fit, y_fit = undersample_data(X_fit, y_fit)

        # Fit the model and predict on the untouched test set
        fitted = model.fit(X_fit, y_fit)
        pred = fitted.predict(X_test)

        # One result row per label; the frame is built once after the loop
        rows.append([model_name, label, undersample] + list(evaluate_classifier(y_true=y_test, y_pred=pred)))
    return pd.DataFrame(rows, columns=columns)

## Data Loading

In [2]:
train = pd.read_csv('../data/train.csv')

# Optional: uncomment to work on a smaller sample while iterating
# train = train.sample(10000, random_state=1)
labels = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]

## Training and testing split
# Seeds Python's built-in `random` module; the split itself is pinned by random_state below.
random.seed(923)

# Hold out 25% of the comments, then renumber every piece from 0 so that
# positional and label-based indexing agree downstream.
X_train, X_test, Y_train, Y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train.comment_text,
        labels,
        test_size=0.25,
        random_state=23,
    )
)

print(f"Shape of the training set: {X_train.shape}")
print(f"Shape of the testing set: {X_test.shape}")

Shape of the training set: (119678,)
Shape of the testing set: (39893,)


In [3]:
# Show the label columns carried by the training targets
Y_train.columns

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

## EDA

In [None]:
# NOTE: this is a bare expression that is not the last statement of the cell,
# so with default IPython settings its result is not rendered.
train.describe()

# NOTE(review): `plt` and `WordCloud` are not imported by the import cell at the
# top of this notebook; it needs `import matplotlib.pyplot as plt` and
# `from wordcloud import WordCloud` for this section to run on a fresh kernel.
def wordcloud(column, colormap, value = True, data = None):
    """
    Draw a word cloud from the comments selected by an indicator column.

    Parameters
    ----------
    column : name of the column to filter on; also used in the plot title.
    colormap : colormap name handed to ``WordCloud.recolor``.
    value : rows where ``column == value`` are used. Any falsy value
        (``False``, ``0``, ``None``) selects rows equal to 1 instead, so
        ``value=False`` and ``value=True`` pick the same rows of a 0/1 column.
    data : frame holding ``column`` and ``comment_text``; defaults to the
        notebook-level ``train`` frame.

    The image is drawn on the current matplotlib figure; nothing is returned.
    """
    frame = train if data is None else data
    wanted = value if value else 1
    text = frame.loc[frame[column] == wanted, 'comment_text'].values

    word = WordCloud(width = 1400, height =800,
                    background_color = 'white',
                    max_words = 3000,
                    random_state = 50,
                    ).generate(' '.join(text))
    plt.axis('off')
    plt.title(f'High frequency words in {column.title()} Comments', fontsize = 20)
    plt.imshow(word.recolor(colormap = colormap, random_state = 17))

# `empty_cat` is only added to `train` further down the notebook, so derive it
# here on a temporary copy. `train` itself is left untouched so the later
# cells still see the original columns.
plt.figure(figsize = (12,12))
wordcloud(
    'empty_cat',
    'viridis',
    data=train.assign(empty_cat=lambda df: df[labels.columns].sum(axis=1).eq(0).astype(int)),
)

In [None]:
# Word cloud for the `threat` column. A falsy `value` makes wordcloud() fall
# back to the rows where the column equals 1.
plt.figure(figsize=(12, 12))
wordcloud(column='threat', colormap='Wistia', value=False)

In [None]:
print('Number of Labels by Label')
# Select the label columns by name. A positional slice (iloc[:, 2:]) silently
# starts including `empty_cat` once that column has been added to `train`.
label_count = train[labels.columns].sum()
label_count

In [None]:
# Check for records that have multiple classifications.
# Sum over the named label columns only, so the count stays correct even after
# `empty_cat` has been appended to `train` (e.g. when cells are re-run).
classified = train[labels.columns].sum(axis=1)

mult_class = classified.value_counts()

# A Series bar plot takes its x/y from the index and values; the x=/y=
# arguments appear to be ignored by current pandas, so they are not passed.
mult_class.plot(kind='bar',
                xlabel='Number of Labels',
                ylabel='Records')
plt.title('Number of Labels per Record');

In [None]:
# Check for records that have no classifications and data imbalance.
# The flag is computed straight from the named label columns, so this cell
# gives the same answer however often it is re-run and does not rely on a
# `classified` value left behind by another cell.
train['empty_cat'] = train[labels.columns].sum(axis=1).eq(0).astype(int)

# Totals for the six labels plus the "no label" flag
cat_tot = train[[*labels.columns, 'empty_cat']].sum()

cat_tot.plot(kind='bar',
             xlabel='Label',
             ylabel='Records')
plt.title('Number of Labels by Label');

In [None]:
# Headline counts; `empty_cat` is 1 for comments that carry no label
n_comments = len(train)
n_unlabelled = train['empty_cat'].sum()

print('Total comments:', n_comments)
print('Total nontoxic comments:', n_unlabelled)
print('Total toxic comments:', n_comments - n_unlabelled)
print('Total labels:' , train.loc[:, 'toxic':'identity_hate'].sum().sum())

## Pre-processing

In this section, we will pre-process the data into two kinds of formats:

1. Document-Term Matrix
2. Padded numeric sequences

We will use the DTM for the non-RNN models, and the padded numeric sequences for the recurrent-neural-network-based models.

### Processing the documents into DTMs

In [4]:
def tokenize(text):
    """
    Tokenizer callback: run one document through TextPreProcessor and return
    whatever its ``process()`` method produces.

    The processor is built with ``lemma_flag=True`` and ``stem_flag=False``
    (presumably lemmatization without stemming; see TextPreProcessor).
    """
    processor = TextPreProcessor(text=text, lemma_flag=True, stem_flag=False)
    return processor.process()

# Fit a unigram tf-idf vectorizer on the training comments, dropping terms that
# appear in fewer than 15 documents or in more than 90% of the documents
vec_tfidf = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 1),
    min_df=15,
    max_df=0.9,
)
vec_tfidf_fitted = vec_tfidf.fit(X_train)

  "The parameter 'token_pattern' will not be used"


In [None]:
# Turn both splits into document-term matrices with the already-fitted vectorizer
X_train_DTM, X_test_DTM = (
    vec_tfidf_fitted.transform(docs) for docs in (X_train, X_test)
)

In [None]:
# Preview the first five rows. Slice the sparse matrix *before* densifying:
# calling .toarray() on the whole training matrix allocates a dense
# (documents x vocabulary) array just to display five rows.
pd.DataFrame(X_train_DTM[:5].toarray())

### Processing the documents into padded sequences

In [None]:
VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 150

# Create the tokenizer and learn the word index from the training documents
t = Tokenizer()
t.fit_on_texts(X_train)

# The word index of the keras Tokenizer is ordered by frequency, so keeping the
# entries whose index is <= VOCAB_SIZE keeps the most frequent words and
# everything else maps to the out-of-vocabulary token. Workaround taken from
# https://github.com/keras-team/keras/issues/8092
t.oov_token = '_unknown_'
t.word_index = {
    word: idx
    for word, idx in t.word_index.items()
    if idx <= VOCAB_SIZE  # <= because the tokenizer is 1-indexed
}
t.word_index[t.oov_token] = VOCAB_SIZE + 1

# Apply the tokenizer
encoded_docs = t.texts_to_sequences(X_train)

# Pad the sequences at the end up to MAX_SEQUENCE_LENGTH
padded_docs = pad_sequences(encoded_docs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Show one document at each stage of the pipeline
for caption, sample in (
    ("Original text:", X_train[0]),
    ("Vectorized:", encoded_docs[0]),
    ("Padded:", padded_docs[0]),
):
    print(caption)
    print(sample)

## Modeling

### Multinomial Naive Bayes Classifier

In [None]:
# Multinomial Naive Bayes on the tf-idf matrix, with and without undersampling
model = MultinomialNB()
non_undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Naive Bayes',
    undersample=False
)

undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Naive Bayes',
    undersample=True
)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same table
multi_nb_results = pd.concat([non_undersampled, undersampled], ignore_index=True)
multi_nb_results

### Logistic Regression

In [None]:
# Logistic regression on the tf-idf matrix, with and without undersampling
non_undersampled = get_model_output(
    LogisticRegression(),
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Logistic Regression',
    undersample=False
)

undersampled = get_model_output(
    LogisticRegression(),
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Logistic Regression',
    undersample=True
)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same table
logicstic_regression_results = pd.concat([non_undersampled, undersampled], ignore_index=True)

In [None]:
# Display the logistic regression results table
logicstic_regression_results

### Linear SVC

In [None]:
# Linear SVC on the tf-idf matrix, with and without undersampling
model = LinearSVC()
non_undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Linear SVC',
    undersample=False
)

undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Linear SVC',
    undersample=True
)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same table
svc_results = pd.concat([non_undersampled, undersampled], ignore_index=True)
svc_results

### Random Forest

In [None]:
# Random forest (200 trees) on the tf-idf matrix, with and without undersampling.
# random_state pins the forest so the reported scores are reproducible.
model = RandomForestClassifier(n_estimators=200, random_state=0)
non_undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Random Forest (200 trees)',
    undersample=False
)

undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='Random Forest (200 trees)',
    undersample=True
)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same table
rf_results = pd.concat([non_undersampled, undersampled], ignore_index=True)
rf_results

In [None]:
# Display the random forest results table (same frame the previous cell ends with)
rf_results

### XGBoost

In [None]:
# XGBoost (200 trees) on the tf-idf matrix, with and without undersampling
model = XGBClassifier(n_estimators=200)
non_undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='XGBoost (200 trees)',
    undersample=False
)

undersampled = get_model_output(
    model,
    X_train_DTM,
    Y_train,
    X_test_DTM,
    Y_test,
    model_name='XGBoost (200 trees)',
    undersample=True
)

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same table
xgb_results = pd.concat([non_undersampled, undersampled], ignore_index=True)
xgb_results

### Neural Network

In [None]:
"""
Model 1: Bidirectional LSTM - not undersampled
"""
def create_model(vocab_size, num_labels, sequence_length):
    """
    Build the (uncompiled) bidirectional-LSTM multi-label classifier.

    Parameters
    ----------
    vocab_size : size of the embedding table (``input_dim`` of the Embedding).
    num_labels : number of sigmoid output units, one per label.
    sequence_length : length of each padded input sequence.

    Returns
    -------
    tf.keras.Sequential: Input -> Embedding(64, mask_zero) -> Dropout(0.4)
    -> Bidirectional LSTM(64) -> Dense(25, relu) -> Dense(num_labels, sigmoid).
    """
    return tf.keras.Sequential([
        # The Input layer already fixes the sequence length, so the Embedding
        # no longer passes the redundant `input_length` argument, which
        # newer Keras releases flag as deprecated.
        layers.Input(shape=(sequence_length,)),
        layers.Embedding(input_dim=vocab_size, output_dim=64, mask_zero=True),
        layers.Dropout(0.4),
        layers.Bidirectional(
            layers.LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2)
        ),
        layers.Dense(25, activation='relu'),
        layers.Dense(num_labels, activation='sigmoid'),
    ])

# +1 because index 0 is reserved for padding; one output unit per label column
model = create_model(
    len(t.word_index)+1,
    num_labels=Y_train.shape[1],
    sequence_length=MAX_SEQUENCE_LENGTH,
)
model.compile(
    loss=losses.binary_crossentropy,
    optimizer='adam',
    # Name the metric explicitly: the generic 'accuracy' alias is resolved by
    # Keras and, for a multi-unit output, may not be the per-label binary
    # accuracy that a multi-label problem needs.
    metrics=['binary_accuracy']
)

# Stop once the validation loss has failed to improve for two epochs.
# Taken from tf.keras so the callback comes from the same Keras as the model.
cb = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

batch_size = 32
num_epochs = 20

# The last 20% of the training rows are held out for validation
history = model.fit(
    padded_docs,
    Y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=num_epochs,
    verbose=1,
    callbacks=cb,
)

In [None]:
# Encode and pad the test comments exactly like the training comments
encoded_doc_test = t.texts_to_sequences(X_test)
padded_doc_test = pad_sequences(encoded_doc_test, maxlen = MAX_SEQUENCE_LENGTH, padding = 'post')
pred = model.predict(padded_doc_test)

# Build the results table, one row per label. Column i of `pred` is scored
# against column i of Y_test; predictions are thresholded at 0.5 in one
# vectorized comparison per label.
model_name = 'Bi-LSTM + Embedding'
undersampled = False
bilstm_rows = [
    [model_name, label, undersampled]
    + list(evaluate_classifier(y_true=Y_test[label], y_pred=pred[:, i] > 0.5))
    for i, label in enumerate(Y_test.columns)
]
bilstm_results = pd.DataFrame(
    bilstm_rows,
    columns=['model_name', 'label','undersampled','pct_maj_class','accr','precision','recall'],
)

In [None]:
# Display the Bi-LSTM results table
bilstm_results

In [None]:
"""
Model 2: LSTM only
"""

## Consolidate results

In [None]:
# Stack the per-model result tables into a single frame (row indices are kept as-is)
result_tables = [
    multi_nb_results,
    svc_results,
    logicstic_regression_results,
    rf_results,
    xgb_results,
    bilstm_results,
]
all_results = pd.concat(result_tables)

In [None]:
# Display the consolidated results for every model
all_results

In [None]:
# Write the consolidated table to disk without the row index
all_results.to_csv(path_or_buf='../output/all_results.csv', index=False)