# Jigsaw Unintended Bias in Toxicity Classification


## Download Data From Kaggle

In [None]:
# Colab-only setup: mount Google Drive at /content/drive.
# NOTE(review): `google.colab` presumably ships with the Colab runtime and is
# not provided by the PyPI `google` package — confirm this install is needed.
!pip install google
from google.colab import drive
drive.mount("/content/drive")

In [None]:
from google.colab import files

In [None]:
# Open the Colab upload widget; the following cell moves an uploaded
# kaggle.json into ~/.kaggle, so that is the file expected here.
files.upload()

In [None]:
!pip install --upgrade --force-reinstall --no-deps kaggle
!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification
!mkdir data
!unzip ./*.zip -d ./data/65

## Libraries

In [None]:
!pip install contractions

In [None]:
import pandas as pd
from prettytable import PrettyTable
import nltk
from nltk.corpus import stopwords
from tqdm import trange
import contractions
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence


In [None]:
# Fetch the NLTK stop-word corpus and keep the English list; simplify_comment
# filters tokens against it.
nltk.download('stopwords')
stopwords_list = stopwords.words("english")
# Do not truncate long text columns when pandas displays a frame.
pd.set_option("display.max_colwidth", None)

In [None]:
# Training hyperparameters.
EPOCH = 50  # epoch count passed to model.fit (an EarlyStopping callback is also attached)
EMBED_SIZE = 300  # output dimension of the Embedding layer

## Helper Functions

In [None]:
def simplify_comment(comment):
    """Normalize a raw comment into lowercase, stop-word-free tokens.

    The text has its contractions expanded, literal backslash escape text
    blanked out, and every run of non-alphanumeric characters replaced by a
    space. The remaining words are lowercased and English stop words are
    dropped.

    Args:
        comment: Raw comment text.

    Returns:
        The surviving tokens joined by single spaces ('' if none survive).
    """
    expanded = contractions.fix(comment)
    # Literal "\n" / "\r" escape text and any stray backslash become spaces.
    for escape in ("\\n", "\\r", "\\"):
        expanded = expanded.replace(escape, ' ')
    alnum_only = re.sub(r"[^A-Za-z0-9]+", ' ', expanded)

    # Lowercase BEFORE the stop-word test: the stop-word list is lowercase, so
    # testing the original casing lets "The", "I", "This", ... slip through.
    # split() already discards blanks, so no extra stripping is needed.
    tokens = (word.lower() for word in alnum_only.split())
    return ' '.join(token for token in tokens if token not in stopwords_list)

## Data Cleansing

In [None]:
# Load the competition CSVs; training keeps a random tenth of the rows.
# NOTE(review): this ../input path differs from the ./data/65 directory the
# download cell unzips into — confirm which environment is intended.
train_path = "../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv"
test_path = "../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv"

df = pd.read_csv(train_path)
df = df.sample(n=len(df) // 10)
df_test = pd.read_csv(test_path)
df.head(10)

In [None]:
# Print a small table with the dataset dimensions.
pt = PrettyTable()
pt.field_names = ["Area", "Count"]
for area, count in (
    ("Features", len(df.columns)),
    ("Train data rows", len(df)),
    ("Test data rows", len(df_test)),
):
    pt.add_row([area, count])
print(pt)

In [None]:
# Sort once by target score; the two ends of the ordering are the extremes.
by_target = df.sort_values(by="target")
least_toxic = by_target.iloc[0]
most_toxic = by_target.iloc[-1]

In [None]:
# Show the raw text of the row with the highest target score.
print(f"Most toxic comment: {most_toxic['comment_text']}")

In [None]:
# Show the raw text of the row with the lowest target score.
print(f"Least toxic comment: {least_toxic['comment_text']}")

In [None]:
# Clean every training comment in place.
df["comment_text"] = df["comment_text"].apply(simplify_comment)

In [None]:
# Preview the cleaned training comments.
df["comment_text"].head()

In [None]:
# Clean every test comment in place, with the same routine as the training set.
df_test["comment_text"] = df_test["comment_text"].apply(simplify_comment)

In [None]:
# Preview the cleaned test comments.
df_test["comment_text"].head()

In [None]:
# Turn the continuous target score into a 0/1 label with a 0.5 threshold.
# Both masks are taken from the original scores before anything is overwritten.
toxic_mask = df["target"] >= 0.5
benign_mask = df["target"] < 0.5
df.loc[toxic_mask, "target"] = 1
df.loc[benign_mask, "target"] = 0

In [None]:
# Fraction of missing values per column.
df.isnull().sum()/len(df)

In [None]:
# Collect the distinct words of the training comments and the length (in
# tokens) of the longest comment.
all_words = set()
maxlen = -9999
for sentence in tqdm(df["comment_text"]):
    tokens = sentence.split()
    maxlen = max(maxlen, len(tokens))
    all_words.update(tokens)
# Vocabulary budget handed to the tokenizer: a tenth of the distinct words.
num_words = len(all_words) // 10

## Data Visualization

In [None]:
# Histogram of the target column, normalised to a density.
# Eleven edges give ten 0.1-wide bins spanning the whole [0, 1] range;
# np.arange(0, 1, 0.1) ends at 0.9, which leaves every target above 0.9
# (including the label 1) outside the bins and out of the plot.
sns.displot(df["target"], bins=np.linspace(0, 1, 11), stat="density")

In [None]:
# Kernel-density estimate of the target column.
# NOTE(review): `target` was binarised in the Data Cleansing section, so this
# presumably shows two peaks rather than the original score distribution — confirm intent.
sns.displot(df, x="target", kind="kde")

In [None]:
# Word cloud built from a random sample of comments with target > 0.7.
# NOTE(review): `target` was binarised earlier in the notebook, so this filter
# presumably selects the rows labelled 1 — confirm the 0.7 cut-off is intended.
toxic_comments = df.loc[df["target"] > 0.7, "comment_text"]
# Cap the sample size so a small subset cannot make .sample() raise.
toxic_sample = toxic_comments.sample(min(len(df) // 100, len(toxic_comments)))
# Join the actual comment strings: str(Series) yields pandas' display repr
# (index labels, row truncation, name/dtype footer), not the comment text.
text = ' '.join(toxic_sample)
wc = WordCloud(
    background_color='white',
    width=1920,
    height=1080,
)
wc.generate_from_text(text)
plt.figure(figsize=(12, 12))
plt.title("Target > 0.7")
plt.axis("off")
plt.tight_layout(pad=0)
plt.imshow(wc)
plt.show()

In [None]:
# Word cloud built from a random sample of comments with target < 0.7.
# NOTE(review): `target` was binarised earlier in the notebook, so this filter
# presumably selects the rows labelled 0 — confirm the 0.7 cut-off is intended.
benign_comments = df.loc[df["target"] < 0.7, "comment_text"]
# Cap the sample size so a small subset cannot make .sample() raise.
benign_sample = benign_comments.sample(min(len(df) // 10, len(benign_comments)))
# Join the actual comment strings: str(Series) yields pandas' display repr
# (index labels, row truncation, name/dtype footer), not the comment text.
text = ' '.join(benign_sample)
wc = WordCloud(
    background_color='white',
    width=1920,
    height=1080,
)
wc.generate_from_text(text)
plt.figure(figsize=(12, 12))
plt.title("Target < 0.7")
plt.axis("off")
plt.tight_layout(pad=0)
plt.imshow(wc)
plt.show()

## Data Preprocessing

In [None]:
# Preview the cleaned test comments before tokenisation.
df_test["comment_text"].head()

In [None]:
# Fit the word index on the train and test comments together, limited to the
# num_words budget.
tokenizer = Tokenizer(num_words=num_words)
corpus = df["comment_text"].tolist() + df_test["comment_text"].tolist()
tokenizer.fit_on_texts(corpus)

In [None]:
# Encode each comment as a list of integer word indices.
X_train, X_test = (
    tokenizer.texts_to_sequences(frame["comment_text"])
    for frame in (df, df_test)
)

In [None]:
# Pad every encoded comment to maxlen so both sets become rectangular arrays.
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

## Model Building

In [None]:
# Text classifier: Embedding -> BiGRU -> Conv1D -> pooled features -> MLP head.
input_layer = tf.keras.layers.Input(shape=(maxlen,))

# No pretrained vectors are loaded in this notebook, so the embedding is
# learned from scratch. Freezing an all-zero weight matrix (trainable=False)
# would map every token to the same zero vector and starve the network of
# any information about the text.
x = tf.keras.layers.Embedding(num_words, EMBED_SIZE)(input_layer)
x = tf.keras.layers.SpatialDropout1D(0.2)(x)
x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = tf.keras.layers.Conv1D(128, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform", activation="relu")(x)

# Summarise the sequence dimension with both average and max pooling.
avg_pool = tf.keras.layers.GlobalAveragePooling1D()(x)
max_pool = tf.keras.layers.GlobalMaxPooling1D()(x)
x = tf.keras.layers.concatenate([avg_pool, max_pool])

x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.2)(x)

x = tf.keras.layers.Dense(64, activation="relu")(x)
x = tf.keras.layers.Dropout(0.2)(x)

# Single sigmoid unit: the target was binarised to 0/1.
output = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model = tf.keras.models.Model(inputs=input_layer, outputs=output)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

# Stop once val_loss has failed to improve for 3 consecutive epochs.
es = tf.keras.callbacks.EarlyStopping(monitor="val_loss",
                   min_delta=0,
                   patience=3,
                   verbose=0, mode="auto")

# save_best_only=False: a checkpoint file is written after every epoch, with
# the epoch number in its name.
best_model = "./model_{epoch:02d}.h5"
checkpoint = tf.keras.callbacks.ModelCheckpoint(best_model, monitor="val_loss", verbose=0, save_best_only=False, mode='auto')

In [None]:
# Train on the padded sequences with 10% of the samples held out for
# validation; early stopping and per-epoch checkpointing are attached.
model.fit(X_train, df["target"], batch_size=256, epochs=EPOCH, callbacks=[es, checkpoint], validation_split=0.1)