In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [2]:
# unzip the files
import zipfile

datasets = ("sample_submission.csv", "test.csv", "test_labels.csv", "train.csv")

for d in datasets:
    with zipfile.ZipFile("../input/jigsaw-toxic-comment-classification-challenge/"+d+".zip","r") as z:
        z.extractall(".")

In [3]:
test = pd.read_csv("/kaggle/working/test.csv")
train = pd.read_csv("/kaggle/working/train.csv")
sample_submission = pd.read_csv("/kaggle/working/sample_submission.csv")
test_labels = pd.read_csv("/kaggle/working/test_labels.csv")

# Text pre-processing

In [4]:
import re
from bs4 import BeautifulSoup


def pre_process(text):
    text = BeautifulSoup(text).get_text()
    # fetch alphabetic characters

    text = re.sub("[^a-zA-Z]", " ", text)
    # convert text to lower case
    text = text.lower()
    # split text into tokens to remove whitespaces
    tokens = text.split()
    return " ".join(tokens)

train['comment_text'] = train['comment_text'].map(pre_process)

  text = BeautifulSoup(text).get_text()
  text = BeautifulSoup(text).get_text()


In [5]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
train['comment_text'] = train['comment_text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train['word_count'] = train['comment_text'].apply(lambda x: len(str(x).split(" ")))



In [6]:
from nltk.stem.snowball import SnowballStemmer

#stemming
stemmer = SnowballStemmer("english")
def stemming(sentence):
    stemSentence = ""
    for word in sentence.split():
        stem = stemmer.stem(word)
        stemSentence += stem
        stemSentence += " "
    stemSentence = stemSentence.strip()
    return stemSentence

train['comment_text'] = train['comment_text'].apply(stemming)

## tfidf

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
countvectorizer = CountVectorizer(analyzer='word' , stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer='word' , stop_words='english',)

In [8]:
corpus = train.comment_text
X = tfidfvectorizer.fit_transform(corpus)
X.shape

(159571, 131949)

In [9]:
tfidf_tokens = tfidfvectorizer.get_feature_names_out()

In [10]:
y = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]

### sample 10,000 data to demo and get the model to run

In [11]:
import random
from random import sample
random.seed(4)
i = sample(range(X.shape[0]), 10000)

In [12]:
X_new = X[i]
X_new = X_new.toarray()
y_new = y.iloc[i]

## stratified sampling to handle imbalanced data for multilabel classification
- useful [reference](https://github.com/scikit-multilearn/scikit-multilearn/issues/194) to get data into right format

In [13]:
from skmultilearn.model_selection import iterative_train_test_split
x_train, y_train, x_test, y_test = iterative_train_test_split(X_new, y_new.values, test_size = 0.1)

# Model

In [15]:
# Imports
from sklearn.datasets import make_multilabel_classification
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [16]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [17]:
# model config
N_FEATURES = X.shape[1]
N_CLASSES = len(labels) 
N_EPOCHS = 50
RANDOM_STATE = 35
BATCH_SIZE = 250
VERBOSITY = 1
VALIDATION_SPLIT = 0.2

In [22]:
model = Sequential()
# layers get narrower, larger details captured first
model.add(Dense(64, activation='relu', input_dim=N_FEATURES))
model.add(Dense(32, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
# sigmoid layer in last layer to generate prediction value within [0,1]
model.add(Dense(N_CLASSES, activation='sigmoid'))

In [23]:
from tensorflow import keras

In [24]:
# Compile the model
model.compile(loss=binary_crossentropy, # for multi-label classification
              optimizer=Adam(),
              metrics=['accuracy', keras.metrics.AUC(name='auc')])

In [25]:
# Fit data to model
model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=N_EPOCHS,
          verbose=VERBOSITY,
          validation_split=VALIDATION_SPLIT)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7ca2be921000>

### Evlauate on test dataset

In [26]:
score = model.evaluate(x_test, y_test, verbose=0)
print(f'Test loss: {round(score[0], 4)} / AUC: {round(score[2],4)}')

Test loss: 0.2087 / AUC: 0.8205
