In [2]:
import pandas as pd, numpy as np, tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPool1D
from keras.models import Model
from Cleaning import clean

In [None]:
# Read the raw training CSV into a DataFrame, decoding the file as Latin-1

train = pd.read_csv(
    filepath_or_buffer='Raw Datasets/train.csv',
    encoding='latin-1',
)

In [None]:
# Split the frame into model inputs (cleaned comment text) and targets (label columns)

list_classes = [
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate",
]

# Run the project's `clean` helper over every comment, one value at a time
list_sentences_train = train["comment_text"].apply(clean)

# One column per label in `list_classes`, in that order
y = train[list_classes].values

In [None]:
# Convert the cleaned comments into fixed-length integer sequences

max_features = 20000  # passed to the Tokenizer as num_words (vocabulary cap)
maxlen = 100          # length every sequence is padded / truncated to

# Build the word index from the cleaned training comments
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Map each comment to its list of word indices, then bring all lists to `maxlen`
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = tf.keras.preprocessing.sequence.pad_sequences(
    list_tokenized_train,
    maxlen=maxlen,
)

In [3]:
# Making the model

# The input length, vocabulary size and number of outputs are taken from the
# preprocessing cells (`maxlen`, `max_features`, `list_classes`) instead of
# being repeated as literals, so changing them there keeps the network in
# sync with the data it is trained on.
inp = Input(shape=(maxlen, ))

# size of the vector space
embed_size = 128
x = Embedding(max_features, embed_size)(inp)

# number of LSTM units; return_sequences=True keeps one output per time step
output_dimention = 60
x = LSTM(output_dimention, return_sequences=True, name='lstm_layer')(x)
# reduce dimension: collapse the time axis by taking the max over it
x = GlobalMaxPool1D()(x)
# disable 10 percent of the nodes
x = Dropout(0.1)(x)
# pass output through a ReLU function
x = Dense(50, activation="relu")(x)
# another 10% dropout
x = Dropout(0.1)(x)
# one sigmoid unit per label in `list_classes`: each label is an
# independent binary (0,1) decision, so several can be active at once
x = Dense(len(list_classes), activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
# binary_crossentropy is applied to each label's sigmoid output;
# optimise the loss with the Adam optimiser
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [4]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 128)          2560000   
                                                                 
 lstm_layer (LSTM)           (None, 100, 60)           45360     
                                                                 
 global_max_pooling1d (Globa  (None, 60)               0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 60)                0         
                                                                 
 dense (Dense)               (None, 50)                3050      
                                                             

In [None]:
# Fit the network on the padded sequences and label matrix:
# 2 epochs, mini-batches of 32, with 10% of the samples held out for validation

model.fit(
    x=X_t,
    y=y,
    batch_size=32,
    epochs=2,
    validation_split=0.1,
)

In [None]:
# Write the trained network to an .h5 file relative to the working directory.
# NOTE: only the model is saved here; this cell does not persist the fitted tokenizer.
model.save(filepath='profanity_model.h5')