In [11]:
#pip installs
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn


In [9]:
#imports
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding
from tensorflow import keras

In [60]:
# Load the training data from data/train.csv into a DataFrame
train_csv_path = os.path.join('data', 'train.csv')
df = pd.read_csv(train_csv_path)

In [11]:
#uncomment the below codeline to print data within the csv file
# df.head()

In [12]:
# Inputs: the raw comment strings from the "comment_text" column (a pandas Series)
comments_x = df['comment_text']

# Targets: every column after the first two, as a 2D array with one row per comment
# (the first column is the id, which is not needed, and the second is comment_text itself)
label_columns = df.columns[2:]
labels_y = df[label_columns].values

# In short: comments_x holds the comments and labels_y holds the matching label rows


In [13]:
# Vocabulary size cap for the vectorization layer (consider lowering it if VRAM is limited)
maxWords = 200_000


In [14]:
# Build the text vectorization layer: integer token ids, a vocabulary capped at
# maxWords entries and a fixed output length of 1800 tokens per comment
vectorizer_layer = TextVectorization(
    output_mode='int',
    max_tokens=maxWords,
    output_sequence_length=1800,
)

# Learn the vocabulary from the raw comment strings
raw_comments = comments_x.values
vectorizer_layer.adapt(raw_comments)
    

In [15]:
# Run every comment through the adapted layer to obtain its integer token sequence
raw_comments = comments_x.values
vectorized_data = vectorizer_layer(raw_comments)


In [16]:
# Pair each vectorized comment with its label row and wrap the pairs in a tf.data dataset
features_and_labels = (vectorized_data, labels_y)
dataset = tf.data.Dataset.from_tensor_slices(features_and_labels)

In [17]:
# Cache the dataset's elements; this is applied before shuffling and batching
dataset = dataset.cache()


In [18]:
# Shuffle the examples once, using a buffer of 160000 elements.
# reshuffle_each_iteration=False is essential here: with the default (True) a new
# permutation is drawn every time the dataset is iterated, so the take()/skip()
# based train/val/test subsets built from this dataset would hold different,
# overlapping examples on every pass (validation/test data leaking into training).
# Trade-off: the example order stays the same from one epoch to the next.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)


In [19]:
# Group the examples into batches of 16 (reduce the batch size if VRAM is limited)
dataset = dataset.batch(16)


In [20]:
# Prefetch with a buffer size of 8; prefetching generally helps input-pipeline performance
# (details: https://www.tensorflow.org/api_docs/python/tf/data/Dataset#prefetch)
dataset = dataset.prefetch(8) 

In [21]:
# Partition the batched dataset: the first 70% of the batches for training,
# the following 20% for validation and the last 10% for testing
total_batches = len(dataset)
train_batches = int(total_batches*.7)
val_batches = int(total_batches*.2)
test_batches = int(total_batches*.1)

train = dataset.take(train_batches)
test = dataset.skip(int(total_batches*.9)).take(test_batches)
val = dataset.skip(train_batches).take(val_batches)

In [22]:
# Model definition

# Start from an empty Sequential model; layers are appended to it with model.add()
model = Sequential()


In [24]:
# Embedding layer: maxWords+1 possible token ids, each mapped to a 32-dimensional vector
embedding = Embedding(input_dim=maxWords+1, output_dim=32)
model.add(embedding)


In [25]:
# Bidirectional wrapper around a 32-unit LSTM with tanh activation
lstm = LSTM(units=32, activation='tanh')
model.add(Bidirectional(lstm))


In [26]:
# Fully connected feature-extractor layers: 128 -> 256 -> 128 units, all ReLU
for units in (128, 256, 128):
    model.add(Dense(units, activation='relu'))


In [27]:
# Output layer: six sigmoid units, i.e. six independent scores per comment
output_layer = Dense(units=6, activation='sigmoid')
model.add(output_layer)

In [28]:
# Compile the model: Adam optimizer, binary cross-entropy loss
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
)

In [30]:
#Training the model for 1-epoch for now (feel free to adjust it as per your system specs)
# Trained_Model = model.fit(train, epochs=1, validation_data=val)

In [None]:
#uncomment and use for saving our trained model with .h5 extension so we can use it outside of this notebook ;)
#(model.fit returns a History object, which has no save method, so save through the model itself)
#model.save('Model.h5')

In [58]:
# Now let's test the model. Training needs more VRAM than is available for this
# notebook, so the toxicity.h5 file that was trained on Google Colab is loaded instead.

#loading the model
# The directory is spelled 'data', exactly as in the path used to read train.csv,
# so that both paths resolve on case-sensitive filesystems as well.
# Note: this rebinds `model`, replacing the untrained model assembled above.
model_path = os.path.join('data','toxicity.h5')
model = keras.models.load_model(model_path)

#Time to test!

#taking an input
test_input = input("Input something: ")

#putting the input for prediction
# The single input string is vectorized on its own; np.expand_dims then adds a
# leading axis so that predict() receives a batch containing one sequence.
input_text = vectorizer_layer(test_input)
res = model.predict(np.expand_dims(input_text,0))



Input something: asshole


In [59]:
# Render the prediction in a human-readable form: one "label: True/False" line per
# label column, where True means the predicted score is above 0.5
label_names = df.columns[2:]
scores = res[0]
text = ''.join(
    '{}: {}\n'.format(label, scores[position]>0.5)
    for position, label in enumerate(label_names)
)

text

'toxic: False\nsevere_toxic: False\nobscene: False\nthreat: False\ninsult: False\nidentity_hate: False\n'