# 0. Install Dependencies and Bring in Data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tensorflow tensorflow-gpu pandas matplotlib sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (588.3 MB)
[K     |████████████████████████████████| 588.3 MB 23 kB/s 
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
Collecting tensorflow-gpu
  Downloading tensorflow_gpu-2.10.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.1 MB)
[K     |████████████████████████████████| 578.1 MB 8.2 kB/s 
[?25h  Downloading tensorflow_gpu-2.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (578.1 MB)
[K     |████████████████████████████████| 578.1 MB 7.0 kB/s 
[?25h  Downloading tensorflow_gpu-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (511.8 MB)
[K     |████████████████████████████████| 511.8 MB 28 kB/s 
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone


In [5]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [6]:
os.path.join('jigsaw-toxic-comment-classification -challenge', 'train.csv', 'train.csv')

'jigsaw-toxic-comment-classification -challenge/train.csv/train.csv'

In [7]:
df = pd.read_csv("/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/train.csv/train.csv")

In [8]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# 1. Preprocess

In [None]:
!pip list

In [10]:
from tensorflow.keras.layers import TextVectorization

In [11]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [12]:
df[df.columns[2:]]

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [13]:
MAX_FEATURES = 200000 # number of words in the vocab

In [14]:
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [15]:
vectorizer.adapt(X.values)

In [16]:
vectorized_text = vectorizer(X.values)

In [17]:
#MCSHBAP - map, chache, shuffle, batch, prefetch  from_tensor_slices, list_file
#here we are creating dataset and then in each step we are appling data pre processing
#pipeline steps
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16) #divide into batches
dataset = dataset.prefetch(8) # helps bottlenecks

In [22]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

# 2. Create Sequential Model

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [20]:
model = Sequential()
# Create the embedding layer 
model.add(Embedding(MAX_FEATURES+1, 32))
# Bidirectional LSTM Layer
# +1 is added for unknown words
model.add(Bidirectional(LSTM(32, activation='tanh')))
# tanh for gpu acivation
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer 
model.add(Dense(6, activation='sigmoid'))

In [21]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
history = model.fit(train, epochs=1, validation_data=val)

  90/6981 [..............................] - ETA: 2:53:16 - loss: 0.2125

In [1]:
from matplotlib import pyplot as plt

In [2]:
plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot()
plt.show()

NameError: ignored

<Figure size 576x360 with 0 Axes>

# 3. Make Predictions

In [45]:
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [50]:
res = model.predict(np.expand_dims(input_text,0))



In [51]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [52]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [53]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [55]:
res.shape

(1, 6)

In [56]:
(res > 0.5).astype(int)

array([[1, 0, 1, 0, 1, 0]])

In [57]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [58]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [59]:
res.shape

(1, 6)

# 4. Evaluate Model

In [60]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [61]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [73]:
for batch in test.as_numpy_iterator(): 
  #iterating over the batch
    # Unpack the batch 
    X_true, y_true = batch
    # Make a prediction 
    yhat = model.predict(X_true)
    
    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()
    
    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [74]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

Precision: 0.8931885361671448, Recall:0.7287371158599854, Accuracy:0.5323781967163086


# 5. Test and Gradio

In [63]:
!pip install gradio jinja2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gradio
  Downloading gradio-3.12.0-py3-none-any.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 922 kB/s 
Collecting markdown-it-py[linkify,plugins]
  Downloading markdown_it_py-2.1.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.4 MB/s 
[?25hCollecting python-multipart
  Downloading python-multipart-0.0.5.tar.gz (32 kB)
Collecting websockets>=10.0
  Downloading websockets-10.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 76.2 MB/s 
Collecting fastapi
  Downloading fastapi-0.88.0-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 4.7 MB/s 
Collecting pycryptodome
  Downloading pycryptodome-3.16.0-cp35-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.3 MB)


In [64]:
import tensorflow as tf
import gradio as gr

In [65]:
model.save('toxicity.h5')

In [66]:
model = tf.keras.models.load_model('toxicity.h5')

In [67]:
input_str = vectorizer('hey i freaken hate you!')

In [68]:
res = model.predict(np.expand_dims(input_str,0))



In [69]:
res

array([[0.7947936 , 0.00197563, 0.06873158, 0.01358512, 0.31772915,
        0.05019622]], dtype=float32)

In [70]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)
    
    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, results[0][idx]>0.5)
    
    return text

In [71]:
interface = gr.Interface(fn=score_comment, 
                         inputs=gr.inputs.Textbox(lines=2, placeholder='Comment to score'),
                        outputs='text')



In [3]:
interface.launch(share=True)

NameError: ignored