In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.models import load_model
import joblib

In [2]:
# Read dataset/train.csv (path relative to the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset/train.csv')

In [3]:
# Display the frame for a quick look at its columns and values.
# With a negative n, DataFrame.head returns every row except the last |n| rows,
# so this shows all rows but the final one (the notebook display truncates the middle).
# NOTE(review): a plain df.head() preview may have been intended — confirm.
df.head(-1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159565,ffe8b9316245be30,The numbers in parentheses are the additional ...,0,0,0,0,0,0
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0


In [4]:
# Narrow the frame to the two columns used below: the comment text and the `toxic` flag.
df = df.loc[:, ['comment_text', 'toxic']]

In [5]:
# Print a summary of the frame: index range, columns, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


### Preprocessing

In [8]:
# Model input: the comment text, cast to str so every entry is a string.
X = df.loc[:, 'comment_text'].astype(str)
# Target: the `toxic` column.
y = df.loc[:, 'toxic']

In [9]:
# Encode labels
# Fit the encoder on the target values, then replace y with the encoded class indices.
# The fitted encoder is kept so predictions can be mapped back with inverse_transform.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

### Feature Extraction

In [10]:
# Tokenization
# Vocabulary cap handed to the tokenizer as num_words; also reused as the
# embedding layer's input_dim further down.
max_words = 10_000

# Build the word index from the comments, then turn each comment into a list of word ids.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts=X)
X_seq = tokenizer.texts_to_sequences(texts=X)

In [11]:
# Padding sequences
# Fixed sequence length fed to the network.
max_len = 200

# Bring every sequence to exactly max_len ids. 'pre' padding/truncating are the
# library defaults, spelled out here so the behavior is visible.
X_pad = pad_sequences(
    sequences=X_seq,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)

### Training

In [12]:
# Hold out 20% of the padded sequences (and matching labels) as a test set;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Size of each learned word vector.
embedding_dim = 100

# Embedding -> single LSTM -> one sigmoid unit (binary output).
model = Sequential([
    # Maps each of the max_words token ids to an embedding_dim-sized vector.
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    # Recurrent layer with dropout on both the inputs and the recurrent state.
    LSTM(units=128, dropout=0.2, recurrent_dropout=0.2),
    # Single sigmoid output used as the binary prediction score.
    Dense(units=1, activation='sigmoid'),
])




In [14]:
# Configure training: binary cross-entropy loss (matches the single sigmoid output),
# Adam optimizer, and accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)




In [15]:
# Training hyperparameters.
epochs, batch_size = 5, 32

# Train the network. The cell ends on the fit() call so the returned History
# object is shown as the cell output.
# NOTE(review): the held-out test split is also passed as validation data here.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x278f04f9550>

In [16]:
# Score the trained model on the test split; evaluate() yields the loss followed
# by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.14108331501483917, Test Accuracy: 0.9592354893684387


In [20]:
# Persist the trained model in HDF5 format under app/, next to the tokenizer and
# label-encoder pickles, which is also the path the reload cell reads from.
# (Previously this wrote to 'model.h5' in the working directory, so the reload of
# 'app/model.h5' did not pick up the model trained in this notebook.)
model.save('app/model.h5')

  saving_api.save_model(


### Prediction

In [40]:
# Persist the fitted tokenizer so the same word index can be reused outside this notebook.
# joblib.dump returns the list of written file names, shown as the cell output.
joblib.dump(value=tokenizer, filename='app/tokenizer.pkl')

['app/tokenizer.pkl']

In [41]:
# Persist the fitted label encoder alongside the tokenizer.
# joblib.dump returns the list of written file names, shown as the cell output.
joblib.dump(value=label_encoder, filename='app/label_encoder.pkl')

['app/label_encoder.pkl']

In [30]:
# Reload the saved model from the app/ directory into its own variable.
Toxic_model = load_model(filepath='app/model.h5')

In [35]:
# Sanity-check the persisted model on a single hand-written comment.
new_comment = "You are the worst person I have ever met. I hate you."

# Apply the same preprocessing used for training: word ids, then padding to max_len.
new_comment_seq = tokenizer.texts_to_sequences([new_comment])
new_comment_pad = pad_sequences(new_comment_seq, maxlen=max_len)

# Score with the model reloaded from disk (Toxic_model) rather than the in-memory
# `model`, so this cell actually exercises the saved artifact.
predictions = Toxic_model.predict(new_comment_pad)

# Threshold the sigmoid scores at 0.5 in one vectorized step (one score per input
# row, flattened to 1-D), then map the 0/1 class indices back to the original labels.
predicted_classes = (predictions.ravel() > 0.5).astype(int)
predicted_label = label_encoder.inverse_transform(predicted_classes)

print("Predicted Label:", predicted_label)

Predicted Label: [1]


### TensorFlow.js conversion

In [5]:
!pip install tensorflowjs

^C


In [1]:
import tensorflow as tf




In [3]:
# Load the HDF5 model from api/model/ through tf.keras. This rebinds the name `model`.
# NOTE(review): no visible cell writes to 'api/model/'; presumably the file is
# copied there separately — confirm.
model = tf.keras.models.load_model(filepath='api/model/model.h5')




In [7]:
# Run the tensorflowjs_converter CLI on the Keras HDF5 file api/model/model.h5,
# writing the converted output into api/model/.
# NOTE(review): the recorded output of this cell is a traceback raised while the
# tensorflowjs package was being imported; the cause is not visible here — confirm
# the installed tensorflowjs/TensorFlow versions are compatible before re-running.
!tensorflowjs_converter --input_format keras api/model/model.h5 api/model/

2024-04-04 15:59:45.062461: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\habee\AppData\Local\Programs\Python\Python311\Scripts\tensorflowjs_converter.exe\__main__.py", line 4, in <module>
  File "C:\Users\habee\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflowjs\__init__.py", line 21, in <module>
    from tensorflowjs import converters
  File "C:\Users\habee\AppData\Local\Programs\Python\Python311\Lib\site-packages\tensorflowjs\converters\__init__.py", line 21, in <module>
    from tensorflowjs.converters.converter import convert
  File "C:\Users\habee\AppData\Local\Programs\Python\Python311\Lib