In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters shared by the tokenizer, padding, and model cells below.
vocab_size = 200_000     # Tokenizer num_words and Embedding input size
embedding_dim = 16       # second argument of the Embedding layer
max_length = 80          # pad_sequences maxlen and Embedding input_length
trunc_type = 'post'      # `truncating` argument of pad_sequences
padding_type = 'post'    # `padding` argument of pad_sequences
oov_tok = '<OOV>'        # `oov_token` argument of Tokenizer
training_size = 140_000  # not referenced by any of the visible cells

In [3]:
import csv
import numpy as np
import pandas as pd

# Load the labelled corpus: column 0 is parsed as an integer label,
# column 1 is kept as the raw sentence text.
labels = []
sentences = []

# newline='' hands newline handling to the csv module, so quoted fields that
# contain line breaks are read as a single record (as the csv docs require).
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
with open('clean_data.csv', encoding='utf-8-sig', newline='') as file:
    data = csv.reader(file, delimiter=',')
    for item in data:
        if not item:
            # csv.reader yields [] for blank lines; item[0] would raise IndexError.
            continue
        labels.append(int(item[0]))
        sentences.append(item[1])

# NumPy copies of the parallel lists, consumed by the tokenizer and fit cells.
train_sentences = np.array(sentences)
train_labels = np.array(labels)


In [4]:
# Fit the tokenizer on the raw sentences; num_words and oov_token come from
# the configuration cell.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(sentences)

# Encode every sentence as a list of integer ids, then pad/truncate each
# sequence to max_length.
training_sequences = tokenizer.texts_to_sequences(train_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)


In [5]:
# Binary classifier: embedding -> bidirectional LSTM -> dense -> sigmoid.
# Layers are appended in the same order the original list declared them.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 80, 16)            3200000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               41472     
_________________________________________________________________
dense (Dense)                (None, 24)                3096      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 3,244,593
Trainable params: 3,244,593
Non-trainable params: 0
_________________________________________________________________


In [6]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Weights-only checkpoint file, kept under ./checkpoint alongside the saved
# model instead of a machine-specific absolute directory.
checkpoint_path = './checkpoint/profanity_model.weights.h5'
# Make sure the target directory exists before training starts.
tf.io.gfile.makedirs('./checkpoint')

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    # fit() below is given no validation data, so the default monitor
    # ('val_loss') is never reported and save_best_only would skip every save.
    # Track the training loss instead.
    monitor='loss',
    verbose=1,
    save_weights_only=True,
    save_best_only=True,
    save_freq='epoch')

num_epochs = 5
# The callback must be passed to fit(); otherwise it is never invoked and
# nothing is written during training.
history = model.fit(
    training_padded,
    train_labels,
    epochs=num_epochs,
    callbacks=[cp_callback],
    verbose=1)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
model.save('./checkpoint/profanity_model.h5')
# Saves the trained model to ./checkpoint/profanity_model.h5.

In [8]:
# Reload the model from the file written by the save cell, bound to a separate
# name so it can be compared against the in-memory `model`.
sep = tf.keras.models.load_model(
    './checkpoint/profanity_model.h5',
)

In [9]:
# Print the layer summary of the reloaded model.
sep.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 80, 16)            3200000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               41472     
_________________________________________________________________
dense (Dense)                (None, 24)                3096      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 3,244,593
Trainable params: 3,244,593
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Score one sample string with the reloaded model, using the same
# tokenize -> pad pipeline that produced the training inputs.
sentence = ['fuck']
tokenized = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    tokenized,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
print(sep.predict(padded))

[[0.99990195]]


In [20]:
import pickle

# Persist the fitted tokenizer next to the saved model so the same
# word-to-id mapping can be restored later.
with open('./checkpoint/tokenizer.pickle', mode='wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)