In [2]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
from contextlib import suppress
with suppress(Exception):
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.preprocessing.text import Tokenizer

EMBEDDING_FILE=f'./glove.6B.50d.txt'
EMBEDDING_FILE
TRAIN_DATA_FILE=f'./train.csv'
TEST_DATA_FILE=f'./test.csv'
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)
list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
list_sentences_test = test["comment_text"].fillna("_na_").values
max(len(w) for w in train.comment_text)
max(len(w) for w in list_sentences_test)

embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

#Standard keras preprocessing, to turn each comment into a list of word indexes of equal length 
tokenizer = Tokenizer(num_words=max_features)
tokenizer


tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

#Read the glove word vectors (space delimited strings) into a dictionary from word->vector.
def get_coefs(word,*arr): 
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(get_coefs(*o.strip().split()) 
                        for o in open(EMBEDDING_FILE))

#Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe.
#We'll use the same mean and stdev of embeddings the GloVe has when generating the random init.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std


word_index = tokenizer.word_index

nb_words = min(max_features, len(word_index))


embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

#Simple bidirectional LSTM with two fully connected layers. 
#We add some dropout to the LSTM since even 2 epochs is enough to overfit.
inp = Input(shape=(maxlen,))

embedding_layer = Embedding(input_dim=max_features, output_dim=embed_size, trainable=False)
embedding_layer.build((None,))
embedding_layer.set_weights([embedding_matrix])

x = embedding_layer(inp)

x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_t, y, batch_size=32, epochs=2)


Epoch 1/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 25ms/step - accuracy: 0.8980 - loss: 0.0882
Epoch 2/2
[1m4987/4987[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 25ms/step - accuracy: 0.9869 - loss: 0.0551


<keras.src.callbacks.history.History at 0x31ef2f510>

In [16]:
model.summary()

In [18]:
model.save_weights("new_model_BiDir_LSTM.weights.h5")

In [19]:
import h5py as h5
f = h5.File("./new_model_BiDir_LSTM.weights.h5", "w")

In [20]:
f

<HDF5 file "new_model_BiDir_LSTM.weights.h5" (mode r+)>

In [None]:
# evaluate loaded model on test data
#model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.compile(optimizer='sgd', loss='categorical_crossentropy',metrics=['accuracy'])


In [5]:
score = model.evaluate(X_t, y, verbose=0)


In [6]:
score

[0.04831726849079132, 0.9939212203025818]

In [23]:
# Example new text
new_text = ""

# Preprocess the new text
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=maxlen)

# Make predictions
predictions = model.predict(padded_sequence)

# Interpret the predictions
for i, class_name in enumerate(list_classes):
    print(f'{class_name}: {predictions[0][i]}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
toxic: 0.9995021224021912
severe_toxic: 0.5920392274856567
obscene: 0.9957861304283142
threat: 0.009605024009943008
insult: 0.9587054252624512
identity_hate: 0.16442282497882843


In [28]:
import numpy as np

# Path to the GloVe word vectors file
EMBEDDING_FILE = './glove.6B.50d.txt'

# Read the GloVe word vectors (space delimited strings) into a dictionary from word->vector.
def load_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

# Load the GloVe word vectors
embeddings_index = load_embeddings(EMBEDDING_FILE)

# Example new text
new_text = "This is a new text"

# Preprocess the new text
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=maxlen)

# Make predictions
predictions = model.predict(padded_sequence)

# Interpret the predictions
for i, class_name in enumerate(list_classes):
    print(f'{class_name}: {predictions[0][i]}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
toxic: 0.0088625093922019
severe_toxic: 7.812467083567753e-05
obscene: 0.002116367919370532
threat: 8.311592682730407e-05
insult: 0.0014033680781722069
identity_hate: 0.00010831199324456975


In [24]:
# Example new text
new_text = "You are adorable, beautiful and great personality"

# Preprocess the new text
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=maxlen)

# Make predictions
predictions = model.predict(padded_sequence)

# Interpret the predictions
for i, class_name in enumerate(list_classes):
    print(f'{class_name}: {predictions[0][i]}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
toxic: 0.0660712867975235
severe_toxic: 0.0004642491403501481
obscene: 0.010510834865272045
threat: 0.000436986651038751
insult: 0.012644685804843903
identity_hate: 0.0007772337994538248


In [26]:
# Example new text
new_text = "This is a new text."

# Preprocess the new text
sequence = tokenizer.texts_to_sequences([new_text])
padded_sequence = pad_sequences(sequence, maxlen=maxlen)

# Make predictions
predictions = model.predict(padded_sequence)

# Interpret the predictions
for i, class_name in enumerate(list_classes):
    print(f'{class_name}: {predictions[0][i]}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
toxic: 0.0088625093922019
severe_toxic: 7.812467083567753e-05
obscene: 0.002116367919370532
threat: 8.311592682730407e-05
insult: 0.0014033680781722069
identity_hate: 0.00010831199324456975


In [15]:
model.summary()