In [None]:
!pip3 install ktrain
import ktrain
from ktrain import text

Collecting ktrain
  Downloading ktrain-0.41.1.tar.gz (25.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.3/25.3 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langdetect (from ktrain)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting syntok>1.3.3 (from ktrain)
  Downloading syntok-1.4.4-py3-none-any.whl (24 kB)
Collecting tika (from ktrain)
  Downloading tika-2.6.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<=4.37.2 (from ktrain)
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m106.0 MB/s[0m eta [36m0:00:00[0m
Collecting keras_bert>=0.86.0 (from ktrain)
  Downloa

# BERT

In [None]:
import pandas as pd
import numpy as np
import re
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the labelled comments; later cells read the `comment` and `isHate` columns.
data = pd.read_csv("dataset.csv")

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the split reproducible.
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)


In [None]:
# Raw texts and labels for each split, as plain Python lists.
X_train = data_train.comment.tolist()
X_test = data_test.comment.tolist()

y_train = data_train.isHate.tolist()
y_test = data_test.isHate.tolist()

# NOTE(review): this rebinds `data` to the train rows followed by the test rows,
# so re-running the split cell afterwards starts from a different row order.
data = pd.concat([data_train, data_test], ignore_index=True)

# Display names indexed by integer label: `isHate == 0` -> 'non hate',
# `isHate == 1` -> 'hate'. The previous order named label 0 'hate', which
# swapped the two rows of every report and the label returned by the predictor.
# (Assumes `isHate` is a 0/1 flag -- confirm against the CSV.)
class_names = ['non hate', 'hate']

In [None]:
# This cell rebinds `y_train` / `y_test` to ktrain's preprocessed label arrays.
# Take the raw labels from the split frames rather than from those names, so
# re-running the cell never feeds already-encoded labels back into ktrain.
raw_y_train = data_train.isHate.tolist()
raw_y_test = data_test.isHate.tolist()

# Preprocess both splits for BERT; `preproc` is kept so the same preprocessing
# can be applied at prediction time.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=X_train, y_train=raw_y_train,
    x_test=X_test, y_test=raw_y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


In [None]:
# Build ktrain's 'bert' text classifier from the preprocessed training data and its preprocessor.
model = text.text_classifier('bert', train_data=(x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 350
done.


In [None]:
# Wrap the model and data in a ktrain learner (batches of 6 examples).
# Note that the held-out test split doubles as the validation data here.
learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)

In [None]:
# Train with ktrain's one-cycle policy: 2e-5 is the maximum learning rate, 5 the number of epochs.
learner.fit_onecycle(2e-5, 5)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/5
Epoch 2/5
Epoch 3/5
 27/133 [=====>........................] - ETA: 55s - loss: 0.1512 - accuracy: 0.9506

In [None]:
# Print the per-class precision/recall/F1 report and show the confusion matrix for the test split.
learner.validate(val_data=(x_test, y_test), class_names=class_names)

              precision    recall  f1-score   support

        hate       0.80      0.86      0.83       129
    non hate       0.71      0.62      0.66        71

    accuracy                           0.78       200
   macro avg       0.76      0.74      0.75       200
weighted avg       0.77      0.78      0.77       200



array([[111,  18],
       [ 27,  44]])

In [None]:
# Pair the trained model with its preprocessor so raw strings can be passed to `predict`.
predictor = ktrain.get_predictor(learner.model, preproc)
# Display the class names the predictor reports.
predictor.get_classes()

['hate', 'non hate']

In [None]:
import time

# Example comment to classify.
message = 'Well im glad that i live in Serbia, migrants and islamists are not welcome here!'

# Time only the prediction call. `perf_counter` is a monotonic, high-resolution
# clock, so the measured interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
prediction = predictor.predict(message)
elapsed = time.perf_counter() - start_time

# Label the number as seconds so it is not mistaken for a confidence score.
print('predicted: {} ({:.2f} s)'.format(prediction, elapsed))

predicted: non hate (0.10)


## BERT + LSTM (Hybrid)

In [None]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, GlobalAveragePooling1D, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import BertTokenizer, TFBertModel

In [None]:
# Load the labelled comments again for the hybrid model (same CSV file as the BERT section).
dataset = pd.read_csv('dataset.csv')

def clean_text(text):
    """Strip non-ASCII characters and punctuation from a comment.

    Each run of non-ASCII characters is replaced by a single space, then every
    character that is neither a word character (letter, digit, underscore) nor
    whitespace is removed. Whitespace is not collapsed or trimmed.

    Args:
        text: The raw comment. Missing values (``None`` or float ``NaN``, which
            is what pandas yields for an empty CSV cell) are treated as an
            empty comment; any other non-string value is converted with ``str``.

    Returns:
        The cleaned string.
    """
    # `re.sub` raises TypeError on non-strings, so normalise missing values first.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Non-ASCII runs -> one space
    text = re.sub(r'[^\w\s]', '', text)  # Drop punctuation
    return text

# Clean every comment, writing the result back into the same column.
dataset['comment'] = dataset['comment'].map(clean_text)

# Model inputs are the cleaned comment strings; targets are the `isHate` values as an array.
X = dataset['comment']
y = dataset['isHate'].values



In [None]:
# Sequence length shared by tokenisation and padding.
max_len = 350
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Turn each comment into a list of token ids (special tokens included),
# truncated to at most `max_len` ids.
X_encoded = [
    tokenizer.encode(
        comment,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
    )
    for comment in X
]

# Pad at the end with id 0 (and truncate at the end if needed) so every row has `max_len` ids.
X_padded = pad_sequences(
    X_encoded,
    maxlen=max_len,
    dtype="long",
    value=0,
    truncating="post",
    padding="post",
)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y,
    test_size=0.2,
    random_state=42,
)

# Define model
# Single integer input: one row of `max_len` token ids per comment.
input_layer_text = Input(shape=(max_len,), dtype=np.int32)

# The sequences fed to this model are padded at the end with id 0. Derive the
# padding mask inside the graph so the model keeps its single input, and use it
# everywhere padding would otherwise leak in:
#   * BERT must not attend to padded positions (`attention_mask`),
#   * the LSTM must skip padded timesteps,
#   * the average pooling must average over real tokens only.
padding_mask = tf.not_equal(input_layer_text, 0)
attention_mask = tf.cast(padding_mask, tf.int32)

# Index 0 of the BERT output is the per-token last hidden state.
bert_embedding = TFBertModel.from_pretrained('bert-base-uncased')(
    input_layer_text, attention_mask=attention_mask
)[0]
lstm_layer = Bidirectional(LSTM(64, return_sequences=True))(bert_embedding, mask=padding_mask)
pooled_output = GlobalAveragePooling1D()(lstm_layer, mask=padding_mask)
output_layer = Dense(1, activation='sigmoid')(pooled_output)  # Output layer for binary classification

model = Model(inputs=input_layer_text, outputs=output_layer)

# `summary()` prints the table itself and returns None, so wrapping it in
# `print()` only appended a stray "None" to the output.
model.summary()


Model: "model_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_29 (InputLayer)       [(None, 350)]             0         
                                                                 
 tf_bert_model_18 (TFBertMo  TFBaseModelOutputWithPo   109482240 
 del)                        olingAndCrossAttentions             
                             (last_hidden_state=(Non             
                             e, 350, 768),                       
                              pooler_output=(None, 7             
                             68),                                
                              past_key_values=None,              
                             hidden_states=None, att             
                             entions=None, cross_att             
                             entions=None)                       
                                                          

In [None]:
# Compile the model.
# Fine-tuning all BERT weights needs a small step size. Adam's default
# learning rate (1e-3) is far too large for this: the recorded run collapsed to
# predicting a single class. Use 2e-5, the same peak rate the ktrain BERT run
# in this notebook trains with.
model.compile(optimizer=Adam(learning_rate=2e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Baseline: loss and accuracy on the training data before any fine-tuning.
train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
print("Initial Training Loss:", train_loss)
print("Initial Training Accuracy:", train_accuracy)

# Train for 5 epochs in batches of 6, holding out 20% of the training rows for validation.
history = model.fit(X_train, y_train, epochs=5, batch_size=6, validation_split=0.2, verbose=1)

Initial Training Loss: 0.6881103515625
Initial Training Accuracy: 0.602756917476654
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the model on the held-out test split.
# The sigmoid output has one column per row; threshold it at 0.5 (the same
# rule the single-prediction cell uses) and flatten to a 1-D label vector.
y_pred = model.predict(X_test)
y_pred_classes = (y_pred > 0.5).astype(int).ravel()

# `target_names` must follow ascending label order: 0 -> 'non hate', 1 -> 'hate'.
# The previous order named label 0 'hate', swapping the two rows of the report.
class_labels = ['non hate', 'hate']

# Print classification report
print(classification_report(y_test, y_pred_classes, target_names=class_labels))


              precision    recall  f1-score   support

        hate       0.65      1.00      0.78       129
    non hate       0.00      0.00      0.00        71

    accuracy                           0.65       200
   macro avg       0.32      0.50      0.39       200
weighted avg       0.42      0.65      0.51       200



In [None]:
import time

def predict_single(message, model, tokenizer, max_len):
    """Score one raw message with the given model.

    The message is passed through `clean_text`, encoded with `tokenizer`
    (special tokens added, truncated to `max_len`), padded at the end with 0 to
    `max_len`, and handed to `model.predict` as a batch of one.

    Returns:
        The first element of the first row of the model's prediction.
    """
    token_ids = tokenizer.encode(
        clean_text(message),
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
    )
    batch = pad_sequences(
        [token_ids],
        maxlen=max_len,
        dtype="long",
        value=0,
        truncating="post",
        padding="post",
    )
    scores = model.predict(batch)
    return scores[0][0]

# Example comment to classify.
message = 'Well im glad that i live in Serbia, everyone are welcome here! '

# Time only the prediction call, using a monotonic high-resolution clock.
start_time = time.perf_counter()
prediction = predict_single(message, model, tokenizer, max_len)
elapsed = time.perf_counter() - start_time

# The model is trained on `isHate` with a sigmoid output, so scores above 0.5 mean 'hate'.
class_name = "hate" if prediction > 0.5 else "non hate"

# Report the score and the latency separately; the old message printed only the
# elapsed time in a position where it read like a probability.
print('predicted: {} (score {:.2f}, {:.2f} s)'.format(class_name, prediction, elapsed))


predicted: non hate (0.10)
