In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import keras_tuner as kt

In [None]:
def restructure_dataset(df,aggressive):
    df = df.copy()
    df.drop(axis=1,columns=['No.'],inplace=True)
    # df['Message'] = df['Message'].replace([NaN], pd.NA)
    df["Message"] = df["Message"].astype('string') 
    df['Message'] = df['Message'].replace({pd.NA: None})
    df = df.dropna(subset=['Message'])
    df["Cyberbullying"] = aggressive
    return df

In [2]:
aggressive_df = pd.read_csv("data/Aggressive.csv")
aggressive_df = restructure_dataset(aggressive_df,True)
aggressive_df

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying
...,...,...
47687,"Black ppl aren't expected to do anything, depe...",ethnicity
47688,Turner did not withhold his disappointment. Tu...,ethnicity
47689,I swear to God. This dumb nigger bitch. I have...,ethnicity
47690,Yea fuck you RT @therealexel: IF YOURE A NIGGE...,ethnicity


In [None]:
non_aggressive_df = pd.read_csv("data/Non_Aggressive.csv")
non_aggressive_df = restructure_dataset(non_aggressive_df,False)
non_aggressive_df

In [3]:
data = pd.concat([aggressive_df,non_aggressive_df])
data

### Data splitting

In [15]:
text_features = "Message"
target_feature = "Cyberbullying"

train_data, test_data = train_test_split(data, test_size=0.2, random_state=123)

X_train = train_data.drop(columns=target_feature)
y_train = train_data[target_feature]

X_test = test_data.drop(columns=target_feature)
y_test = test_data[target_feature]

### Data Preprocessing

In [16]:
max_text_length = 280

def text_to_sequence(texts):
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=max_text_length)

In [17]:
X_train["Message"] = text_to_sequence(X_train["Message"])
X_test["Message"] = text_to_sequence(X_test["Message"])

### Model Building

In [8]:
def model_builder(hp):
    model = Sequential()
    model.add(Embedding(10000, output_dim=hp.Int('embedding_dim', 50, 200, step=50), input_length=max_text_length))
    model.add(LSTM(hp.Int('lstm_units', 32, 128, step=32), return_sequences=False))
    model.add(Dropout(hp.Float('dropout_rate', 0.2, 0.5, step=0.1)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

### Hypterparameter Optimization

In [9]:
tuner = kt.Hyperband(hypermodel=model_builder,
                     objective='val_accuracy',
                     max_epochs=5,
                     factor=3,
                     directory='rnn_results',
                     project_name='cyberbullying_detection')




### Model fitting

In [10]:
tuner.search(X_train, y_train, epochs=10, validation_split=0.2)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(best_hps.values)
best_model = tuner.hypermodel.build(best_hps)

best_model.fit(X_train, y_train, epochs=20, validation_split=0.2)

Trial 10 Complete [00h 09m 53s]
val_accuracy: 0.8620102405548096

Best val_accuracy So Far: 0.8669899106025696
Total elapsed time: 01h 03m 50s
Epoch 1/20




[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 71ms/step - accuracy: 0.8428 - loss: 0.3388 - val_accuracy: 0.8666 - val_loss: 0.2653
Epoch 2/20
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 71ms/step - accuracy: 0.9020 - loss: 0.2056 - val_accuracy: 0.8645 - val_loss: 0.2829
Epoch 3/20
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 65ms/step - accuracy: 0.9257 - loss: 0.1586 - val_accuracy: 0.8621 - val_loss: 0.3683
Epoch 4/20
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 64ms/step - accuracy: 0.9394 - loss: 0.1240 - val_accuracy: 0.8556 - val_loss: 0.4060
Epoch 5/20
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 65ms/step - accuracy: 0.9505 - loss: 0.0988 - val_accuracy: 0.8447 - val_loss: 0.4735
Epoch 6/20
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 65ms/step - accuracy: 0.9571 - loss: 0.0833 - val_accuracy: 0.8308 - val_loss: 0.5572
Epoch 7/20
[1m954/954[0m 

NameError: name 'X_test' is not defined

In [19]:
y_pred = best_model.predict(X_test)
print(classification_report(y_test, (y_pred > 0.5).astype(int)))

[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step
              precision    recall  f1-score   support

       False       0.36      0.29      0.32      1519
        True       0.87      0.90      0.89      8020

    accuracy                           0.81      9539
   macro avg       0.62      0.60      0.60      9539
weighted avg       0.79      0.81      0.80      9539

