In [9]:
# Use the %pip magic so the packages are installed into the running kernel's
# environment. The spaCy model loaded later ("en_core_web_sm") is not installed
# by this command; install it once with: python -m spacy download en_core_web_sm
%pip install spacy textblob

Note: you may need to restart the kernel to use updated packages.
Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
     ---------------------------------------- 0.0/636.8 kB ? eta -:--:--
     - ------------------------------------- 30.7/636.8 kB 1.4 MB/s eta 0:00:01
     --------------- ---------------------- 256.0/636.8 kB 3.2 MB/s eta 0:00:01
     -------------------------------------  634.9/636.8 kB 5.7 MB/s eta 0:00:01
     -------------------------------------- 636.8/636.8 kB 5.0 MB/s eta 0:00:00
Collecting nltk>=3.1 (from textblob)
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ----------- ---------------------------- 0.4/1.5 MB 8.7 MB/s eta 0:00:01
     ------------------------ --------------- 0.9/1.5 MB 9.9 MB/s eta 0:00:01
     ---------------------------------------  1.5/1.5 MB 10.6 MB/s eta 0:00:01
     ---------------------------------------- 1.5/1.5 MB 10.6 MB/s 



In [2]:
# Use the %pip magic so the package is installed into the running kernel's environment.
%pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.0-py3-none-any.whl.metadata (5.3 kB)
Downloading seaborn-0.13.0-py3-none-any.whl (294 kB)
   ---------------------------------------- 0.0/294.6 kB ? eta -:--:--
   ---- ----------------------------------- 30.7/294.6 kB ? eta -:--:--
   ---------------------------------------- 294.6/294.6 kB 4.6 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Use the %pip magic so the package is installed into the running kernel's environment.
%pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.8.2-cp311-cp311-win_amd64.whl.metadata (5.9 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.2.0-cp311-cp311-win_amd64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.46.0-cp311-cp311-win_amd64.whl.metadata (159 kB)
     ---------------------------------------- 0.0/159.4 kB ? eta -:--:--
     ---------------------- ---------------- 92.2/159.4 kB 2.6 MB/s eta 0:00:01
     -------------------------------------- 159.4/159.4 kB 4.8 MB/s eta 0:00:00
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.5-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.1.1-py3-none-any.whl.metadata (5.1 kB)
Downloading matplotlib-3.8.2-cp311-cp311-win_amd64.whl (7.6 MB)
   -----------------------



In [1]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.layers import Embedding, GRU, Dense, SpatialDropout1D, BatchNormalization
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed
from textblob import TextBlob

# Load spaCy model (used by classify_severity for tokenization and lemmatization)
nlp = spacy.load("en_core_web_sm")

# Load the dataset
df = pd.read_csv('cyberbullying_tweets.csv')

# Keep only the text and label columns. Rows without tweet text are dropped and
# the text is coerced to str, because classify_severity calls tweet.lower() and
# TextBlob(tweet), both of which fail on NaN / non-string values.
df = (
    df[['tweet_text', 'cyberbullying_type']]
    .dropna(subset=['tweet_text'])
    .assign(tweet_text=lambda frame: frame['tweet_text'].astype(str))
)

# Split the data into training and testing sets (80/20, fixed seed for repeatability)
train_texts, test_texts, y_train, y_test = train_test_split(
    df['tweet_text'],
    df['cyberbullying_type'],
    test_size=0.2,
    random_state=42,
)

def classify_severity(tweet):
    """Assign a rule-based severity label ('High', 'Medium' or 'Low') to a tweet.

    Two signals are combined:
      * the TextBlob sentiment polarity of the tweet as given, and
      * the spaCy lemmas of the lower-cased tweet, checked against keyword lists.

    Returns:
        'High'   if a lemma is 'hate', 'threat' or 'violence', or polarity < -0.5;
        'Medium' if a lemma is 'insult' or 'harassment', or polarity < 0;
        'Low'    otherwise.
    """
    # Sentiment is computed on the original text; lemmas on the lower-cased text.
    polarity = TextBlob(tweet).sentiment.polarity
    lemmas = {tok.lemma_ for tok in nlp(tweet.lower())}

    # Custom rules for severity classification, checked from most to least severe.
    high_terms = {'hate', 'threat', 'violence'}
    medium_terms = {'insult', 'harassment'}

    if not lemmas.isdisjoint(high_terms) or polarity < -0.5:
        return 'High'
    if not lemmas.isdisjoint(medium_terms) or polarity < 0:
        return 'Medium'
    return 'Low'

# Apply the rule-based classification function to the training and testing sets.
# Despite the `_pred` suffix, these labels are the targets the model is trained
# and evaluated on.
y_train_pred = [classify_severity(tweet) for tweet in train_texts]
y_test_pred = [classify_severity(tweet) for tweet in test_texts]

# Preprocess the target variable.
# Fit the encoder on the fixed set of labels classify_severity can return rather
# than on whatever happens to occur in the training split: this keeps the integer
# codes stable (classes are sorted: High=0, Low=1, Medium=2), always matches the
# 3-unit softmax output layer, and prevents transform() from raising on a label
# that appears only in the test split.
label_encoder = LabelEncoder()
label_encoder.fit(['High', 'Medium', 'Low'])
y_train_encoded = label_encoder.transform(y_train_pred)
y_test_encoded = label_encoder.transform(y_test_pred)

# Vocabulary size and fixed sequence length used for the GRU input
max_words = 10000
max_len = 100

# Build the vocabulary from the training texts only; unseen words map to '<OOV>'
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

# Convert each split to integer sequences with the same fitted tokenizer
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (train_texts, test_texts)
)

# Pad / truncate at the end of each sequence so every row has length max_len
x_train, x_test = (
    pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    for sequences in (train_sequences, test_sequences)
)

# Build the GRU model
embedding_dim = 200
gru_units = 128

# Seed the Python, NumPy and TensorFlow random number generators before any
# layer is created, so weight initialisation, dropout masks and batch shuffling
# start from the same state on every run (the data split is already seeded).
set_random_seed(42)

model = Sequential()
# Token ids -> dense vectors; input_dim matches the tokenizer's num_words
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.5))
# Two stacked bidirectional GRUs; the first returns the full sequence so the
# second can consume it, the second returns only its final state.
model.add(Bidirectional(GRU(gru_units, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)))
model.add(Bidirectional(GRU(gru_units, dropout=0.5, recurrent_dropout=0.5)))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dropout(0.5))
# One output unit per severity label ('High', 'Medium', 'Low')
model.add(Dense(3, activation='softmax'))

# Implement a learning rate scheduler
def lr_scheduler(epoch, lr):
    """Return the given learning rate scaled down by a constant factor of 0.9.

    Args:
        epoch: Epoch index; accepted but not used by this schedule.
        lr: Current learning rate.

    Returns:
        ``lr`` multiplied by 0.9.
    """
    decay_factor = 0.9  # Adjust the factor as needed
    return decay_factor * lr

# Training hyperparameters
epochs = 20  # Increase the number of epochs
batch_size = 64  # Adjust the batch size

# Callbacks: decay the learning rate via lr_scheduler, and stop early when the
# validation loss has not improved for 3 epochs (restoring the best weights)
# to prevent overfitting.
lr_scheduler_callback = LearningRateScheduler(lr_scheduler)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Integer-encoded targets, hence the sparse categorical loss
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model, holding out 10% of the training data for validation
history = model.fit(
    x_train,
    y_train_encoded,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping, lr_scheduler_callback],
)

# Evaluate the model on the held-out test set
loss, accuracy = model.evaluate(x_test, y_test_encoded)
print(f'Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}')



Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Test Loss: 0.3188, Test Accuracy: 0.8925
