In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

Define classes

In [None]:
class CyberbullyingPreprocessor:
    """
    Stateless text cleaner for raw comment strings.
    """

    # URLs (anything starting with "http"), @mentions and #hashtags
    _NOISE_PATTERN = re.compile(r"http\S+|@\w+|#\w+")
    # Any character that is neither a word character nor whitespace
    _PUNCTUATION_PATTERN = re.compile(r"[^\w\s]")

    def __init__(self):
        # Nothing to configure: the cleaner holds no per-instance state.
        pass

    def clean(self, text):
        """
        Return a normalised copy of `text`.

        The string is lowercased, then URLs, @mentions and #hashtags are
        stripped, and finally every remaining punctuation character is
        dropped. Whitespace is left untouched, and underscores survive
        because the regex treats them as word characters.
        """
        lowered = text.lower()
        without_noise = self._NOISE_PATTERN.sub("", lowered)
        return self._PUNCTUATION_PATTERN.sub("", without_noise)

class FeedforwardNeuralNet:
    """
    Feedforward network with one hidden layer for binary classification.

    Architecture: input -> dense + ReLU -> dense + sigmoid (single output).
    Trained with full-batch gradient descent on binary cross-entropy.
    """
    def __init__(self, input_size, hidden_size=64, learning_rate: float=0.01, epochs=100, seed=None):
        """
        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
            learning_rate: step size used for the gradient descent updates.
            epochs: number of full-batch passes performed by `train`.
            seed: optional integer seed for the weight initialisation. When
                None (default) the weights are drawn from NumPy's global
                random state, so a prior `np.random.seed(...)` still applies.
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = 1  # binary classification
        self.lr = learning_rate
        self.epochs = epochs

        # Loss value of every epoch of the most recent `train` call
        self.loss_history = []

        # Source of random numbers: the global state, or a private seeded one
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Initialize weights with small random values and biases with zeros
        self.W1 = rng.randn(self.input_size, self.hidden_size) * 0.01
        self.b1 = np.zeros((1, self.hidden_size))

        self.W2 = rng.randn(self.hidden_size, self.output_size) * 0.01
        self.b2 = np.zeros((1, self.output_size))

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), evaluated without overflow."""
        # exp is only ever applied to non-positive values, so it cannot
        # overflow; the two branches are algebraically the same function.
        e = np.exp(-np.abs(z))
        return np.where(np.asarray(z) >= 0, 1 / (1 + e), e / (1 + e))

    def sigmoid_derivative(self, a):
        """Derivative of the sigmoid expressed through its output `a`."""
        return a * (1 - a)

    def relu(self, z):
        """ReLU activation: g(z) = max(0, z), applied element-wise."""
        return np.maximum(0, z)

    def relu_derivative(self, z):
        """ReLU derivative: 1 where z > 0, otherwise 0 (including at z == 0)."""
        return (z > 0).astype(float)

    def binary_cross_entropy(self, y_true, y_pred):
        """
        Mean binary cross-entropy between labels and predicted probabilities.

        Both arguments are flattened first. Without this, 1-D labels of shape
        (n,) combined with column predictions of shape (n, 1) would broadcast
        to an (n, n) matrix and yield a wrong loss value.
        """
        y_true = np.asarray(y_true, dtype=float).reshape(-1)
        y_pred = np.asarray(y_pred, dtype=float).reshape(-1)
        epsilon = 1e-9  # small value to avoid log(0)
        return -np.mean(y_true * np.log(y_pred + epsilon) + (1 - y_true) * np.log(1 - y_pred + epsilon))

    def forward(self, X):
        """
        Forward pass: compute predicted probabilities for `X`.

        Caches the intermediate values (Z1, A1, Z2, A2) on the instance
        because `backward` reads them. Returns A2, shape (n_samples, 1).
        """
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = self.relu(self.Z1)

        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        self.A2 = self.sigmoid(self.Z2)

        return self.A2  # Probabilities between 0 and 1

    def backward(self, X, y_true, y_pred):
        """
        Backward pass: compute gradients and update weights in place.

        Must be called right after `forward(X)` on the same `X`, since it
        relies on the cached Z1 / A1 values.
        """
        m = X.shape[0]  # number of samples

        # Output layer gradients. For sigmoid + binary cross-entropy the
        # gradient w.r.t. Z2 simplifies to (prediction - label).
        dZ2 = y_pred - np.asarray(y_true).reshape(-1, 1)
        dW2 = np.dot(self.A1.T, dZ2) / m
        db2 = np.sum(dZ2, axis=0, keepdims=True) / m

        # Hidden layer gradients
        dA1 = np.dot(dZ2, self.W2.T)
        dZ1 = dA1 * self.relu_derivative(self.Z1)
        dW1 = np.dot(X.T, dZ1) / m
        db1 = np.sum(dZ1, axis=0, keepdims=True) / m

        # Gradient descent updates
        self.W2 -= self.lr * dW2
        self.b2 -= self.lr * db2
        self.W1 -= self.lr * dW1
        self.b1 -= self.lr * db1

    def train(self, X, y, verbose=False):
        """
        Train the network with full-batch gradient descent for `self.epochs` epochs.

        The loss of every epoch is stored in `self.loss_history`. When
        `verbose` is true, the loss is printed every 10th epoch and on the
        last one.
        """
        self.loss_history = []
        for epoch in range(self.epochs):
            y_pred = self.forward(X)
            loss = self.binary_cross_entropy(y, y_pred)
            self.loss_history.append(loss)
            self.backward(X, y, y_pred)

            if verbose and (epoch % 10 == 0 or epoch == self.epochs - 1):
                print(f"Epoch {epoch} | Loss: {loss:.4f}")

    def predict(self, X):
        """Predict binary labels (shape (n_samples, 1)) using a 0.5 threshold."""
        probs = self.forward(X)
        return (probs >= 0.5).astype(int)

    def evaluate(self, X, y):
        """Compute accuracy of the predictions for `X` against labels `y`."""
        preds = self.predict(X)
        # Flatten both sides so (n,) and (n, 1) labels are compared element-wise
        accuracy = np.mean(preds.flatten() == np.asarray(y).reshape(-1))
        return accuracy
    
# Read the processed CSV written by the previous script
filepath = "../data/preprocessing/output/"
filename = "train_subset_clean.csv"  # alternative input file: "clean_data.csv"
df = pd.read_csv(f"{filepath}{filename}")


In [18]:
# Clean text data. Missing comments are replaced with "" first; otherwise
# astype(str) would turn NaN into the literal token "nan".
preprocessor = CyberbullyingPreprocessor()
df['clean_comment'] = df['comment'].fillna("").astype(str).apply(preprocessor.clean)

# Convert cleaned text to bag-of-words count vectors, keeping only the
# 1000 most frequent terms.
# NOTE(review): the vocabulary is fit on the full dataset, i.e. before the
# train/test split below, so test-set text influences which terms are kept.
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['clean_comment']).toarray()

# Binary target as a 1-D NumPy array: 1 when the label columns of a row
# sum to more than 0, else 0
label_columns = ['toxic', 'threat', 'insult', 'discrimination']
y = (df[label_columns].sum(axis=1) > 0).astype(int).values

In [19]:
# Single seed for everything stochastic in this cell
SEED = 42

# The network draws its initial weights from NumPy's global random state;
# seed it so the metrics below are reproducible on Restart & Run All
np.random.seed(SEED)

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# train the feedforward neural network
model = FeedforwardNeuralNet(input_size=X_train.shape[1], hidden_size=32, learning_rate=0.1, epochs=100)
model.train(X_train, y_train)

# evaluate the model and report accuracy
accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# display classification report (predictions flattened from (n, 1) to (n,))
y_pred = model.predict(X_test).ravel()
print(classification_report(y_test, y_pred))

Test Accuracy: 0.7598
              precision    recall  f1-score   support

           0       0.72      0.87      0.79        92
           1       0.82      0.64      0.72        87

    accuracy                           0.76       179
   macro avg       0.77      0.76      0.76       179
weighted avg       0.77      0.76      0.76       179

