# In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay

class ArabicSentimentClassifier:
    """Bidirectional-LSTM text classifier initialised from FastText vectors.

    Intended call order: ``load_data`` -> ``preprocess`` -> ``load_fasttext``
    -> either ``build_model`` + ``train`` or ``cross_validate`` (which builds
    a fresh model for every fold).
    """

    def __init__(self, max_len=100, embedding_dim=300, num_classes=3):
        """Store hyper-parameters and create an unfitted tokenizer.

        Args:
            max_len: Length every token sequence is padded/truncated to.
            embedding_dim: Width of the word vectors (must match the FastText
                file passed to ``load_fasttext``).
            num_classes: Number of softmax output units.
        """
        self.max_len = max_len
        self.embedding_dim = embedding_dim
        self.num_classes = num_classes
        self.tokenizer = Tokenizer()
        self.model = None
        self.embedding_matrix = None
        # Populated by load_data() / preprocess(); initialised here so that
        # calling methods out of order fails with a clear message.
        self.texts = None
        self.labels = None
        self.word_index = {}

    def load_data(self, csv_path):
        """Read a CSV with ``Comment`` and ``Type`` columns.

        ``Type`` is coerced to numeric; rows whose label cannot be parsed or
        whose comment is missing are dropped.

        Returns:
            Tuple ``(texts, labels)`` of NumPy arrays (also stored on ``self``).
        """
        df = pd.read_csv(csv_path)
        df['Type'] = pd.to_numeric(df['Type'], errors='coerce')
        # A missing comment would otherwise become the literal token "nan"
        # after astype(str), so drop those rows together with bad labels.
        df = df.dropna(subset=['Type', 'Comment'])
        self.texts = df['Comment'].astype(str).values
        self.labels = df['Type'].values
        return self.texts, self.labels

    def preprocess(self):
        """Fit the tokenizer on the loaded texts and return padded sequences.

        Raises:
            RuntimeError: If ``load_data`` has not been called yet.
        """
        if self.texts is None:
            raise RuntimeError("No texts loaded; call load_data() before preprocess().")
        self.tokenizer.fit_on_texts(self.texts)
        sequences = self.tokenizer.texts_to_sequences(self.texts)
        self.word_index = self.tokenizer.word_index
        return pad_sequences(sequences, maxlen=self.max_len)

    def load_fasttext(self, fasttext_path):
        """Build ``self.embedding_matrix`` from a text-format vector file.

        The file is streamed and only vectors for words present in
        ``self.word_index`` are parsed, so the full embedding table is never
        held in memory. Words without a vector keep an all-zero row.

        Args:
            fasttext_path: Path to a UTF-8 file with one ``word v1 v2 ...``
                entry per line, optionally preceded by a ``count dim`` header.

        Returns:
            ``len(self.word_index) + 1`` (row 0 is left for the padding index).

        Raises:
            RuntimeError: If the vocabulary is empty (``preprocess`` not run).
            ValueError: If a vector's width differs from ``embedding_dim``.
        """
        if not self.word_index:
            raise RuntimeError("Vocabulary is empty; call preprocess() before load_fasttext().")

        print("Loading FastText embeddings...")
        num_words = len(self.word_index) + 1
        matrix = np.zeros((num_words, self.embedding_dim))
        matched_rows = set()

        with open(fasttext_path, encoding='utf-8') as f:
            for line_no, line in enumerate(f):
                values = line.rstrip().split(' ')

                # The first line of a .vec file is a "<count> <dim>" header,
                # not a word entry; parsing it as one would broadcast a
                # single number across an entire embedding row.
                if (line_no == 0 and len(values) == 2
                        and values[0].isdigit() and values[1].isdigit()):
                    if int(values[1]) != self.embedding_dim:
                        raise ValueError(
                            f"Embedding file declares dimension {values[1]}, "
                            f"expected {self.embedding_dim}."
                        )
                    continue

                row = self.word_index.get(values[0])
                if row is None:
                    continue  # word is not in our vocabulary; skip parsing

                coefs = np.asarray(values[1:], dtype='float32')
                if coefs.shape[0] != self.embedding_dim:
                    raise ValueError(
                        f"Vector for {values[0]!r} has {coefs.shape[0]} values, "
                        f"expected {self.embedding_dim}."
                    )
                matrix[row] = coefs
                matched_rows.add(row)

        self.embedding_matrix = matrix
        print(f"Found vectors for {len(matched_rows)} of {len(self.word_index)} vocabulary words.")
        return num_words

    def build_model(self, num_words, trainable=True):
        """Create and compile the BiLSTM model, storing it in ``self.model``.

        Args:
            num_words: Number of rows in the embedding table.
            trainable: Whether the embedding weights are updated in training.

        Returns:
            The compiled model (also available as ``self.model``).

        Raises:
            ValueError: If a loaded embedding matrix does not have shape
                ``(num_words, embedding_dim)``.
        """
        if self.embedding_matrix is not None:
            expected_shape = (num_words, self.embedding_dim)
            if self.embedding_matrix.shape != expected_shape:
                raise ValueError(
                    f"Embedding matrix has shape {self.embedding_matrix.shape}, "
                    f"expected {expected_shape}."
                )
            # A Constant initializer works on both older and newer Keras,
            # unlike the `weights=` / `input_length=` constructor arguments.
            initializer = tf.keras.initializers.Constant(self.embedding_matrix)
        else:
            # No pretrained vectors loaded: learn the embeddings from scratch.
            initializer = 'uniform'

        input_layer = Input(shape=(self.max_len,))
        embedding_layer = Embedding(
            input_dim=num_words,
            output_dim=self.embedding_dim,
            embeddings_initializer=initializer,
            trainable=trainable,
        )(input_layer)

        x = Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.3))(embedding_layer)
        x = Dropout(0.5)(x)
        output = Dense(self.num_classes, activation='softmax')(x)

        self.model = Model(inputs=input_layer, outputs=output)
        self.model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy'],
        )
        # summary() prints by itself and returns None, so do not wrap in print().
        self.model.summary()
        return self.model

    @staticmethod
    def _balanced_class_weights(y):
        """Return ``{class_label: weight}`` using sklearn's 'balanced' scheme."""
        classes = np.unique(y)
        weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=classes,
            y=y,
        )
        # Key by the actual label, not by position: positions and labels only
        # coincide when every label 0..k-1 is present in y.
        return {int(label): float(weight) for label, weight in zip(classes, weights)}

    def plot_history(self, history):
        """Plot training/validation loss and accuracy curves from ``fit``."""
        metrics = history.history
        _, axes = plt.subplots(1, 2, figsize=(12, 4))
        for ax, key, title in zip(axes, ('loss', 'accuracy'), ('Loss', 'Accuracy')):
            if key in metrics:
                ax.plot(metrics[key], label='train')
            if f'val_{key}' in metrics:
                ax.plot(metrics[f'val_{key}'], label='validation')
            ax.set_title(title)
            ax.set_xlabel('Epoch')
            if ax.get_legend_handles_labels()[0]:
                ax.legend()
        plt.tight_layout()
        plt.show()

    def train(self, X, y, val_split=0.1, batch_size=128, epochs=15):
        """Fit ``self.model`` with early stopping and balanced class weights.

        Args:
            X: Padded token sequences.
            y: Integer-valued class labels aligned with ``X``.
            val_split: Fraction passed to Keras as ``validation_split``.
            batch_size: Mini-batch size.
            epochs: Maximum number of epochs.

        Returns:
            The Keras ``History`` object returned by ``fit``.

        Raises:
            RuntimeError: If ``build_model`` has not been called.
        """
        if self.model is None:
            raise RuntimeError("Model not built; call build_model() before train().")

        early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

        history = self.model.fit(
            X, y,
            validation_split=val_split,
            batch_size=batch_size,
            epochs=epochs,
            callbacks=[early_stop],
            class_weight=self._balanced_class_weights(y),
        )

        self.plot_history(history)
        return history

    def cross_validate(self, X, y, folds=5, batch_size=128, epochs=10, random_state=20):
        """Run stratified k-fold cross-validation with a fresh model per fold.

        After the call ``self.model`` is the model trained on the last fold.

        NOTE(review): each fold's validation split is used both for early
        stopping and for the reported metrics, so the scores are likely
        optimistic.

        Args:
            X: Padded token sequences.
            y: Integer-valued class labels aligned with ``X``.
            folds: Number of stratified folds.
            batch_size: Mini-batch size.
            epochs: Maximum number of epochs per fold.
            random_state: Seed for the shuffled fold assignment.

        Returns:
            List with one ``classification_report`` dict per fold.
        """
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
        reports = []
        num_words = len(self.word_index) + 1

        for fold_no, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
            print(f"\nFold {fold_no}/{folds}")

            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            # Release the previous fold's model state before building a new
            # one so memory does not grow with every fold.
            tf.keras.backend.clear_session()
            self.build_model(num_words=num_words, trainable=True)

            early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=0)

            self.model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=epochs,
                batch_size=batch_size,
                callbacks=[early_stop],
                class_weight=self._balanced_class_weights(y_train),
                verbose=0,
            )

            # Convert predicted probabilities to class labels and report.
            y_pred = np.argmax(self.model.predict(X_val), axis=1)
            reports.append(classification_report(y_val, y_pred, output_dict=True, zero_division=0))
            print(classification_report(y_val, y_pred, zero_division=0))

        return reports


# Driver: load the data, cross-validate the classifier, then summarise the
# per-fold scores and plot a confusion matrix for the last fold.
N_FOLDS = 5
# cross_validate() splits with StratifiedKFold(shuffle=True, random_state=20);
# the same seed is needed below to recover the last fold's validation rows.
CV_SEED = 20

classifier = ArabicSentimentClassifier(max_len=100, embedding_dim=300)
texts, labels = classifier.load_data('dataset.csv')
X = classifier.preprocess()
y = np.array(labels)
num_words = classifier.load_fasttext('/content/cc.ar.300.vec')
reports = classifier.cross_validate(X, y, folds=N_FOLDS, batch_size=128, epochs=15)

# classification_report keys classes by str(label), i.e. '1.0' for float
# labels and '1' for integer labels; accept either spelling.
f1_scores = [
    next((r[key]['f1-score'] for key in ('1.0', '1') if key in r), 0)
    for r in reports
]
accuracies = [r['accuracy'] for r in reports]

# classifier.model is the model fitted on the last fold, so evaluate it on
# that fold's validation rows (regenerated from the same deterministic split).
last_fold_splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=CV_SEED)
*_, (_, val_idx) = last_fold_splitter.split(X, y)
X_val = X[val_idx]
y_val = y[val_idx].astype(int)

y_pred = classifier.model.predict(X_val)
y_pred = np.argmax(y_pred, axis=1)

print(f"\nMaximum F1-score (toxic): {np.max(f1_scores):.4f}")
print(f"\nMaximum Accuracy: {np.max(accuracies):.4f}")

# Confusion matrix: derive the tick labels from the classes that actually
# occur so the display never receives fewer names than matrix rows.
class_names = {0: "Non-Toxic", 1: "Toxic"}
present_classes = np.union1d(y_val, y_pred)
display_labels = [class_names.get(int(c), str(int(c))) for c in present_classes]

cm = confusion_matrix(y_val, y_pred, labels=present_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.grid(False)
plt.show()