In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

# Step 1: Load the dataset
filename = 'all_facebook_and_twitter_dataset.xlsx'
df = pd.read_excel(filename)

# Step 2: Preprocess the dataset
comments = df['Comments'].astype(str)  # Convert comments to string type
labels = df['M-Class']

# Convert labels to integer using LabelEncoder
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Tokenize the comments and convert them to integer sequences
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(comments)
comments_seq = tokenizer.texts_to_sequences(comments)

# Set maximum sequence length
max_length = max(len(seq) for seq in comments_seq)

# Pad the sequences to a consistent length
comments_padded = pad_sequences(comments_seq, maxlen=max_length, padding='post', truncating='post')

# Define the function to create the model
def create_model():
    embedding_dim = 100
    rnn_units = 64
    dropout_rate = 0.5
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        SimpleRNN(rnn_units, return_sequences=False),
        Dropout(dropout_rate),
        Dense(3, activation='softmax')
    ])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Define the function to train and evaluate the model using k-fold cross-validation
def k_fold_cross_validation(k):
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_no = 1
    results = {'accuracy': [], 'precision': [], 'recall': [], 'f1_score': []}

    for train_index, test_index in kfold.split(comments_padded):
        X_train, X_test = comments_padded[train_index], comments_padded[test_index]
        y_train, y_test = labels_encoded[train_index], labels_encoded[test_index]

        model = create_model()

        batch_size = 32
        epochs = 10
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[early_stopping], verbose=2)

        # Evaluate the model
        y_pred_probs = model.predict(X_test)
        y_pred_classes = np.argmax(y_pred_probs, axis=1)

        # Calculate classification report
        report = classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_, output_dict=True, zero_division=1)

        # Extract individual metrics
        results['accuracy'].append(report['accuracy'])
        results['precision'].append(report['macro avg']['precision'])
        results['recall'].append(report['macro avg']['recall'])
        results['f1_score'].append(report['macro avg']['f1-score'])

        print(f"Results for fold {fold_no}")
        print(f"Accuracy: {report['accuracy']:.4f}")
        print(f"Precision: {report['macro avg']['precision']:.4f}")
        print(f"Recall: {report['macro avg']['recall']:.4f}")
        print(f"F1-Score: {report['macro avg']['f1-score']:.4f}")
        print("\n")
        fold_no += 1

    # Average the results
    avg_results = {metric: np.mean(scores) for metric, scores in results.items()}
    return avg_results

# Number of folds for k-fold cross-validation
k = 10

# Evaluate the model using k-fold cross-validation
avg_results = k_fold_cross_validation(k)

print(f"Average results after {k}-fold cross-validation:")
print(f"Accuracy: {avg_results['accuracy']:.4f}")
print(f"Precision: {avg_results['precision']:.4f}")
print(f"Recall: {avg_results['recall']:.4f}")
print(f"F1-Score: {avg_results['f1_score']:.4f}")


Epoch 1/10
302/302 - 19s - 62ms/step - accuracy: 0.4482 - loss: 0.9702 - val_accuracy: 0.4120 - val_loss: 0.9917
Epoch 2/10
302/302 - 22s - 73ms/step - accuracy: 0.4417 - loss: 1.0199 - val_accuracy: 0.4975 - val_loss: 0.9707
Epoch 3/10
302/302 - 20s - 65ms/step - accuracy: 0.4457 - loss: 0.9760 - val_accuracy: 0.4378 - val_loss: 0.9704
Epoch 4/10
302/302 - 14s - 48ms/step - accuracy: 0.4453 - loss: 0.9667 - val_accuracy: 0.4892 - val_loss: 0.9605
Epoch 5/10
302/302 - 14s - 47ms/step - accuracy: 0.4473 - loss: 0.9555 - val_accuracy: 0.4357 - val_loss: 0.9698
Epoch 6/10
302/302 - 14s - 45ms/step - accuracy: 0.4632 - loss: 0.9500 - val_accuracy: 0.5573 - val_loss: 0.9632
Epoch 7/10
302/302 - 15s - 48ms/step - accuracy: 0.4556 - loss: 0.9492 - val_accuracy: 0.3842 - val_loss: 0.9717
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
Results for fold 1
Accuracy: 0.4772
Precision: 0.6497
Recall: 0.3473
F1-Score: 0.3310


Epoch 1/10
302/302 - 17s - 58ms/step - accuracy

Epoch 4/10
302/302 - 21s - 68ms/step - accuracy: 0.4576 - loss: 0.9651 - val_accuracy: 0.3820 - val_loss: 0.9783
Epoch 5/10
302/302 - 14s - 46ms/step - accuracy: 0.4552 - loss: 0.9498 - val_accuracy: 0.5168 - val_loss: 0.9685
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
Results for fold 10
Accuracy: 0.4746
Precision: 0.6560
Recall: 0.3454
F1-Score: 0.2883


Average results after 10-fold cross-validation:
Accuracy: 0.4763
Precision: 0.7069
Recall: 0.3381
F1-Score: 0.2833
