In [2]:
from tqdm import tqdm
import gc

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Conv2D, BatchNormalization, MaxPool2D,
    Dense, Dropout, Flatten
)

from transformers import DistilBertTokenizer, DistilBertModel

# Set random seeds for reproducibility
random_state = 42
np.random.seed(random_state)
torch.manual_seed(random_state)
tf.random.set_seed(random_state)

In [3]:
# Check GPU availability
if tf.config.list_physical_devices('GPU'):
    print("GPU is available. Using GPU for computation.")
    tf.config.experimental.set_memory_growth(tf.config.list_physical_devices('GPU')[0], True)
else:
    print("GPU is not available. Using CPU for computation.")

GPU is not available. Using CPU for computation.


In [4]:
def load_and_preprocess_data(file_path):
    batch_size = 5000
    dataframe = pd.read_csv(file_path)

    texts = dataframe['content'].tolist()
    labels = dataframe['majority_voting'].tolist()
    original_labels = dataframe['label'].tolist()

    label_encoder = LabelEncoder()

    num_labels = len(labels)
    num_batches = (num_labels + batch_size - 1) // batch_size

    all_labels = []
    all_original_labels = []
    for i in tqdm(range(num_batches)):
        batch_labels = labels[i * batch_size:(i + 1) * batch_size]
        batch_original_labels = original_labels[i * batch_size:(i + 1) * batch_size]

        encoded_labels = label_encoder.fit_transform(batch_labels)
        encoded_original_labels = label_encoder.fit_transform(batch_original_labels)

        all_labels.append(encoded_labels)
        all_original_labels.append(encoded_original_labels)

        gc.collect()
        if tf.config.list_physical_devices('GPU'):
            tf.keras.backend.clear_session()

    return texts, np.concatenate(all_labels, axis=0), np.concatenate(all_original_labels, axis=0)

texts, labels, original_labels = load_and_preprocess_data('datasets/ISOT/merged/dataset_labeled.csv')

100%|██████████| 9/9 [00:01<00:00,  5.42it/s]


In [5]:
def extract_embeddings(texts, batch_size=200):
    tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english'
    )
    model = DistilBertModel.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english'
    )

    num_texts = len(texts)
    num_batches = (num_texts + batch_size - 1) // batch_size

    all_embeddings = []

    for i in tqdm(range(num_batches)):
        batch_texts = texts[i * batch_size:(i + 1) * batch_size]

        encodings = tokenizer(
            batch_texts,
            truncation=True,
            padding=True,
            return_tensors='pt'
        )

        with torch.no_grad():
            outputs = model(
                input_ids=encodings['input_ids'],
                attention_mask=encodings['attention_mask']
            )
            batch_embeddings = outputs.last_hidden_state.numpy()

        all_embeddings.append(batch_embeddings)

        gc.collect()
        if tf.config.list_physical_devices('GPU'):
            tf.keras.backend.clear_session()

    return np.concatenate(all_embeddings, axis=0)

embeddings = extract_embeddings(texts)
print("Embedding shape:", embeddings.shape)

  2%|▏         | 4/225 [09:25<8:41:10, 141.50s/it]


KeyboardInterrupt: 

In [58]:
def create_cnn_model(input_shape, num_classes):
    model = Sequential()

    # First Convolutional Block
    model.add(Conv2D(256, (3, 3), strides=1, padding="same", activation="relu",
                     input_shape=input_shape))
    model.add(BatchNormalization())

    # Second Convolutional Block
    model.add(Conv2D(256, (3, 3), strides=1, padding="same", activation="relu"))
    model.add(BatchNormalization())
    model.add(MaxPool2D((2, 2), strides=2, padding="valid"))

    # Third Convolutional Block
    model.add(Conv2D(128, (3, 3), strides=1, padding="same", activation="relu"))
    model.add(BatchNormalization())
    model.add(MaxPool2D((2, 2), strides=2, padding="valid"))

    # Fourth Convolutional Block
    model.add(Conv2D(128, (3, 3), strides=1, padding="same", activation="relu"))
    model.add(BatchNormalization())
    model.add(MaxPool2D((2, 2), strides=2, padding="valid"))

    # Dense Layers
    model.add(Dense(units=256, activation="relu"))
    model.add(Dropout(0.3))
    model.add(BatchNormalization())
    model.add(Dense(units=128, activation="relu"))
    model.add(BatchNormalization())
    model.add(Flatten())
    model.add(Dense(units=num_classes, activation="softmax"))

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

In [59]:
def cross_validate(X, y, num_classes, original_labels, k_folds=5):
    skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=random_state)

    fold_metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1_score': []
    }

    detailed_results = []

    # Reshape embeddings for 2D convolution
    X_2d = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_2d, y), 1):
        print(f"\nFold {fold}")

        X_train, X_val = X_2d[train_idx], X_2d[val_idx]
        y_train, y_val = y[train_idx], original_labels[val_idx]

        model = create_cnn_model(
            input_shape=(X_train.shape[1], X_train.shape[2], 1),
            num_classes=num_classes
        )

        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=10,
            batch_size=32,
            verbose=1
        )

        y_pred = model.predict(X_val)
        y_pred_classes = np.argmax(y_pred, axis=1)

        accuracy = accuracy_score(y_val, y_pred_classes)
        precision = precision_score(y_val, y_pred_classes, average='weighted')
        recall = recall_score(y_val, y_pred_classes, average='weighted')
        f1 = f1_score(y_val, y_pred_classes, average='weighted')

        fold_metrics['accuracy'].append(accuracy)
        fold_metrics['precision'].append(precision)
        fold_metrics['recall'].append(recall)
        fold_metrics['f1_score'].append(f1)

        detailed_results.append({
            'fold': fold,
            'y_true': y_val,
            'y_pred': y_pred_classes,
            'metrics': {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1_score': f1
            }
        })

        print(classification_report(y_val, y_pred_classes))

    print("\nCross-Validation Summary:")
    for metric, values in fold_metrics.items():
        print(f"{metric.capitalize()} - Mean: {np.mean(values):.4f}, Std: {np.std(values):.4f}")

    return fold_metrics, detailed_results

num_classes = len(np.unique(labels))
metrics, results = cross_validate(embeddings, labels, num_classes, original_labels)


Fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


ResourceExhaustedError: Graph execution error:

Detected at node gradient_tape/sequential_1/batch_normalization_2_1/moments/BroadcastTo defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\tornado\platform\asyncio.py", line 205, in start

  File "C:\Python311\Lib\asyncio\base_events.py", line 604, in run_forever

  File "C:\Python311\Lib\asyncio\base_events.py", line 1909, in _run_once

  File "C:\Python311\Lib\asyncio\events.py", line 80, in _run

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\kernelbase.py", line 545, in dispatch_queue

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\kernelbase.py", line 534, in process_one

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\kernelbase.py", line 437, in dispatch_shell

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\ipkernel.py", line 362, in execute_request

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\kernelbase.py", line 778, in execute_request

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\ipkernel.py", line 449, in do_execute

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\ipykernel\zmqshell.py", line 549, in run_cell

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3075, in run_cell

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3130, in _run_cell

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\IPython\core\async_helpers.py", line 128, in _pseudo_sync_runner

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3334, in run_cell_async

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3517, in run_ast_nodes

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code

  File "C:\Users\pc cam\AppData\Local\Temp\ipykernel_21996\2384889723.py", line 80, in <module>

  File "C:\Users\pc cam\AppData\Local\Temp\ipykernel_21996\2384889723.py", line 32, in cross_validate

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\keras\src\utils\traceback_utils.py", line 117, in error_handler

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 320, in fit

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 121, in one_step_on_iterator

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 108, in one_step_on_data

  File "C:\Users\pc cam\Desktop\FND\paper PFE 2023\UFNAC\ufnac_venv\Lib\site-packages\keras\src\backend\tensorflow\trainer.py", line 70, in train_step

OOM when allocating tensor with shape[8,256,384,128] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node gradient_tape/sequential_1/batch_normalization_2_1/moments/BroadcastTo}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_one_step_on_iterator_5456]

In [None]:
def plot_cross_validation_results(metrics):
    plt.figure(figsize=(12, 6))

    for i, (metric, values) in enumerate(metrics.items(), 1):
        plt.subplot(2, 2, i)
        plt.bar(range(1, len(values) + 1), values)
        plt.title(f"{metric.capitalize()} per Fold")
        plt.xlabel("Fold")
        plt.ylabel(metric.capitalize())

    plt.tight_layout()
    plt.show()

plot_cross_validation_results(metrics)