In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"Device: {torch.cuda.get_device_name(0)}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando el dispositivo: {device}")

PyTorch version: 2.5.1+cu121
CUDA available: True
Device: NVIDIA RTX 2000 Ada Generation Laptop GPU
Usando el dispositivo: cuda


In [3]:
# Asegúrate de tener el archivo phishing_dataset.csv en el directorio actual
df = pd.read_csv("../data/Phishing_Email.csv")  # Reemplaza con la ruta de tu dataset
print("Dataset cargado con éxito.")
df.head()

Dataset cargado con éxito.


Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [4]:
import pandas as pd

class DataPreprocessor:
    def __init__(self, file_path, columns_to_keep=None):
        """
        Inicializa el preprocesador con la ruta al archivo y las columnas a conservar.
        :param file_path: Ruta al archivo CSV.
        :param columns_to_keep: Lista de columnas importantes a conservar.
        """
        self.file_path = file_path
        self.columns_to_keep = columns_to_keep
        self.df = None

    def load_dataset(self):
        """
        Carga el dataset desde el archivo CSV.
        """
        self.df = pd.read_csv(self.file_path)
        print("Dataset cargado con éxito.")

    def keep_columns(self):
        """
        Conserva solo las columnas especificadas en `columns_to_keep`.
        """
        if self.columns_to_keep:
            self.df = self.df[self.columns_to_keep]
            print(f"Columnas conservadas: {list(self.df.columns)}")
        else:
            print("No se especificaron columnas para conservar.")

    def handle_missing_values(self):
        """
        Maneja los valores nulos reemplazándolos con cadenas vacías.
        """
        self.df["Email Text"] = self.df["Email Text"].fillna("")
        print(f"Valores nulos en 'Email Text': {self.df['Email Text'].isnull().sum()}")

    def convert_to_strings(self):
        """
        Convierte todos los valores de la columna `Email Text` a cadenas de texto.
        """
        self.df["Email Text"] = self.df["Email Text"].astype(str)
        print("Todos los valores de 'Email Text' convertidos a cadenas.")

    def remove_duplicates(self):
        """
        Elimina registros duplicados en el dataset.
        """
        initial_count = len(self.df)
        self.df = self.df.drop_duplicates()
        final_count = len(self.df)
        print(f"Registros duplicados eliminados: {initial_count - final_count}")

    def filter_short_texts(self, min_length=5):
        """
        Filtra textos demasiado cortos en la columna `Email Text`.
        :param min_length: Longitud mínima para conservar un texto.
        """
        initial_count = len(self.df)
        self.df = self.df[self.df["Email Text"].apply(len) > min_length]
        final_count = len(self.df)
        print(f"Registros eliminados por textos cortos: {initial_count - final_count}")

    def validate_labels(self):
        """
        Verifica y muestra la distribución de las etiquetas en `Email Type`.
        """
        label_counts = self.df["Email Type"].value_counts()
        print("Distribución de etiquetas:")
        print(label_counts)

    def preprocess(self):
        """
        Ejecuta todos los pasos de preprocesamiento en secuencia.
        """
        self.load_dataset()
        self.keep_columns()
        self.handle_missing_values()
        self.convert_to_strings()
        self.remove_duplicates()
        self.filter_short_texts()
        self.validate_labels()
        print("Preprocesamiento completado.")

    def get_dataset(self):
        """
        Devuelve el dataset preprocesado.
        :return: DataFrame preprocesado.
        """
        return self.df

In [5]:
# Uso del preprocesador

file_path = "../data/Phishing_Email.csv"
columns_to_keep = ["Email Text", "Email Type"]

preprocessor = DataPreprocessor(file_path, columns_to_keep)
preprocessor.preprocess()

# Obtener el dataset preprocesado
cleaned_dataset = preprocessor.get_dataset()
print("Dataset final listo para su uso.")
print(cleaned_dataset.head())

Dataset cargado con éxito.
Columnas conservadas: ['Email Text', 'Email Type']
Valores nulos en 'Email Text': 0
Todos los valores de 'Email Text' convertidos a cadenas.
Registros duplicados eliminados: 1111
Registros eliminados por textos cortos: 5
Distribución de etiquetas:
Email Type
Safe Email        10979
Phishing Email     6555
Name: count, dtype: int64
Preprocesamiento completado.
Dataset final listo para su uso.
                                          Email Text      Email Type
0  re : 6 . 1100 , disc : uniformitarianism , re ...      Safe Email
1  the other side of * galicismos * * galicismo *...      Safe Email
2  re : equistar deal tickets are you still avail...      Safe Email
3  \nHello I am your hot lil horny toy.\n    I am...  Phishing Email
4  software at incredibly low prices ( 86 % lower...  Phishing Email


In [6]:
# Guardar el dataset preprocesado

cleaned_dataset.to_csv("../data/cleaned_phishing_email.csv", index=False)
print("Dataset guardado en cleaned_phishing_email.csv.")

Dataset guardado en cleaned_phishing_email.csv.


In [7]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import PreTrainedTokenizerBase

class DatasetProcessor:
    def __init__(self, dataframe, tokenizer, text_column="Email Text", label_column="Email Type", max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.label_column = label_column
        self.max_length = max_length
        self.dataset_dict = None

    def split_dataset(self, test_size=0.2, validation_size=0.1, random_state=42):
        train, test = train_test_split(self.dataframe, test_size=test_size, random_state=random_state)
        train, validation = train_test_split(train, test_size=validation_size, random_state=random_state)

        self.dataset_dict = DatasetDict({
            "train": Dataset.from_pandas(train),
            "validation": Dataset.from_pandas(validation),
            "test": Dataset.from_pandas(test),
        })
        print("Dataset dividido en train, validation y test.")

    def tokenize(self):
        if not self.dataset_dict:
            raise ValueError("Primero debes dividir el dataset usando `split_dataset`.")

        def tokenize_function(examples):
            tokens = self.tokenizer(examples[self.text_column], padding="max_length", truncation=True, max_length=self.max_length)
            tokens["labels"] = [1 if label == "Phishing Email" else 0 for label in examples[self.label_column]]
            return tokens

        self.dataset_dict = self.dataset_dict.map(tokenize_function, batched=True)
        print("Tokenización completada.")

    def get_datasets(self):
        if not self.dataset_dict:
            raise ValueError("Primero debes dividir y tokenizar el dataset.")
        return self.dataset_dict

In [8]:
# Uso de DatasetProcessor

from transformers import BertTokenizer

# Ruta del CSV preprocesado
preprocessed_file_path = "../data/cleaned_phishing_email.csv"

# Carga y preparación del dataset preprocesado
preprocessor = DataPreprocessor(preprocessed_file_path)
preprocessor.preprocess()
cleaned_dataset = preprocessor.get_dataset()

# Inicializar tokenizador
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Procesamiento y tokenización
processor = DatasetProcessor(cleaned_dataset, tokenizer)
processor.split_dataset()
processor.tokenize()

# Obtener datasets finales
tokenized_datasets = processor.get_datasets()
print(tokenized_datasets)

Dataset cargado con éxito.
No se especificaron columnas para conservar.
Valores nulos en 'Email Text': 0
Todos los valores de 'Email Text' convertidos a cadenas.
Registros duplicados eliminados: 0
Registros eliminados por textos cortos: 0
Distribución de etiquetas:
Email Type
Safe Email        10979
Phishing Email     6555
Name: count, dtype: int64
Preprocesamiento completado.
Dataset dividido en train, validation y test.


Map: 100%|██████████| 12624/12624 [01:22<00:00, 152.65 examples/s]
Map: 100%|██████████| 1403/1403 [00:04<00:00, 288.78 examples/s]
Map: 100%|██████████| 3507/3507 [00:14<00:00, 246.95 examples/s]

Tokenización completada.
DatasetDict({
    train: Dataset({
        features: ['Email Text', 'Email Type', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 12624
    })
    validation: Dataset({
        features: ['Email Text', 'Email Type', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1403
    })
    test: Dataset({
        features: ['Email Text', 'Email Type', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3507
    })
})





In [9]:
print(tokenized_datasets["train"].features)

{'Email Text': Value(dtype='string', id=None), 'Email Type': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Value(dtype='int64', id=None)}


In [10]:
# Guardar los datasets tokenizados

tokenized_datasets.save_to_disk("../data/tokenized_phishing_email")
print("Datasets tokenizados guardados en tokenized_phishing_email.")

Saving the dataset (1/1 shards): 100%|██████████| 12624/12624 [00:00<00:00, 276398.82 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1403/1403 [00:00<00:00, 182389.30 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3507/3507 [00:00<00:00, 226535.82 examples/s]

Datasets tokenizados guardados en tokenized_phishing_email.





In [11]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

class ModelTrainer:
    def __init__(self, tokenized_datasets, tokenizer, model_name="bert-base-uncased", num_labels=2):
        """
        Inicializa el entrenador del modelo.
        :param tokenized_datasets: DatasetDict tokenizado.
        :param tokenizer: Tokenizador utilizado.
        :param model_name: Nombre del modelo preentrenado.
        :param num_labels: Número de etiquetas para la clasificación.
        """
        self.tokenized_datasets = tokenized_datasets
        self.tokenizer = tokenizer
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def compute_metrics(self, eval_pred):
        """
        Calcula métricas de evaluación (accuracy, precision, recall, F1).
        """
        logits, labels = eval_pred
        predictions = logits.argmax(axis=-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
        acc = accuracy_score(labels, predictions)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    def train(self, output_dir="./results", num_train_epochs=3, learning_rate=2e-5, batch_size=16):
        """
        Configura y entrena el modelo.
        """
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size * 2,
            num_train_epochs=num_train_epochs,
            weight_decay=0.01,
            logging_dir="./logs",
            load_best_model_at_end=True,
            save_total_limit=2
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_datasets["train"],
            eval_dataset=self.tokenized_datasets["validation"],
            compute_metrics=self.compute_metrics,
        )

        print("Iniciando el entrenamiento...")
        self.trainer.train()
        print("Entrenamiento completado.")

    def evaluate(self):
        """
        Evalúa el modelo en el conjunto de prueba.
        """
        if not hasattr(self, "trainer"):
            raise ValueError("El modelo no ha sido entrenado aún. Ejecuta `train()` primero.")
        print("Evaluando el modelo...")
        results = self.trainer.evaluate(self.tokenized_datasets["test"])
        print("Resultados de evaluación:", results)
        return results

    def save_model(self, output_dir="./trained_model"):
        """
        Guarda el modelo entrenado.
        """
        print(f"Guardando el modelo en {output_dir}...")
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)
        print("Modelo guardado con éxito.")

In [12]:
trainer = ModelTrainer(tokenized_datasets, tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
trainer.train()



Iniciando el entrenamiento...


 21%|██        | 500/2367 [02:03<07:45,  4.01it/s]

{'loss': 0.1492, 'grad_norm': 15.595269203186035, 'learning_rate': 1.5775242923531897e-05, 'epoch': 0.63}


                                                  
 33%|███▎      | 789/2367 [03:21<06:37,  3.97it/s]

{'eval_loss': 0.08500158786773682, 'eval_accuracy': 0.9800427655024947, 'eval_precision': 0.9906191369606003, 'eval_recall': 0.9582577132486388, 'eval_f1': 0.974169741697417, 'eval_runtime': 6.6329, 'eval_samples_per_second': 211.522, 'eval_steps_per_second': 6.634, 'epoch': 1.0}


 42%|████▏     | 1001/2367 [04:16<05:43,  3.98it/s] 

{'loss': 0.0572, 'grad_norm': 0.030597150325775146, 'learning_rate': 1.1550485847063794e-05, 'epoch': 1.27}


 63%|██████▎   | 1500/2367 [06:23<03:48,  3.80it/s]

{'loss': 0.0256, 'grad_norm': 33.70209884643555, 'learning_rate': 7.325728770595691e-06, 'epoch': 1.9}


                                                   
 67%|██████▋   | 1578/2367 [06:50<03:26,  3.82it/s]

{'eval_loss': 0.05917755886912346, 'eval_accuracy': 0.9900213827512473, 'eval_precision': 0.9872958257713249, 'eval_recall': 0.9872958257713249, 'eval_f1': 0.9872958257713249, 'eval_runtime': 6.8243, 'eval_samples_per_second': 205.589, 'eval_steps_per_second': 6.448, 'epoch': 2.0}


 84%|████████▍ | 2000/2367 [08:42<01:33,  3.91it/s]

{'loss': 0.0129, 'grad_norm': 0.008272156119346619, 'learning_rate': 3.1009716941275882e-06, 'epoch': 2.53}


                                                   
100%|██████████| 2367/2367 [10:27<00:00,  3.81it/s]

{'eval_loss': 0.07192473113536835, 'eval_accuracy': 0.9878831076265147, 'eval_precision': 0.989010989010989, 'eval_recall': 0.9800362976406534, 'eval_f1': 0.9845031905195989, 'eval_runtime': 6.8182, 'eval_samples_per_second': 205.773, 'eval_steps_per_second': 6.453, 'epoch': 3.0}


100%|██████████| 2367/2367 [10:29<00:00,  3.76it/s]

{'train_runtime': 629.8208, 'train_samples_per_second': 60.131, 'train_steps_per_second': 3.758, 'train_loss': 0.053230527890700555, 'epoch': 3.0}
Entrenamiento completado.





In [14]:
evaluation_results = trainer.evaluate()

Evaluando el modelo...


100%|██████████| 110/110 [00:16<00:00,  6.70it/s]

Resultados de evaluación: {'eval_loss': 0.06474976986646652, 'eval_accuracy': 0.9860279441117764, 'eval_precision': 0.9801375095492743, 'eval_recall': 0.9823889739663093, 'eval_f1': 0.9812619502868068, 'eval_runtime': 16.8384, 'eval_samples_per_second': 208.274, 'eval_steps_per_second': 6.533, 'epoch': 3.0}





In [15]:
trainer.save_model()

Guardando el modelo en ./trained_model...
Modelo guardado con éxito.


In [16]:
class PhishingPredictor:
    def __init__(self, model_path="./trained_model", tokenizer_path="./trained_model", max_length=128):
        """
        Inicializa el predictor con el modelo y tokenizador guardados.
        :param model_path: Ruta al modelo guardado
        :param tokenizer_path: Ruta al tokenizador guardado
        :param max_length: Longitud máxima de la secuencia
        """
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()

    def preprocess_text(self, text):
        """
        Preprocesa el texto para la predicción.
        :param text: Texto del email a analizar
        :return: Tensores de entrada procesados
        """
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {k: v.to(self.device) for k, v in inputs.items()}

    def predict(self, text):
        """
        Realiza la predicción para un texto dado.
        :param text: Texto del email a analizar
        :return: Diccionario con la predicción y la probabilidad
        """
        inputs = self.preprocess_text(text)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            prediction = torch.argmax(probabilities, dim=-1).item()
            confidence = probabilities[0][prediction].item()

        return {
            "is_phishing": bool(prediction),
            "confidence": confidence,
            "prediction": "Phishing Email" if prediction else "Safe Email",
            "probability_phishing": probabilities[0][1].item(),
            "probability_safe": probabilities[0][0].item()
        }

    def predict_batch(self, texts):
        """
        Realiza predicciones para una lista de textos.
        :param texts: Lista de textos de emails
        :return: Lista de predicciones
        """
        return [self.predict(text) for text in texts]

In [17]:
import os

# Obtener la ruta absoluta al directorio del modelo que está en notebooks/trained_model
current_dir = os.path.dirname(os.path.abspath("__file__"))
model_path = os.path.join(current_dir, "trained_model")

print(f"Ruta del modelo: {model_path}")

# Crear instancia del predictor con la ruta correcta
predictor = PhishingPredictor(
    model_path=model_path,
    tokenizer_path=model_path
)

# Ejemplos de emails para probar
test_emails = [
    """Dear valued customer, Your account has been suspended. 
    Click here immediately to verify your identity: http://suspicious-link.com""",
    
    """Hi team, Here's the agenda for tomorrow's meeting:
    1. Project updates
    2. Budget review
    3. Q&A session
    Best regards, John""",
    
    """URGENT: You've won $1,000,000! Send your bank details 
    to claim your prize now!!!"""
]

# Realizar predicciones
print("Analizando emails de prueba...\n")
for i, email in enumerate(test_emails, 1):
    result = predictor.predict(email)
    print(f"Email #{i}:")
    print(f"Texto: {email[:100]}...")
    print(f"Predicción: {result['prediction']}")
    print(f"Confianza: {result['confidence']*100:.2f}%")
    print(f"Probabilidad de phishing: {result['probability_phishing']*100:.2f}%")
    print(f"Probabilidad de seguro: {result['probability_safe']*100:.2f}%")
    print("-" * 80 + "\n")

Ruta del modelo: c:\Users\infan\OneDrive\Desktop\AIR\AIntelligence\AIProjects\PhishingMailDetection\notebooks\trained_model
Analizando emails de prueba...

Email #1:
Texto: Dear valued customer, Your account has been suspended. 
    Click here immediately to verify your id...
Predicción: Phishing Email
Confianza: 99.91%
Probabilidad de phishing: 99.91%
Probabilidad de seguro: 0.09%
--------------------------------------------------------------------------------

Email #2:
Texto: Hi team, Here's the agenda for tomorrow's meeting:
    1. Project updates
    2. Budget review
    3...
Predicción: Safe Email
Confianza: 99.98%
Probabilidad de phishing: 0.02%
Probabilidad de seguro: 99.98%
--------------------------------------------------------------------------------

Email #3:
Texto: URGENT: You've won $1,000,000! Send your bank details 
    to claim your prize now!!!...
Predicción: Phishing Email
Confianza: 99.95%
Probabilidad de phishing: 99.95%
Probabilidad de seguro: 0.06%
------------