# Clasificación usando TF-IDF con algoritmos de clasificación

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

# Leer el archivo Parquet en un DataFrame
df = pd.read_parquet('./eda/mapped_&_usecases_df.parquet')

In [3]:
# 3. Dividir los datos en conjunto de entrenamiento y prueba
# Establecer una semilla para la reproducibilidad
seed = 42
np.random.seed(seed)

# Dividir el DataFrame en dos partes iguales
df1, df2 = train_test_split(df, test_size=0.3, random_state=42)

# Verificar las dimensiones de las divisiones
print(f"Size of df1: {df1.shape}")
print(f"Size of df2: {df2.shape}")

print(df2.columns)

# Definir el tamaño de las particiones
train_size = 0.7  # Porcentaje para el conjunto de entrenamiento
val_size = 0.15   # Porcentaje para el conjunto de validación
test_size = 0.15  # Porcentaje para el conjunto de prueba

# Dividir el DataFrame en entrenamiento + validación y prueba
train_val_df, test_df = train_test_split(df2, test_size=test_size + val_size, random_state=42)

# Dividir el DataFrame de entrenamiento + validación en entrenamiento y validación
val_size_adjusted = val_size / (val_size + test_size)  # Ajuste para la proporción en el conjunto de entrenamiento/validación
train_df, val_df = train_test_split(train_val_df, test_size=val_size_adjusted, random_state=42)
               
# Verificar tamaños de los conjuntos
print(f"Tamaño del conjunto de entrenamiento: {train_df.shape}")
print(f"Tamaño del conjunto de validación: {val_df.shape}")
print(f"Tamaño del conjunto de prueba: {test_df.shape}")

# Definir el número de muestras que deseas seleccionar
num_train_samples = 12000
num_val_samples = 5000
num_test_samples = 500

# Muestrear X muestras directamente desde los DataFrames
train_df_sampled = train_df.sample(n=num_train_samples, replace=False, random_state=42)
val_df_sampled = val_df.sample(n=num_val_samples, replace=False, random_state=42)
test_df_sampled = test_df.sample(n=num_test_samples, replace=False, random_state=42)

# Verificar tamaños de los conjuntos
print(f"Tamaño del conjunto de entrenamiento sampleado: {train_df_sampled.shape}")
print(f"Tamaño del conjunto de validación sampleado: {val_df_sampled.shape}")
print(f"Tamaño del conjunto de prueba sampleado: {test_df_sampled.shape}")

Size of df1: (81096, 7)
Size of df2: (34756, 7)
Index(['source_code', 'slither_text', 'slither_label', 'use_cases',
       'vulnerability_mapping', 'vulnerability_keys', 'vulnerable'],
      dtype='object')
Tamaño del conjunto de entrenamiento: (12164, 7)
Tamaño del conjunto de validación: (12165, 7)
Tamaño del conjunto de prueba: (10427, 7)
Tamaño del conjunto de entrenamiento sampleado: (12000, 7)
Tamaño del conjunto de validación sampleado: (5000, 7)
Tamaño del conjunto de prueba sampleado: (500, 7)


In [6]:
# 1. Tokenización del código
def tokenize_code(code):
    # Expresión regular mejorada para capturar tokens específicos de Solidity
    tokens = re.findall(r'\b\w+\b|[{}()\[\];,=+\-*/<>!&|%^~]', code)
    return ' '.join(tokens)

# Aplicar tokenización
train_df_sampled['tokenized_code'] = train_df_sampled['source_code'].apply(tokenize_code)
test_df_sampled['tokenized_code'] = test_df_sampled['source_code'].apply(tokenize_code)


# 2. Separar características (X) y etiquetas (y)
X_train_sampled = train_df_sampled['tokenized_code']
X_test_sampled = test_df_sampled['tokenized_code']

y_train_sampled = train_df_sampled['vulnerable']
y_test_sampled = test_df_sampled['vulnerable']


In [7]:
# 4. Transformación TF-IDF
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train_sampled)
X_test_tfidf = tfidf.transform(X_test_sampled)

In [9]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
import os

# Crear el directorio de salida si no existe
output_dir = './outputs/tfidf/'
os.makedirs(output_dir, exist_ok=True)

# 1. Definición de los modelos
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# 2. Entrenamiento, predicción y evaluación de cada modelo
results = []
confusion_matrices = []
classification_reports = []

for model_name, model in models.items():
    print(f"Training model: {model_name}")
    
    # Entrenamiento
    model.fit(X_train_tfidf, y_train_sample)
    
    print(f"Infering model: {model_name}")
    # Predicción
    y_pred = model.predict(X_test_tfidf)
    
    # Evaluación de métricas
    accuracy = accuracy_score(y_test_sample, y_pred)
    precision = precision_score(y_test_sample, y_pred, average='weighted')
    recall = recall_score(y_test_sample, y_pred, average='weighted')
    f1 = f1_score(y_test_sample, y_pred, average='weighted')
    
    # Agregar resultados al DataFrame
    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })

    # Generar y guardar la matriz de confusión
    cm = confusion_matrix(y_test_sample, y_pred)
    confusion_matrices.append({"Model": model_name, "Confusion Matrix": cm})
    
    # Generar y almacenar el classification report
    report = classification_report(y_test_sample, y_pred)
    classification_reports.append({"Model": model_name, "Classification Report": report})

# Convertir resultados a un DataFrame de pandas
results_df = pd.DataFrame(results)

# 3. Guardar resultados en un archivo CSV
results_df.to_csv(os.path.join(output_dir, 'model_comparison_results_1000.csv'), index=False)

# Guardar la matriz de confusión como un archivo CSV
confusion_matrices_df = pd.DataFrame(
    [(item['Model'], item['Confusion Matrix'].tolist()) for item in confusion_matrices],
    columns=['Model', 'Confusion Matrix']
)
confusion_matrices_df.to_csv(os.path.join(output_dir, 'confusion_matrices_1000.csv'), index=False)

# Guardar el classification report en un archivo de texto
with open(os.path.join(output_dir, 'classification_reports_1000.txt'), 'w') as file:
    for report in classification_reports:
        file.write(f"Model: {report['Model']}\n")
        file.write(f"{report['Classification Report']}\n")
        file.write("="*80 + "\n")

# 4. Mostrar resultados
print(results_df)


Training model: Logistic Regression
Infering model: Logistic Regression
Training model: Random Forest
Infering model: Random Forest
Training model: Support Vector Machine
Infering model: Support Vector Machine
Training model: Gradient Boosting
Infering model: Gradient Boosting
                    Model  Accuracy  Precision  Recall  F1-Score
0     Logistic Regression     0.832   0.826363   0.832  0.825193
1           Random Forest     0.869   0.866216   0.869  0.864983
2  Support Vector Machine     0.841   0.836299   0.841  0.836582
3       Gradient Boosting     0.857   0.853349   0.857  0.852615


In [8]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import os

# Crear el directorio de salida si no existe
output_dir = './outputs/detection/tfidf/'
os.makedirs(output_dir, exist_ok=True)

# 1. Definición de los modelos y sus hiperparámetros para GridSearchCV
model_params = {
    "Logistic Regression": {
        "model": LogisticRegression(random_state=42),
        "params": {
            "penalty": ['l1', 'l2', 'elasticnet', 'none'],
            "C": [0.01, 0.1, 1, 10],
            "solver": ['lbfgs', 'liblinear', 'saga']
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20, 30],
            "min_samples_split": [2, 5, 10]
        }
    },
    "Support Vector Machine": {
        "model": SVC(random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "kernel": ['linear', 'rbf', 'poly'],
            "gamma": ['scale', 'auto']
        }
    },
    "Gradient Boosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.01, 0.1, 1]
        }
    }
}

# 2. Grid Search CV para cada modelo
results = []
confusion_matrices = []
classification_reports = []

for model_name, mp in model_params.items():
    model = mp['model']
    params = mp['params']
    
    print(f"Starting Grid Search for: {model_name}")
    
    # Definición del GridSearchCV
    grid_search = GridSearchCV(model, params, cv=5, scoring='f1_weighted', verbose=1, n_jobs=-1)
    
    # Entrenamiento de GridSearchCV
    grid_search.fit(X_train_tfidf, y_train_sampled)
    
    # Mejor combinación de hiperparámetros
    best_model = grid_search.best_estimator_
    print(f"Best params for {model_name}: {grid_search.best_params_}")
    
    print(f"Infering with best model: {model_name}")
    # Predicción
    y_pred = best_model.predict(X_test_tfidf)
    
    # Evaluación de métricas
    accuracy = accuracy_score(y_test_sampled, y_pred)
    precision = precision_score(y_test_sampled, y_pred, average='weighted')
    recall = recall_score(y_test_sampled, y_pred, average='weighted')
    f1 = f1_score(y_test_sampled, y_pred, average='weighted')
    
    # Agregar resultados al DataFrame
    results.append({
        "Model": model_name,
        "Best Parameters": grid_search.best_params_,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    })

    # Generar y guardar la matriz de confusión
    cm = confusion_matrix(y_test_sampled, y_pred)
    confusion_matrices.append({"Model": model_name, "Confusion Matrix": cm})
    
    # Generar y almacenar el classification report
    report = classification_report(y_test_sampled, y_pred)
    classification_reports.append({"Model": model_name, "Classification Report": report})

# Convertir resultados a un DataFrame de pandas
results_df = pd.DataFrame(results)

# 3. Guardar resultados en un archivo CSV
results_df.to_csv(os.path.join(output_dir, 'model_comparison_results_12000_CV.csv'), index=False)

# Guardar la matriz de confusión como un archivo CSV
confusion_matrices_df = pd.DataFrame(
    [(item['Model'], item['Confusion Matrix'].tolist()) for item in confusion_matrices],
    columns=['Model', 'Confusion Matrix']
)
confusion_matrices_df.to_csv(os.path.join(output_dir, 'confusion_matrices_12000_CV.csv'), index=False)

# Guardar el classification report en un archivo de texto
with open(os.path.join(output_dir, 'classification_reports_12000_CV.txt'), 'w') as file:
    for report in classification_reports:
        file.write(f"Model: {report['Model']}\n")
        file.write(f"{report['Classification Report']}\n")
        file.write("="*80 + "\n")

# 4. Mostrar resultados
print(results_df)

Starting Grid Search for: Logistic Regression
Fitting 5 folds for each of 48 candidates, totalling 240 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best params for Logistic Regression: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
Infering with best model: Logistic Regression
Starting Grid Search for: Random Forest
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best params for Random Forest: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Infering with best model: Random Forest
Starting Grid Search for: Support Vector Machine
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best params for Support Vector Machine: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Infering with best model: Support Vector Machine
Starting Grid Search for: Gradient Boosting
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best params for Gradient Boosting: {'learning_rate': 0.1, 'n_estimators': 200}
Infering with best model: Gradient Boosting
                    Model                                    Best Parameters  \
0     Logistic Regression  {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}  