In [64]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
import random
import os
import warnings
warnings.filterwarnings('ignore')

In [65]:
# Reproducibility: one seed shared by every random number source used here.
SEED = 42
# NOTE: the running interpreter fixed its hash seed at startup, so this
# assignment can only influence processes launched after this point.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)   # NumPy legacy global generator
random.seed(SEED)      # Python's built-in generator

In [66]:
# Root directory for the trained artefacts; the training loop writes into the
# fold_<n>/ and results/ sub-folders below it.
dir_path_model = "../model/"

## Cargar datos

In [67]:
# Label encoder shared by the training loop; it maps the "Level" strings to
# integer class ids.
label_encoder = LabelEncoder()
# Folder holding the train_fold_<n>.csv / val_fold_<n>.csv files.
dir_path = "../process/folds/"

In [68]:
for fold in range(10):

    # "\\" prints the same single backslash the banner always showed, without
    # relying on the invalid "\ " escape sequence (a warning on recent Pythons).
    print(f"***************************\\ Fold {fold}")

    # Make sure the output folders exist: joblib.dump, DataFrame.to_csv and
    # np.savetxt do not create missing parent directories.
    fold_dir = f"{dir_path_model}/fold_{fold}"
    results_dir = f"{dir_path_model}/results"
    os.makedirs(fold_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)

    # Load data (missing cells become empty strings so TF-IDF never sees NaN)
    train_df = pd.read_csv(f"{dir_path}train_fold_{fold}.csv").fillna("")
    val_df = pd.read_csv(f"{dir_path}val_fold_{fold}.csv").fillna("")

    # Encode labels: fit on the training split, reuse the mapping for validation
    y_train = label_encoder.fit_transform(train_df["Level"])
    y_val = label_encoder.transform(val_df["Level"])
    # Every class id known to the encoder. Passing these explicitly keeps the
    # confusion matrix shape and the report rows stable across folds, and avoids
    # the ValueError classification_report raises when target_names is longer
    # than the set of labels actually present in y_val / y_pred.
    class_ids = list(range(len(label_encoder.classes_)))

    # TF-IDF (vocabulary learned from the training split only)
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train = vectorizer.fit_transform(train_df["Clean_Content"])
    X_val = vectorizer.transform(val_df["Clean_Content"])

    # Model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_val)

    # Confusion matrix
    cm = confusion_matrix(y_val, y_pred, labels=class_ids)
    print("Confusion Matrix:")
    print(cm)

    # Metrics
    report = classification_report(
        y_val,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        output_dict=True,
    )
    report_df = pd.DataFrame(report).transpose()
    print("Classification Report:")
    print(report_df)

    # Save model, vectorizer and label encoder for this fold
    joblib.dump(model, f"{fold_dir}/tfidf_logreg_fold_{fold}.joblib")
    joblib.dump(vectorizer, f"{fold_dir}/tfidf_vectorizer_fold_{fold}.joblib")
    joblib.dump(label_encoder, f"{fold_dir}/tfidf_le_{fold}.joblib")

    # Save metrics
    report_df.to_csv(f"{results_dir}/tfidf_report_fold_{fold}.csv")
    np.savetxt(f"{results_dir}/cm_fold_{fold}.txt", cm, fmt='%d')

***************************\ Fold 0
Confusion Matrix:
[[79  0  0]
 [ 0 74  5]
 [ 0  1 78]]
Classification Report:
              precision    recall  f1-score     support
Advertencia    1.000000  1.000000  1.000000   79.000000
Error          0.986667  0.936709  0.961039   79.000000
Informativo    0.939759  0.987342  0.962963   79.000000
accuracy       0.974684  0.974684  0.974684    0.974684
macro avg      0.975475  0.974684  0.974667  237.000000
weighted avg   0.975475  0.974684  0.974667  237.000000
***************************\ Fold 1
Confusion Matrix:
[[79  0  1]
 [ 0 79  1]
 [ 0  0 80]]
Classification Report:
              precision    recall  f1-score     support
Advertencia    1.000000  0.987500  0.993711   80.000000
Error          1.000000  0.987500  0.993711   80.000000
Informativo    0.975610  1.000000  0.987654   80.000000
accuracy       0.991667  0.991667  0.991667    0.991667
macro avg      0.991870  0.991667  0.991692  240.000000
weighted avg   0.991870  0.991667  0.991692 

## Evaluar el modelo con conjunto de validación externa

In [69]:
import joblib
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score

# Paths
# `fold` selects which fold's saved artefacts are evaluated below;
# `dir_path_model` is the root folder those artefacts are read from.
fold = 0
dir_path_model = "../model/"


In [70]:
# Files written for the selected fold: classifier, TF-IDF vectorizer, label encoder.
model_path = f"{dir_path_model}/fold_{fold}/tfidf_logreg_fold_{fold}.joblib"
vectorizer_path = f"{dir_path_model}/fold_{fold}/tfidf_vectorizer_fold_{fold}.joblib"
le_path = f"{dir_path_model}/fold_{fold}/tfidf_le_{fold}.joblib"

# Load the three artefacts in one pass, in the same order as the paths above.
model, vectorizer, label_encoder = map(
    joblib.load, (model_path, vectorizer_path, le_path)
)


In [71]:
# Folder containing the external evaluation set.
dir_path_ev = "../process/"

# Read the external set; missing cells become empty strings.
ev_df = (
    pd.read_csv(f"{dir_path_ev}external_test_data.csv")
    .fillna("")
)
print(ev_df.shape)
ev_df.head()

(320, 6)


Unnamed: 0,Content,Level,datetime,content_length,source,Clean_Content
0,"Animating brightness: target=38, rate=200",Informativo,2024-03-17 16:15:58.884,4,Android,animating brightness target rate
1,getRunningAppProcesses: caller 10111 does not ...,Advertencia,2024-03-17 16:15:22.152,6,Android,getrunningappprocesses caller does hold limiti...
2,isSimPinSecure mSimDatas is null or empty,Error,2024-03-17 16:13:46.764,3,Android,issimpinsecure msimdatas null
3,jk2_init() Found child 8765 in scoreboard slot 11,Informativo,2005-12-04 05:04:03.000,3,Apache,child scoreboard slot
4,workerEnv.init() ok /etc/httpd/conf/workers2.p...,Informativo,2005-12-04 17:34:57.000,6,Apache,workerenv init ok httpd conf properties


In [72]:
# Cleaned text is the model input; "Level" holds the reference labels.
X_ev, y_ev = ev_df["Clean_Content"], ev_df["Level"]

In [73]:
# Transform the text with the loaded vectorizer (transform only, no re-fitting)
X_ev_new = vectorizer.transform(X_ev)

In [74]:
# Predict encoded class ids for the external set ...
y_pred = model.predict(X_ev_new)
# ... and map them back to the original label strings.
y_pred_labels = label_encoder.inverse_transform(y_pred)


In [75]:
# Encode the true labels with the loaded encoder so they are comparable to y_pred
y_true = label_encoder.transform(y_ev)

In [76]:
# Compute metrics.
# `labels` is pinned to every class id the encoder knows so that target_names
# always lines up; without it classification_report raises ValueError when a
# class is absent from both the external labels and the predictions.
# NOTE: despite its name, report_dict holds the plain-text report
# (output_dict is not set).
report_dict = classification_report(
    y_true,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
)
print(report_dict)


              precision    recall  f1-score   support

 Advertencia       0.98      0.95      0.97        44
       Error       0.99      0.99      0.99       138
 Informativo       0.99      0.99      0.99       138

    accuracy                           0.98       320
   macro avg       0.98      0.98      0.98       320
weighted avg       0.98      0.98      0.98       320



In [77]:
# Overall accuracy on the external set (fraction of matching encoded labels)
accuracy = accuracy_score(y_true, y_pred)
accuracy

0.984375

In [78]:
# Attach the predicted label strings next to the true "Level" column.
# NOTE: this adds a column to ev_df in place.
ev_df["Predicted_Level"] = y_pred_labels
ev_df.head()

Unnamed: 0,Content,Level,datetime,content_length,source,Clean_Content,Predicted_Level
0,"Animating brightness: target=38, rate=200",Informativo,2024-03-17 16:15:58.884,4,Android,animating brightness target rate,Informativo
1,getRunningAppProcesses: caller 10111 does not ...,Advertencia,2024-03-17 16:15:22.152,6,Android,getrunningappprocesses caller does hold limiti...,Advertencia
2,isSimPinSecure mSimDatas is null or empty,Error,2024-03-17 16:13:46.764,3,Android,issimpinsecure msimdatas null,Informativo
3,jk2_init() Found child 8765 in scoreboard slot 11,Informativo,2005-12-04 05:04:03.000,3,Apache,child scoreboard slot,Informativo
4,workerEnv.init() ok /etc/httpd/conf/workers2.p...,Informativo,2005-12-04 17:34:57.000,6,Apache,workerenv init ok httpd conf properties,Informativo


In [79]:
# Save the predictions to a CSV file.
# A dedicated variable is used instead of rebinding `dir_path`: the training
# loop reads `dir_path` as the folds directory ("../process/folds/"), so
# overwriting it here would make a later re-run of that loop look for the fold
# files in the wrong folder.
dir_path_predictions = "../process/"
ev_df.to_csv(f'{dir_path_predictions}4_10external_test_data_tdidf_prediction.csv', index=False)