In [42]:
import os
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib


In [43]:
import random  # required by random.seed below; it is missing from the notebook's import cell

# Seed every random number generator used by the notebook so reruns are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Exported for child processes; Python reads PYTHONHASHSEED at interpreter start-up,
# so setting it here does not change string hashing in the already-running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)

## WORD_TO_VECT con LR

In [44]:
# Configuration for the training cells.

# Embedding files: the GloVe text file and the word2vec-format file that is actually loaded.
# NOTE(review): glove_input_file is not referenced by the visible cells; presumably it is the
# input of a one-off GloVe -> word2vec conversion - confirm before removing.
word2vec_output_file = "glove.6B/glove.6B.100d.word2vec.txt"
glove_input_file = "glove.6B/glove.6B.100d.txt"

# Directories: per-fold CSV inputs (trailing slash is relied on by the f-strings that
# build file names) and the root under which models and results are written.
dir_path_model = "../model"
dir_path = "../process/folds/"

In [45]:
# Load the pre-trained word vectors from the text (non-binary) word2vec-format file.
w2v_model = KeyedVectors.load_word2vec_format(
    word2vec_output_file,
    binary=False,
)
# Length of each word vector; document_vector uses it to size its all-zero fallback.
embedding_dim = w2v_model.vector_size

In [46]:
def document_vector(doc):
    """Represent a document as the mean of its known word vectors.

    The text is split on whitespace; tokens that are not in ``w2v_model`` are
    ignored.

    Args:
        doc: Document text as a string.

    Returns:
        A 1-D array: the element-wise mean of the vectors of the known tokens,
        or a zero vector of length ``embedding_dim`` when no token is known
        (including the empty string).
    """
    vectors = []
    for token in doc.split():
        if token in w2v_model:
            vectors.append(w2v_model[token])
    if vectors:
        return np.mean(vectors, axis=0)
    # No known token: fall back to an all-zero vector.
    return np.zeros(embedding_dim)

In [47]:

# Label encoder shared by the fold loop below, which re-fits it on each fold's training labels.
label_encoder = LabelEncoder()

In [12]:

# Per-fold training and evaluation: for every fold, fit a logistic regression on
# averaged word vectors, report validation metrics, and persist model, encoder and metrics.
# NOTE: dir_path is reassigned by later cells of this notebook, so this cell must run
# while dir_path still points at the folds directory.
N_FOLDS = 10  # fold files are named train_fold_<k>.csv / val_fold_<k>.csv for k in 0..N_FOLDS-1

# The results directory is the same for every fold, so create it once up front.
results_dir = f"{dir_path_model}/results"
os.makedirs(results_dir, exist_ok=True)

for fold in range(N_FOLDS):
    print(f"\n*************************** Fold {fold}")

    # Load this fold's splits; missing cells become empty strings.
    train_df = pd.read_csv(f"{dir_path}train_fold_{fold}.csv").fillna("")
    val_df = pd.read_csv(f"{dir_path}val_fold_{fold}.csv").fillna("")

    # Encode labels: fit on the training labels, reuse the same mapping for validation.
    y_train = label_encoder.fit_transform(train_df["Level"])
    y_val = label_encoder.transform(val_df["Level"])
    # Explicit class ids keep the confusion matrix shape and the report rows aligned
    # with label_encoder.classes_ even if a class is absent from y_val and y_pred.
    class_ids = np.arange(len(label_encoder.classes_))

    # Averaged word-vector features, one row per document.
    X_train = np.vstack(train_df["Clean_Content"].apply(document_vector).to_numpy())
    X_val = np.vstack(val_df["Clean_Content"].apply(document_vector).to_numpy())

    # Train the classifier.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # Predict on the validation split.
    y_pred = model.predict(X_val)

    # Confusion matrix.
    cm = confusion_matrix(y_val, y_pred, labels=class_ids)
    print("Confusion Matrix:")
    print(cm)

    # Per-class metrics; the table is built once and reused for printing and saving.
    report = classification_report(
        y_val,
        y_pred,
        labels=class_ids,
        target_names=label_encoder.classes_,
        output_dict=True,
    )
    report_df = pd.DataFrame(report).transpose()
    print("Classification Report:")
    print(report_df)

    # Save the model and the encoder fitted for this fold.
    fold_dir = f"{dir_path_model}/fold_{fold}"
    os.makedirs(fold_dir, exist_ok=True)

    joblib.dump(model, f"{fold_dir}/word2vec_logreg_fold_{fold}.joblib")
    joblib.dump(label_encoder, f"{fold_dir}/word2vec_le_{fold}.joblib")

    # Save the metrics.
    report_df.to_csv(f"{results_dir}/word2vec_report_fold_{fold}.csv")
    np.savetxt(f"{results_dir}/word2vec_cm_fold_{fold}.txt", cm, fmt='%d')


*************************** Fold 0
Confusion Matrix:
[[79  0  0]
 [ 1 75  3]
 [ 0  3 76]]
Classification Report:
              precision    recall  f1-score     support
Advertencia    0.987500  1.000000  0.993711   79.000000
Error          0.961538  0.949367  0.955414   79.000000
Informativo    0.962025  0.962025  0.962025   79.000000
accuracy       0.970464  0.970464  0.970464    0.970464
macro avg      0.970355  0.970464  0.970383  237.000000
weighted avg   0.970355  0.970464  0.970383  237.000000

*************************** Fold 1
Confusion Matrix:
[[80  0  0]
 [ 0 80  0]
 [ 0  6 74]]
Classification Report:
              precision  recall  f1-score  support
Advertencia    1.000000   1.000  1.000000   80.000
Error          0.930233   1.000  0.963855   80.000
Informativo    1.000000   0.925  0.961039   80.000
accuracy       0.975000   0.975  0.975000    0.975
macro avg      0.976744   0.975  0.974965  240.000
weighted avg   0.976744   0.975  0.974965  240.000

**********************

In [None]:
## Evaluar el modelo con conjunto de validación externo

In [None]:
# Load the full training set, show its label set and size, then drop rows without cleaned text.
# Note: this rebinds dir_path, which earlier cells use for the folds directory.
dir_path = "../process/"
train_df = pd.read_csv(f"{dir_path}train_data.csv", encoding='utf-8')
print(set(train_df["Level"]), train_df.shape, sep="\n")

# Keep only rows whose Clean_Content is present, and show the size after the drop.
train_df = train_df.dropna(subset=["Clean_Content"])
print(train_df.shape)
train_df.head()

In [None]:
# Load the external test set from dir_path, show its label set and size,
# then drop rows without cleaned text.
test_df = pd.read_csv(f"{dir_path}external_test_data.csv", encoding='utf-8')
print(set(test_df["Level"]), test_df.shape, sep="\n")

# Keep only rows whose Clean_Content is present, and show the size after the drop.
test_df = test_df.dropna(subset=["Clean_Content"])
print(test_df.shape)
test_df.head()

# Evaluar el modelo con conjunto de validación externo

In [14]:
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from gensim.models import KeyedVectors

# Paths used by the external-evaluation cells.
# Word vectors in word2vec text format.
word2vec_path = "glove.6B/glove.6B.100d.word2vec.txt"
# Directory holding the external evaluation CSV (trailing slash is relied on by the
# f-strings that build file names).
dir_path_ev = "../process/"
# Root directory of the per-fold saved models and encoders.
dir_path_model = "../model"


In [28]:
# Load the classifier and label encoder saved for the fold selected below (fold 4, zero-based).
fold = 4
# joblib.load unpickles the file, which can execute arbitrary code: only load artifacts
# produced by this project.
model = joblib.load(f"{dir_path_model}/fold_{fold}/word2vec_logreg_fold_{fold}.joblib")
label_encoder = joblib.load(f"{dir_path_model}/fold_{fold}/word2vec_le_{fold}.joblib")
# Pre-trained word vectors in text (non-binary) word2vec format.
w2v_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)
# Length of each word vector; document_vector uses it to size its all-zero fallback.
embedding_dim = w2v_model.vector_size

In [29]:
def document_vector(doc):
    """Average the word vectors of the tokens of ``doc`` that the model knows.

    Tokens come from a whitespace split; tokens missing from ``w2v_model`` are
    skipped.

    Args:
        doc: Document text as a string.

    Returns:
        A 1-D array with the element-wise mean of the known tokens' vectors, or
        ``np.zeros(embedding_dim)`` when none of the tokens is known.
    """
    known_vectors = [w2v_model[token] for token in doc.split() if token in w2v_model]
    if not known_vectors:
        # Nothing to average: return the all-zero fallback.
        return np.zeros(embedding_dim)
    return np.mean(known_vectors, axis=0)

In [30]:
# Load the external evaluation dataset; missing values are replaced with empty strings.
ev_df = pd.read_csv(f"{dir_path_ev}external_test_data.csv").fillna("")
print(ev_df.shape)
ev_df.head()

(320, 6)


Unnamed: 0,Content,Level,datetime,content_length,source,Clean_Content
0,"Animating brightness: target=38, rate=200",Informativo,2024-03-17 16:15:58.884,4,Android,animating brightness target rate
1,getRunningAppProcesses: caller 10111 does not ...,Advertencia,2024-03-17 16:15:22.152,6,Android,getrunningappprocesses caller does hold limiti...
2,isSimPinSecure mSimDatas is null or empty,Error,2024-03-17 16:13:46.764,3,Android,issimpinsecure msimdatas null
3,jk2_init() Found child 8765 in scoreboard slot 11,Informativo,2005-12-04 05:04:03.000,3,Apache,child scoreboard slot
4,workerEnv.init() ok /etc/httpd/conf/workers2.p...,Informativo,2005-12-04 17:34:57.000,6,Apache,workerenv init ok httpd conf properties


In [31]:
# Build the feature matrix (one averaged word vector per row of Clean_Content)
# and encode the ground-truth labels with the loaded encoder.
X_ev = np.vstack([document_vector(text) for text in ev_df["Clean_Content"]])
y_ev = ev_df["Level"]
y_true = label_encoder.transform(y_ev)

In [32]:

# Predict encoded class ids for the external set, then map them back to the original label values.
y_pred = model.predict(X_ev)
y_pred_labels = label_encoder.inverse_transform(y_pred)


In [33]:
# Attach the predicted labels to the evaluation dataframe as a new column.
ev_df["Predicted_Level"] = y_pred_labels

In [34]:
# Per-class precision, recall and F1 on the external set.
# Despite its name, report_dict holds the formatted text report (output_dict is not
# requested), so it is printed directly.
report_dict = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print(report_dict)


              precision    recall  f1-score   support

 Advertencia       1.00      0.95      0.98        44
       Error       0.97      0.99      0.98       138
 Informativo       0.99      0.99      0.99       138

    accuracy                           0.98       320
   macro avg       0.99      0.98      0.98       320
weighted avg       0.98      0.98      0.98       320



In [35]:
# Overall accuracy on the external set (encoded true labels vs. encoded predictions).
accuracy = accuracy_score(y_true, y_pred)
accuracy

0.98125

In [36]:
# Confusion matrix on the external set: rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_true, y_pred)
conf_matrix

array([[ 42,   2,   0],
       [  0, 136,   2],
       [  0,   2, 136]])

In [37]:
# Assigns the same predicted-label column again (an earlier cell already did this), then previews the result.
ev_df["Predicted_Level"] = y_pred_labels
ev_df.head()

Unnamed: 0,Content,Level,datetime,content_length,source,Clean_Content,Predicted_Level
0,"Animating brightness: target=38, rate=200",Informativo,2024-03-17 16:15:58.884,4,Android,animating brightness target rate,Informativo
1,getRunningAppProcesses: caller 10111 does not ...,Advertencia,2024-03-17 16:15:22.152,6,Android,getrunningappprocesses caller does hold limiti...,Advertencia
2,isSimPinSecure mSimDatas is null or empty,Error,2024-03-17 16:13:46.764,3,Android,issimpinsecure msimdatas null,Error
3,jk2_init() Found child 8765 in scoreboard slot 11,Informativo,2005-12-04 05:04:03.000,3,Apache,child scoreboard slot,Informativo
4,workerEnv.init() ok /etc/httpd/conf/workers2.p...,Informativo,2005-12-04 17:34:57.000,6,Apache,workerenv init ok httpd conf properties,Informativo


In [38]:
# Output directory for the predictions file (same value as dir_path_ev).
dir_path = "../process/"
# Save the evaluation dataframe, including the Predicted_Level column, to CSV without the index.
# NOTE(review): the "5_10" prefix presumably means fold 5 of 10 (fold = 4, zero-based) - confirm.
ev_df.to_csv(f'{dir_path}5_10external_test_data_word2vect_prediction.csv', index=False)