In [37]:
import pandas as pd 
import pickle
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
import configparser

In [38]:
# Restore the pipeline object that was pickled under ../artifacts.
# NOTE: pickle.load can execute arbitrary code, so only load trusted artifacts.
with open('../artifacts/pipeline.pkl', 'rb') as pipeline_file:
    smartgrid_pipeline = pickle.load(pipeline_file)

In [39]:
# Read the training frame and the test frame from the processed-data folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/feautures_for_models.csv',
        '../data/processed/test_dataset.csv',
    )
)

In [40]:
# Load the pipeline settings. ConfigParser.read() silently skips files it
# cannot open, so fail fast here instead of surfacing a less helpful
# "no section" error when the options are first read.
config = configparser.ConfigParser()
loaded_config_files = config.read('../pipeline.cfg')
if not loaded_config_files:
    raise FileNotFoundError("Could not read configuration file '../pipeline.cfg'")
# Keep the list of parsed files as the cell output.
loaded_config_files

['../pipeline.cfg']

In [41]:
# Resolve the column selection once from the [GENERAL] section of the config.
dropped_columns = config.get('GENERAL', 'VARS_TO_DROP').split(', ')
target_column = config.get('GENERAL', 'TARGET')

# Training split: predictors are every column not listed in VARS_TO_DROP.
x_features = train_data.drop(columns=dropped_columns)
y_target = train_data[target_column]

# Test split, built with the same column selection.
x_features_test = test_data.drop(columns=dropped_columns)
y_target_test = test_data[target_column]

In [42]:
# Display the loaded test set (shown truncated in the cell output) to inspect its columns.
test_data

Unnamed: 0,tau1,tau2,tau3,tau4,p1,p2,p3,p4,g1,g2,g3,g4,stab,stabf
0,-0.157188,0.066192,0.721890,0.043371,0.387322,0.245099,-0.386437,-0.530197,1.444940,1.714924,-1.041051,0.353132,1.961919,1
1,1.201832,-0.202714,0.495467,0.599151,-0.353089,-0.012173,1.594401,-0.972739,0.222884,-0.109862,-0.972055,-0.753512,0.422430,1
2,-0.391252,0.946300,0.362816,-1.209971,0.243255,-0.313624,-1.164388,1.058616,0.357443,-1.314052,-0.508506,-1.385353,-0.892918,0
3,-0.190846,-0.663873,0.055758,1.575784,-0.275177,1.135147,0.279579,-0.939151,-0.003094,0.029737,0.891238,-1.184966,0.465899,1
4,-0.045653,0.065259,0.127840,1.167994,-1.044114,1.619889,-0.198622,0.388812,0.966234,1.013356,-0.036158,0.341200,1.762654,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,-1.016666,0.777253,1.301505,-0.599454,0.100448,0.084444,0.096073,-0.355083,0.492998,0.193795,0.308089,0.603722,0.442180,1
11996,1.155370,-0.181195,1.518602,-0.197318,0.493696,-1.102410,-1.109712,1.359010,0.496960,-0.119861,0.831375,-0.428824,1.121445,1
11997,-0.654154,0.072829,0.672756,1.154029,-0.423355,0.560549,1.660031,-1.489969,0.008912,-0.839488,-0.695876,-0.641998,-0.271962,1
11998,-1.135765,1.574695,1.354067,0.698996,0.376050,-1.585966,1.271752,-0.338913,-0.427872,0.300840,-0.437681,-1.060964,0.023514,1


In [43]:
import mlflow # importamos mlflow

In [44]:
# Connection settings: point the MLflow client at the local tracking server,
# then select the experiment to record runs under.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name='Smart Grid Predict Model')

<Experiment: artifact_location='mlflow-artifacts:/924688578835200205', creation_time=1734486810583, experiment_id='924688578835200205', last_update_time=1734486810583, lifecycle_stage='active', name='Smart Grid Predict Model', tags={}>

In [45]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score

# Fixed seed for the randomized tree-based estimators, so the accuracy ranking
# (and therefore the model picked afterwards) is reproducible across re-runs.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the name used for the MLflow metric/artifact.
modelos = {
    # KNN models
    'KNN_default': KNeighborsClassifier(),
    'KNN_optimized': KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto'),
    'KNN_optimized2': KNeighborsClassifier(n_neighbors=10, weights='uniform', algorithm='ball_tree'),

    # DecisionTree models
    'DecisionTree_default': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'DecisionTree_optimized': DecisionTreeClassifier(
        max_depth=10, min_samples_split=5, random_state=RANDOM_STATE
    ),

    # RandomForest models
    'RandomForest_default': RandomForestClassifier(random_state=RANDOM_STATE),
    'RandomForest_optimized': RandomForestClassifier(
        n_estimators=150, max_depth=20, random_state=RANDOM_STATE
    ),

    # GradientBoosting models
    'GradientBoosting_default': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'GradientBoosting_optimized': GradientBoostingClassifier(
        n_estimators=200, learning_rate=0.05, random_state=RANDOM_STATE
    ),

    # LogisticRegression models
    'LogisticRegression_default': LogisticRegression(),
    'LogisticRegression_optimized': LogisticRegression(C=0.5, max_iter=1000),
    'LogisticRegression_optimized2': LogisticRegression(penalty='l2', C=1.0, solver='liblinear'),

    # XGBoost models
    'XGBoost_default': XGBClassifier(),
    'XGBoost_optimized': XGBClassifier(n_estimators=300, learning_rate=0.1),
    'XGBoost_optimized2': XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.05)
}

# Test-set accuracy per candidate name; filled in as each model is evaluated.
resultados = {}

# All candidates are tracked inside a single MLflow run. The context manager
# ends the run when the block exits (including on an error or interrupt), so
# no explicit mlflow.end_run() call is needed.
with mlflow.start_run():
    for nombre, modelo in modelos.items():
        # Fit on the training split and score on the held-out test split.
        modelo.fit(x_features, y_target)
        y_preds = modelo.predict(x_features_test)
        acc = accuracy_score(y_target_test, y_preds)
        resultados[nombre] = acc
        print(f'{nombre} Accuracy: {acc}')

        # Record the accuracy and the fitted model under model-specific names.
        mlflow.log_metric(f'{nombre}_accuracy', acc)
        mlflow.sklearn.log_model(modelo, f'{nombre}_model')



KNN_default Accuracy: 0.95275




KNN_optimized Accuracy: 0.9535833333333333
🏃 View run painted-bird-322 at: http://127.0.0.1:5000/#/experiments/924688578835200205/runs/f86db56d236d46dd8b8b4eecc1f1323d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/924688578835200205


KeyboardInterrupt: 

In [19]:
# Accuracy is a "higher is better" score, so the best candidate is the one
# with the maximum value in `resultados` (min() would pick the worst model).
mejor_modelo_nombre = max(resultados, key=resultados.get)
mejor_modelo = modelos[mejor_modelo_nombre]

In [20]:
# Append the selected model as the last pipeline step. The append mutates the
# pipeline in place, so guard it by step name: re-running this cell must not
# add the same step a second time.
nombre_paso_modelo = f'modelo_{mejor_modelo_nombre}'
if all(nombre_paso != nombre_paso_modelo for nombre_paso, _ in smartgrid_pipeline.steps):
    smartgrid_pipeline.steps.append((nombre_paso_modelo, mejor_modelo))

In [21]:
# Serialize the current pipeline object to the artifacts folder.
with open('../artifacts/pipeline_final.pkl', 'wb') as output_file:
    pickle.dump(smartgrid_pipeline, output_file)

In [22]:
# Display the pipeline object as the cell output to inspect its steps.
smartgrid_pipeline