# API WORKFLOW: SKLEARN LOGISTIC REGRESSION

## Codigo base

In [1]:
import pandas as pd

#Funcion de carga, filtrado en funcion de la logica del negocio y asignacion de class segun ofertas aceptadas
def load_data():

    import pandas as pd

    df = pd.read_csv("2023-09-15_duplicate_2.csv")

    df = df[df["last_seen"]>= '2022-01-01']
    df.loc[df['accepted'] > 0.0, 'classification'] = 1
    df.loc[df['accepted'] == 0.0, 'classification'] = 0
    df['classification'] = df['classification'].astype('int32')


    return df


#Funcion para optimizar el desequilibrio de datos, filtrando una porcion de los non_interaction con la totalidad de accepted
def balance_data():

    df = load_data()

    non_interaction_quantity = len(df[df.classification == 0])
    accepted_quantity = len(df[df.classification == 1])

    non_interaction_data = df[df["classification"] == 0]
    accepted_data = df[df["classification"] == 1]

    non_interaction_factor = 0.9 
    accepted_factor = 1 - non_interaction_factor

    non_interaction_quantity = int(accepted_quantity * non_interaction_factor /accepted_factor )

    non_interaction_partion = non_interaction_data.sample(n=non_interaction_quantity)
    balance_df = pd.concat([non_interaction_partion, accepted_data])

    
    return balance_df

#Funcion para definir los datos que entran en el modelo 
def var_definition():
    
    balance_df = balance_data()

    from sklearn.preprocessing import MinMaxScaler
    from sklearn.feature_selection import SelectKBest, chi2, f_classif

    y = balance_df["classification"].to_numpy()
    x = balance_df.drop(["classification","last_login","last_seen","validation_state","town","state"], axis = 1).to_numpy()

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(x)
    x_normal = scaler.transform(x)

    selectKBest = SelectKBest(chi2, k=2) 
    x = selectKBest.fit_transform(x_normal, y)

    cols = selectKBest.get_support(indices=True)
    variables = balance_df.iloc[:,cols].columns

    x_new = pd.DataFrame(x, columns = variables)
    y_new = pd.DataFrame(y, columns = ["classification"])

    print(f"For chi2 test, choosen variables are {str(list(variables))}")


    return x_new, y_new

#Función para particionar los datos
def make_train_test_split(x_new, y_new):

    from sklearn.model_selection import train_test_split

    (x_train, x_test, y_train, y_test) = train_test_split(
                                                          x_new,
                                                          y_new,
                                                          test_size=0.2,
                                                          random_state=123456,
                                                          shuffle = True
                                                      )

    return x_train, x_test, y_train, y_test


#Calculo de metricas de evaluacion
def eval_metrics(y_test, y_pred):

    from sklearn.metrics import mean_squared_error,accuracy_score, recall_score, precision_score,balanced_accuracy_score,f1_score,fbeta_score,classification_report
    from sklearn.metrics import auc, roc_auc_score, roc_curve
    
    asc = accuracy_score(
                          # Ground truth (correct) labels.
                          y_true=y_test,
                          # Predicted labels, as returned by a classifier.
                          y_pred=y_pred,
                          # If False, return the number of correctly classified samples. Otherwise,return the fraction of correctly classified samples.
                          normalize=True,
                          )
    bas = balanced_accuracy_score(y_test,y_pred)
    rs = recall_score(y_test, y_pred)
    ps = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    ras = roc_auc_score(y_test,y_pred)
    
    

    return bas, asc, rs, ps, f1, ras

#Enviamos reporte de metricas de evaluacion
def report(estimator,bas, asc, rs, ps, f1, ras):

    print(estimator, ":", sep="")
    print("------------------------------------")
    print(f"El accuracy score es {str(round(asc,4))}")
    print(f"El accuracy balanced score es {str(round(bas,4))}")
    print(f"El recall score es {str(round(rs,4))}")
    print(f"El precision score es {str(round(ps,4))}")
    print(f"El f1_score es {str(round(f1,4))}")
    print(f"El área bajo la curva ROC es {str(round(ras,4))}")
    

    #Tracking URI
def set_tracking_uri():

    import mlflow

    mlflow.set_tracking_uri('sqlite:///mlruns.db')


    #Display config 
def display_config():

    import mlflow

    print("Current model registry uri: {}".format(mlflow.get_registry_uri()))
    print("      Current tracking uri: {}".format(mlflow.get_tracking_uri()))


    #Modelo
def train_estimator( C=10, fit_intercept=False, max_iter=500):

    import os
    import pandas as pd

    from sklearn.linear_model import LogisticRegression
    

    import mlflow
    import mlflow.sklearn
    

    """Load and split data"""
    x_new, y_new = var_definition()
    x_train, x_test, y_train, y_test = make_train_test_split(x_new, y_new)
    
    estimator = LogisticRegression(C=C, fit_intercept=fit_intercept, max_iter=max_iter ,
                                       penalty = 'l2',multi_class='ovr', solver= 'lbfgs',tol = 0.001,random_state = None, verbose = 0, warm_start = False, l1_ratio = None)
    
    set_tracking_uri()
    
    with mlflow.start_run(run_name="DEMO_RUN") as run:
        
        print(f"MLflow run ID: {run.info.run_id}")
        
        estimator.fit(x_train, y_train.values.ravel())
        bas, asc, rs, ps, f1, ras = eval_metrics(y_test, y_pred=estimator.predict(x_test))
        
        report(estimator,bas, asc, rs, ps, f1, ras)
    
        
        mlflow.log_param("C", C)
        mlflow.log_param("fit_intercept",fit_intercept )
        mlflow.log_param("max_iter", max_iter)
       
        mlflow.log_metric("bas", bas)
        mlflow.log_metric("asc", asc)
        mlflow.log_metric("rs", rs)
        mlflow.log_metric("ps", ps)
        mlflow.log_metric("f1", f1)
        mlflow.log_metric("ras", ras)

        
# Registro del modelo como version 1
    
        mlflow.sklearn.log_model(
            sk_model=estimator,
            artifact_path="model",
            registered_model_name=f"sklearn-{C}-logistic-regressor-model"
        )

        mlflow.sklearn.save_model(estimator, "/tmp/my_model")

train_estimator( C = 10, fit_intercept = False, max_iter = 500)

For chi2 test, choosen variables are ['item_post', 'swap_offers_sent']
MLflow run ID: 05785e0a5d554e11ab9023874c5dcd1d
LogisticRegression(C=10, fit_intercept=False, max_iter=500, multi_class='ovr',
                   tol=0.001):
------------------------------------
El accuracy score es 0.9485
El accuracy balanced score es 0.853
El recall score es 0.733
El precision score es 0.7557
El f1_score es 0.7442
El área bajo la curva ROC es 0.853


Registered model 'sklearn-10-logistic-regressor-model' already exists. Creating a new version of this model...
2023/03/26 01:29:45 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-10-logistic-regressor-model, version 3
Created version '3' of model 'sklearn-10-logistic-regressor-model'.


## Cambio del estado a Productivo del modelo

In [4]:
def change_name():

    import mlflow

    client = mlflow.tracking.MlflowClient()

    client.transition_model_version_stage(
        name="sklearn-10-logistic-regressor-model",
        version=1,
        #  Staging|Archived|Production|None
        stage="Production",
    )


change_name()

## Uso del modelo con models serve

In [5]:
def get_json_test_data():

    x_new, y_new = var_definition()
    x_train, x_test, y_train, y_test = make_train_test_split(x_new, y_new)

    data = x_test.iloc[0:200,:].to_json(orient='split')

    data = repr(data)
    return data

data = get_json_test_data()
data

For chi2 test, choosen variables are ['item_post', 'swap_offers_sent']


'\'{"columns":["item_post","swap_offers_sent"],"index":[5938,7979,33308,9196,25758,20251,453,1494,32611,10066,21646,1845,18304,12295,6004,25151,6768,19036,6814,36691,15902,31961,24838,35530,16483,12480,20627,1832,3074,17745,4481,11679,34385,19588,7210,26391,6291,28995,20804,31839,10202,22803,18441,25722,17935,34610,28483,12900,33387,35273,36345,35349,29578,35392,13132,9095,26274,28424,16917,7320,33651,4999,28589,3447,29675,21881,19864,24094,13956,8089,36186,26468,22044,34903,11544,9471,8593,19465,16776,8594,27957,27576,25374,14294,29333,11530,3849,26343,21205,7369,135,29743,37045,35003,31270,11998,31928,31240,30725,919,9123,28566,25570,3467,23761,34012,24267,26622,24087,35680,9629,14419,12539,112,5593,23273,15012,24786,4001,11558,815,15590,26306,1520,27017,29400,167,14399,11170,8441,27939,22239,24099,24085,4212,9455,5048,29611,7768,27080,36753,12237,3431,26092,9334,3482,11405,29149,27154,36369,14428,9116,34318,34729,11122,1968,27959,32539,18902,21115,25264,17662,321,21109,22041,21097,2

### Ahora prendemos el servidor de MLflow que sirve el API, para poder hacer el llamado al modelo

```
mlflow models serve --no-conda -m /tmp/my_model
```

In [10]:
!curl http://127.0.0.1:5000/invocations -H 'Content-Type: application/json' -d {data}

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0]