In [1]:
import mlflow
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
import logging


# Module-level logger for this notebook; the root logger is configured so that
# only warnings and more severe records are emitted.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

In [None]:
# Use a SQLite file (backend.db) as the MLflow tracking store.
mlflow.set_tracking_uri("sqlite:///backend.db")
# Select the experiment every run below is recorded under.
mlflow.set_experiment("Experimento_3")

# Everything logged inside this context manager belongs to the run "example_1".
with mlflow.start_run(run_name = "example_1"):

    # Iris features and targets returned as a (data, target) pair.
    X,y = load_iris(return_X_y= True)
    # Hyperparameters are logged to the run and passed unchanged to the estimator.
    params = {"C": 0.1, "random_state": 42}
    mlflow.log_params(params)

    lr = LogisticRegression(**params).fit(X, y)
    # Predictions are made on the same X the model was fitted on, so the metric
    # logged below is a training-set accuracy, not a held-out score.
    y_pred = lr.predict(X)
    mlflow.log_metric("accuracy", accuracy_score(y, y_pred))
    # Store the fitted model as a run artifact under "models".
    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

### Probaremos qué hacen las funciones que hay en el repositorio de MLOps

In [56]:
import pandas as pd
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split
import joblib
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, 
                             ConfusionMatrixDisplay, precision_recall_fscore_support, 
                             precision_score, recall_score, roc_auc_score)
import logging

# Configure the logger.
# basicConfig() does nothing when the root logger already has handlers (for
# example after the WARN-level setup in the first cell), so the root level is
# also set explicitly to make sure INFO records are really emitted.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger(__name__)



In [30]:
def read_data(file_name: str, path="D:/Repositorio/Proyecto-titanic/Notebooks") -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Args:
      file_name (str): name of the CSV file, joined onto ``path``
      path (str): directory that contains the file
    Returns:
      pd.DataFrame: the parsed contents of the CSV file
    """
    csv_location = os.path.join(path, file_name)
    frame = pd.read_csv(csv_location)
    return frame

In [34]:
# Load the tickets CSV from read_data's default directory and preview the first rows.
tickets = read_data("tickets_inputs_eng_1.csv")
tickets.head()

Unnamed: 0,complaint_what_happened,ticket_classification,processed_text,relevant_topics
0,Good morning my name is XXXX XXXX and I apprec...,Debt collection + Credit card debt,morning nam bank cardmemb servic i debt verifi...,Mortgage/Loan
1,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card + General-purpose ...,i card agent upgrad dat agent information orde...,Mortgage/Loan
2,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o...",card report howev application hav identity con...,Mortgage/Loan
3,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o...",book ticket offer ticket card i information of...,Mortgage/Loan
4,my grand son give me check for {$1600.00} i de...,Checking or savings account + Checking account,giv check deposit chas account fund cle bank c...,Bank Account Services


In [36]:
def data_transform(data: pd.DataFrame) -> tuple:
    """Split a tickets frame into the text feature and the topic label.

    The columns are read from the ``data`` argument itself rather than from a
    notebook-level variable, so the function works for any frame passed in.

    Args:
      data (pd.DataFrame): frame with the columns "processed_text" and
        "relevant_topics"
    Returns:
      X (pd.Series): the "processed_text" column
      y (pd.Series): the "relevant_topics" column
    Raises:
      KeyError: if either column is missing from ``data``
    """
    X = data["processed_text"]
    y = data["relevant_topics"]
    return X, y

In [6]:
def one_hot_encoding2(data, columns):
    """One-hot encode the given columns and cast the whole result to int.

    The first level of every encoded column is dropped (``drop_first=True``).
    The integer cast is applied to every column of the result, not only to the
    new indicator columns, so float values lose their decimals.

    Args:
      data (pd.DataFrame): frame to encode
      columns (list): names of the columns to turn into indicator columns
    Returns:
      pd.DataFrame: encoded frame with every column cast to int
    """
    return pd.get_dummies(data, columns=columns, drop_first=True).astype(int)


In [7]:
# NOTE(review): this call relies on an `X` holding the passenger table shown in
# the output below (Pclass, Age, Sex, ...); no cell visible above builds that
# frame — confirm where it comes from before a Restart & Run All.
one_hot_encoding2(X, ["Sex"])

Unnamed: 0,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Sex_male
0,3,22,1,0,7,1
1,1,38,1,0,71,0
2,3,26,0,0,7,0
3,1,35,1,0,53,0
4,3,35,0,0,8,1
...,...,...,...,...,...,...
882,2,27,0,0,13,1
883,1,19,0,0,30,0
884,3,7,1,2,23,0
885,1,26,0,0,30,1


In [8]:
def read_idx2label(json_path: str) -> dict:
    """Read a JSON file and return the mapping it contains.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.

    Args:
      json_path (str): path to the json file
    Returns:
      idx2label (dict): dictionary with the mapping
    Raises:
      FileNotFoundError: if ``json_path`` does not exist
      json.JSONDecodeError: if the file is not valid JSON
    """
    with open(json_path, encoding="utf-8") as f:
        idx2label = json.load(f)
    return idx2label

In [11]:
def decode_labels_into_idx(labels: pd.Series, idx2label: dict) -> pd.Series:
    """Translate every entry of a label Series through a lookup dictionary.

    Args:
      labels (pd.Series): series with the labels to translate
      idx2label (dict): dictionary whose keys are the values found in
        ``labels`` and whose values are the replacements
    Returns:
      pd.Series: a new series holding the mapped value for each label
    """
    translated = labels.map(idx2label)
    return translated

In [69]:
def fit_transform(
    X: pd.Series,
    output_path: str = 'D:/Repositorio/Proyecto-titanic/Notebooks/data_processed/count_vectorizer.pkl',
) -> np.ndarray:
    """Fit a CountVectorizer on the texts and persist the fitted vectorizer.

    Args:
      X (pd.Series): raw text documents to vectorize
      output_path (str): file the fitted vectorizer is dumped to with joblib;
        defaults to the location this notebook has always used
    Returns:
      The document-term matrix produced by ``CountVectorizer.fit_transform``.
      NOTE(review): scikit-learn documents this as a SciPy sparse matrix rather
      than a dense ``np.ndarray`` — confirm before relying on the annotation.
    """
    count_vectorizer = CountVectorizer()
    X_vectorized = count_vectorizer.fit_transform(X)
    # Create the destination folder first so the dump cannot fail on a missing directory.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # save count vectorizer for data preprocessing in the main app (deploy)
    joblib.dump(count_vectorizer, output_path)
    logger.info("count vectorizer trained successfully stored")
    return X_vectorized


def transform_tfidf(
    X_vectorized: object,
    output_path: str = 'D:/Repositorio/Proyecto-titanic/Notebooks/data_processed/X_tfidf.pkl',
) -> np.ndarray:
    """Apply a freshly fitted TfidfTransformer and persist the transformed data.

    Args:
      X_vectorized (object): count matrix, e.g. the output of a CountVectorizer
      output_path (str): file the TF-IDF matrix is dumped to with joblib;
        defaults to the location this notebook has always used
    Returns:
      The TF-IDF weighted matrix returned by ``TfidfTransformer.fit_transform``.
    """
    tfidf_transformer = TfidfTransformer()
    X_tfidf = tfidf_transformer.fit_transform(X_vectorized)
    # Create the destination folder first so the dump cannot fail on a missing directory.
    target_dir = os.path.dirname(output_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    # NOTE(review): only the transformed matrix is stored here; the fitted
    # transformer itself is not persisted — confirm that is intended.
    joblib.dump(X_tfidf, output_path)
    logger.info("X_tfidf trained successfully stored")
    return X_tfidf

def save_pickle(data, filename, base_dir="D:/Repositorio/Proyecto-titanic/Notebooks") -> None:
    """
    This function saves the data in a pickle file
    Args:
        data (object): data to save
        filename (str): filename, without the ".pkl" extension
        base_dir (str): directory that holds the "data_processed" folder;
            defaults to the location this notebook has always used
    Returns:
        None
    """
    target_dir = os.path.join(base_dir, "data_processed")
    # Create the folder on first use so the write cannot fail on a missing directory.
    os.makedirs(target_dir, exist_ok=True)
    filepath = os.path.join(target_dir, f"{filename}.pkl")
    with open(filepath, 'wb') as file:
        pickle.dump(data, file)

def split_train_test(
    X_tfidf: np.array, y: pd.Series, test_size: float = 0.3, random_state: int = 42
) -> tuple:
    """
    Partition features and labels into train and test subsets, store both
    subsets as pickle files and hand the four pieces back to the caller.
    Args:
      X_tfidf (np.array): vectorized features
      y (pd.Series): labels aligned with the rows of X_tfidf
      test_size (float): fraction of the samples assigned to the test subset
      random_state (int): seed forwarded to train_test_split
    Returns:
      tuple: (X_train, X_test, y_train, y_test), in that order
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X_tfidf, y, test_size=test_size, random_state=random_state
    )
    # Each subset is stored as a (features, labels) pair: "train" first, then "test".
    subsets = (
        ("train", (features_train, labels_train)),
        ("test", (features_test, labels_test)),
    )
    for subset_name, subset_pair in subsets:
        save_pickle(subset_pair, subset_name)
    logger.info("data saved successfully in pickle files")
    return features_train, features_test, labels_train, labels_test

In [49]:
# Load the topic mapping; the relative path is resolved against the current working directory.
data_json= read_idx2label(json_path="topic_mapping_1.json")
# Invert the mapping (values become keys); duplicate values would collapse into one entry.
label_tikets = {value: key for key, value in data_json.items()}
# Pull the text feature and the topic label out of the tickets frame.
X, y = data_transform(tickets)
# Replace every topic label with the key it has in the JSON mapping.
y = decode_labels_into_idx(labels=y, idx2label=label_tikets)

In [58]:
# Vectorize the raw texts; this also writes the fitted vectorizer to disk.
X_vectorized = fit_transform(X.values)

INFO:__main__:count vectorizer trained successfully stored


In [61]:
# Re-weight the counts with TF-IDF; this also writes the resulting matrix to disk.
X_tfidf = transform_tfidf(X_vectorized)

INFO:__main__:X_tfidf trained successfully stored


In [70]:
# Split with the function defaults (test_size=0.3, random_state=42); both subsets are pickled.
X_train, X_test, y_train, y_test = split_train_test(X_tfidf, y)

INFO:__main__:data saved successfully in pickle files
