# In [None]:
import mlflow
import os
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the penguin measurements and discard every row with a missing value
penguins = pd.read_csv("../data/penguins.csv").dropna()

# Features: the two bill measurements; target: species encoded as integers
feature_columns = ['bill_length_mm', 'bill_depth_mm']
X = penguins[feature_columns]
le = preprocessing.LabelEncoder()
y = le.fit_transform(penguins['species'])

# Hold out 30% of the rows for evaluation (fixed seed for reproducibility)
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split

# Point MLflow at the local tracking server and select the experiment
experiment_name = "PENGUINS"
tracking_uri = 'http://localhost:5000/'
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name=experiment_name)

# Search space: (display name, estimator class, hyper-parameter, values to try).
# Each value becomes one single-entry keyword dict passed to the estimator.
_model_specs = [
    ("Logistic Regression", LogisticRegression, "max_iter", (1000, 2000)),
    ("Decision Tree", DecisionTreeClassifier, "max_depth", (5, 10)),
    ("K-Nearest Neighbors", KNeighborsClassifier, "n_neighbors", (5, 10)),
]

# Maps display name -> {"model": estimator class, "params": list of kwargs dicts}
models = {
    display_name: {
        "model": estimator_cls,
        "params": [{param_name: value} for value in values],
    }
    for display_name, estimator_cls, param_name, values in _model_specs
}

# Train every model / hyper-parameter combination, one MLflow run per combination.
# Each run records: the model name, its parameters, test-set accuracy, the fitted
# sklearn model (registered under the experiment name) and a scatter plot of the
# model's predictions over the full feature set.
for model_name, model_info in models.items():
    Model = model_info["model"]
    for params in model_info["params"]:
        with mlflow.start_run():
            # Instantiate and train the model
            model = Model(**params)
            model.fit(X_train, y_train)

            # Predict on the held-out split and calculate accuracy
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)

            # Log the model name, parameters and accuracy
            mlflow.log_param("model_name", model_name)
            mlflow.log_params(params)
            mlflow.log_metric("accuracy", accuracy)

            # Log the sklearn model
            mlflow.sklearn.log_model(model, "model", registered_model_name=experiment_name)

            # Build a filesystem-safe file name from the parameters instead of
            # the dict repr, whose braces/quotes/colons are invalid on some
            # platforms (e.g. "Logistic_Regression_max_iter-1000.png").
            param_tag = "_".join(f"{key}-{value}" for key, value in params.items())
            plot_path = f"{model_name}_{param_tag}.png".replace(" ", "_")

            # Plot the predicted class of every sample in feature space
            predictions = model.predict(X)
            fig, ax = plt.subplots()
            try:
                scatter = ax.scatter(X['bill_length_mm'], X['bill_depth_mm'], c=predictions)

                # legend_elements() yields one handle per distinct predicted
                # value in ascending order; label them with the species names
                # rather than the encoded integers.
                handles, _ = scatter.legend_elements()
                predicted_species = le.inverse_transform(sorted(set(predictions)))
                ax.legend(handles, predicted_species, title="Species")

                ax.set_xlabel('bill_length_mm')
                ax.set_ylabel('bill_depth_mm')
                fig.savefig(plot_path)

                # Log artifact (scatter plot)
                mlflow.log_artifact(plot_path)
            finally:
                # The plot is persisted as an MLflow artifact; close the figure
                # so figures do not accumulate across runs, and always remove
                # the temporary file even if saving or logging failed.
                plt.close(fig)
                try:
                    os.remove(plot_path)
                except FileNotFoundError:
                    pass
