In [None]:
! pip install mlflow

# MLflow

It can be very hard to keep track of all of the different machine learning experiments one has run. Luckily we have MLflow to keep track of the configurations and the results associated with them.

In [None]:
import mlflow
import pandas as pd
# Names of the input columns and of the label column used further down.
features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]
target_column = ["Survived"]

# Read the training data and replace every missing value with 0.
df = pd.read_csv('train.csv').fillna(0)

In [None]:
from typing import Dict
import operator

def get_family_id(row: pd.Series, family_id_map: Dict[str, int]) -> int:
    """Return the numeric family id for a passenger row, allocating one if needed.

    A family is identified by the text before the first comma in ``Name``
    concatenated with ``FamilySize``. Unknown families are assigned the
    current maximum id in ``family_id_map`` plus one (1 for an empty map).

    Args:
        row: Mapping-like record providing the ``"Name"`` and ``"FamilySize"`` keys.
        family_id_map: Family key -> id mapping. It is mutated in place when a
            new family is encountered.

    Returns:
        The id associated with the row's family.
    """
    # Coerce to str: an upstream fillna(0) turns a missing name into the int 0,
    # which has no .split().
    last_name = str(row["Name"]).split(",")[0]
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    if family_id not in family_id_map:
        # default=0 covers the empty mapping, so the first family gets id 1.
        family_id_map[family_id] = max(family_id_map.values(), default=0) + 1
    return family_id_map[family_id]

# FamilySize is the sum of the SibSp and Parch columns.
df["FamilySize"] = df["SibSp"] + df["Parch"]

# Shared mapping that get_family_id fills in as it walks over the rows.
famile_id_map = {}
df["FamilyId"] = df.apply(
    lambda passenger: get_family_id(passenger, famile_id_map),
    axis=1,
)
df

In [None]:
from mlflow.entities import Run, Experiment
import json
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import mlflow.sklearn

mlflow.set_experiment("Titanic")

df = df.fillna(0)

# One seed shared by the split and the tree so that reruns give the same metrics.
random_state = 42

xTrain, xTest, yTrain, yTest = train_test_split(
    df[features],
    df["Survived"],
    stratify=df["Survived"],
    random_state=random_state
)


with mlflow.start_run() as run:
    max_depth = 3
    max_features = 4
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("max_features", max_features)
    mlflow.log_param("random_state", random_state)
    decision_tree = DecisionTreeClassifier(
        max_depth=max_depth,
        max_features=max_features,
        # max_features is below the number of features, so the tree samples
        # features at random at each split; seed it for reproducible runs.
        random_state=random_state
    )
    # Fit on the training split only. Fitting on the full frame would let the
    # model see the test rows and make test_accuracy meaningless.
    decision_tree.fit(xTrain, yTrain)

    train_accuracy = decision_tree.score(xTrain, yTrain)
    mlflow.log_metric("train_accuracy", train_accuracy)

    test_accuracy = decision_tree.score(xTest, yTest)
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Store the family-id mapping next to the model so the same ids can be
    # reproduced when the model is used later.
    with open("family_id_map.json", "w") as f:
        json.dump(famile_id_map, f)
    mlflow.log_artifact("family_id_map.json")
    mlflow.sklearn.log_model(decision_tree, "model")



In [None]:
from mlflow.tracking import MlflowClient
client = MlflowClient()
# `list_experiments` was removed in MLflow 2.0; `search_experiments` is its
# replacement and returns the experiments known to the tracking backend.
client.search_experiments()

In [None]:
# The experiment id is assigned by the tracking backend, so look it up by name
# instead of hard-coding "1". `list_run_infos` was removed in MLflow 2.0;
# `search_runs` returns Run objects whose `.info` carries the same RunInfo data.
titanic_experiment = client.get_experiment_by_name("Titanic")
[run.info for run in client.search_runs(experiment_ids=[titanic_experiment.experiment_id])]