# 0.0 Imports

In [None]:
from sklearn import linear_model
from sklearn import metrics
import mlflow
import pandas as pd
import plotly.express as px

# 1.0 Load dataset

In [None]:
# Read the dataset from disk into the notebook-wide DataFrame `df`.
df = pd.read_csv("data/estudante_escola.csv")

# Fail fast on missing values. An explicit raise (instead of `assert`) keeps
# the check active under `python -O` and reports which columns are affected.
missing_per_column = df.isna().sum()
if missing_per_column.any():
    raise ValueError(
        "Dataset contains missing values: "
        f"{missing_per_column[missing_per_column > 0].to_dict()}"
    )

# Print the column dtypes and non-null counts.
df.info()

## 1.1. Convert columns

In [None]:
# Cast `estudante` and `escola` to strings, one column at a time.
for id_column in ("estudante", "escola"):
    df[id_column] = df[id_column].astype(str)

# Trailing expression: display the resulting dtypes.
df.dtypes

# 2.0 Feature engineering

In [None]:
# Correlation matrix of every column except `estudante` and `escola`,
# displayed as a table shaded with the "coolwarm" colour map.
corr_input = df.drop(columns=["estudante", "escola"])
corr_input.corr().style.background_gradient(cmap="coolwarm")

# 3.0 Training

## 3.1. Connect to MLFlow

In [None]:
# Tracking-server address and the experiment that runs are recorded under.
mlflow_uri = "http://127.0.0.1:8005/"
experiment_name = "workshop_ambiental"

# Point the MLflow client at the tracking server.
mlflow.set_tracking_uri(mlflow_uri)

# Create/set the experiment; the returned object is kept so later cells can
# read its `experiment_id`.
experiment = mlflow.set_experiment(experiment_name)

In [None]:
# Display the DataFrame's column labels.
df.columns

## 3.2. Simple training

In [None]:
# Fit a linear regression of `desempenho` on `texp` and `horas`, then score it
# on the same rows it was trained on (there is no train/test split here).
y_column = ["desempenho"]
X_columns = ["texp", "horas"]

y_variable = df[y_column]
X_variables = df[X_columns]

# Training
lr_model = linear_model.LinearRegression()
lr_model.fit(X=X_variables, y=y_variable)

# Evaluating: predictions are stored on `df` so the plot below can use them.
df["predict"] = lr_model.predict(X_variables)
y_pred = df["predict"]

## Metrics
explained_variance = metrics.explained_variance_score(y_variable, y_pred)
# Mean *squared* error, matching the name it is stored and reported under
# (this previously called mean_absolute_error, i.e. it reported MAE as "MSE").
mse = metrics.mean_squared_error(y_variable, y_pred)
r2 = metrics.r2_score(y_variable, y_pred)

print("explained variance:", explained_variance)
print("MSE:", mse)
print("R2:", r2)

# Visualization: observed target vs. prediction, coloured by `escola`.
# Kept as the trailing expression so the notebook renders the figure.
px.scatter(
    df,
    x=y_column,
    y="predict",
    color="escola",
    title=f"y ~ {X_columns}: MSE = {mse}; R2 = {r2}",
)

## 3.3. Versioning training

In [None]:
# Regress `desempenho` on `horas` only, tracked as an MLflow run: the fitted
# model, its metrics and the scatter plot are all logged to that run.
y_column = ["desempenho"]
X_columns = ["horas"]

run_name = "LR_only_horas"
with mlflow.start_run(
    run_name=run_name,
    experiment_id=experiment.experiment_id,
    tags={
        # NOTE(review): this run uses a single feature (`horas`) yet is
        # tagged all_variables="True" -- confirm the tag is intended.
        "all_variables": "True",
    },
    description="Testando o MLFlow.",
) as run:
    print("starting training")

    # Setup variables
    y_variable = df[y_column]
    X_variables = df[X_columns]

    # Training
    lr_model = linear_model.LinearRegression()
    lr_model.fit(X=X_variables, y=y_variable)
    mlflow.sklearn.log_model(lr_model, "lr_model")

    # Evaluating (on the training rows; there is no hold-out split here)
    df["predict"] = lr_model.predict(X_variables)
    y_pred = df["predict"]

    ## Calculating and logging metrics
    model_metrics = {
        "explained_variance": metrics.explained_variance_score(y_variable, y_pred),
        # Mean *squared* error, so the value matches its "mse" key
        # (this previously logged mean_absolute_error under that name).
        "mse": metrics.mean_squared_error(y_variable, y_pred),
        "r2": metrics.r2_score(y_variable, y_pred),
    }

    for metric, value in model_metrics.items():
        # Print this metric's own value, not a leftover variable from an
        # earlier cell.
        print(f"{metric}:", value)
        mlflow.log_metric(metric, value)

    # Visualization: the title reports this run's metrics rather than the
    # `mse`/`r2` globals computed by the previous cell.
    fig = px.scatter(
        df,
        x=y_column,
        y="predict",
        color="escola",
        title=(
            f"y ~ {X_columns}: MSE = {model_metrics['mse']}; "
            f"R2 = {model_metrics['r2']}"
        ),
    )
    mlflow.log_figure(fig, "scatter.html")