# EML 3.2 - Instrumentação de um experimento para tracking no MLflow

## AAPL stock price forecast

### Import libs

In [10]:
import itertools
import os
import re

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
import yfinance as yf
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Select the MLflow experiment that every run started later in this notebook is logged under.
mlflow.set_experiment("eml3-MLflow_tracking")

2024/09/15 14:20:56 INFO mlflow.tracking.fluent: Experiment with name 'eml3-MLflow_tracking' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/letic/OneDrive/Documentos/Leticia/01.Academic/2.Post-graduation/MBA%20Machine%20Learning%20in%20Production/Engenharia%20de%20Machine%20Learning/EML3/EML3.2/notebooks/mlruns/149487568618316920', creation_time=1726420856520, experiment_id='149487568618316920', last_update_time=1726420856520, lifecycle_stage='active', name='eml3-MLflow_tracking', tags={}>

### Download and preprocess the data

In [11]:
# Download daily prices for the ticker, derive the features, and persist them as CSV.
ticker = 'AAPL'
start_date = '2020-01-01'
end_date = '2023-01-01'

# auto_adjust=False keeps the separate 'Adj Close' column that the feature
# engineering below relies on; newer yfinance releases default to
# auto_adjust=True, which omits that column.
raw_prices = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

# Work on a cleaned copy so re-running this cell never depends on earlier mutations.
data = raw_prices.dropna()
data['Daily Return'] = data['Adj Close'].pct_change()
# pct_change leaves the first row without a return, so drop it.
data = data.dropna()
# Trailing 7-row mean; the first 6 rows are NaN and are filtered when the CSV is loaded.
data['7-Day MA'] = data['Adj Close'].rolling(window=7).mean()

processed_data_path = '../data/processed/processed_data.csv'
# Create the target directory first so to_csv does not fail on a fresh checkout.
os.makedirs(os.path.dirname(processed_data_path), exist_ok=True)
# index=False means the date index is not written to the CSV.
data.to_csv(processed_data_path, index=False)
print(f'Dados salvos em: {processed_data_path}')

[*********************100%***********************]  1 of 1 completed


### Load processed data

In [13]:
# Reload the processed CSV and build the feature matrix / target vector.
data = pd.read_csv(processed_data_path)

# Keep only rows where the moving average is defined (the leading rolling-window rows are NaN).
has_moving_average = data['7-Day MA'].notna()
X = data.loc[has_moving_average, ['7-Day MA']]
# Target rows are aligned with X by using the same row mask.
# NOTE(review): the 7-day mean is computed from 'Adj Close' itself, so the
# feature appears to include the value being predicted; confirm this is intended.
y = data.loc[has_moving_average, 'Adj Close']

### Train/test split and model training

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles rows by default, so test rows are
# interleaved in time with training rows; confirm this is acceptable for price data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
def safe_filename(base, params):
    """Build a ``.pkl`` file name from a base name and a hyperparameter mapping.

    Each parameter is rendered as ``key-value`` and the pairs are joined with
    underscores. Any character in the joined parameter text that is not a word
    character, hyphen, underscore, dot or space is replaced by ``_``. The
    ``base`` part is inserted as-is (it is not sanitized).

    Args:
        base: Prefix of the file name (e.g. the model's display name).
        params: Mapping of parameter name to value; may be empty.

    Returns:
        The string ``"<base>_<sanitized params>.pkl"``.
    """
    pairs = []
    for key, value in params.items():
        pairs.append(f"{key}-{value}")
    joined = "_".join(pairs)
    # Replace characters that are invalid in file names with '_'.
    sanitized = re.sub(r'[^\w\-_\. ]', '_', joined)
    return "{}_{}.pkl".format(base, sanitized)

def grid_search(models_and_params):
    """Fit every hyperparameter combination of every model, one MLflow run per fit.

    For each ``(estimator, param_grid)`` entry the Cartesian product of the grid
    values is enumerated. Every combination is fitted on the notebook-level
    ``X_train``/``y_train``, evaluated on ``X_test``/``y_test``, logged to MLflow
    (parameters, R2/MSE/MAE metrics and the fitted model) and also dumped with
    joblib under ``../models``.

    Args:
        models_and_params: Mapping of display name to an
            ``(estimator, param_grid)`` tuple, where ``param_grid`` maps
            parameter names to sequences of candidate values. An empty grid
            produces a single run with the estimator's current parameters.

    Returns:
        None. Results are reported through MLflow, the saved model files and
        the printed summary lines.
    """
    models_dir = '../models'
    # joblib.dump does not create missing directories, so make sure it exists.
    os.makedirs(models_dir, exist_ok=True)

    for model_name, (model, param_grid) in models_and_params.items():
        param_keys = list(param_grid.keys())

        # product() over an empty grid yields one empty combination.
        for param_combination in itertools.product(*param_grid.values()):
            params = dict(zip(param_keys, param_combination))

            with mlflow.start_run():
                mlflow.log_param("model", model_name)
                mlflow.log_params(params)

                # Fit a fresh copy so the caller's estimator is never mutated
                # and no fitted state is shared between runs.
                candidate = clone(model).set_params(**params)
                candidate.fit(X_train, y_train)

                # For the regressors used here, score() is the R^2 on the test split.
                score = candidate.score(X_test, y_test)
                predictions = candidate.predict(X_test)
                mse = mean_squared_error(y_test, predictions)
                mae = mean_absolute_error(y_test, predictions)

                mlflow.log_metrics({"R2": score, "MSE": mse, "MAE": mae})

                # Keep a local copy next to the MLflow-logged model.
                model_path = f'{models_dir}/{safe_filename(model_name, params)}'
                joblib.dump(candidate, model_path)
                mlflow.sklearn.log_model(candidate, "model")

                print(f'{model_name} salvo em: {model_path} com hiperparâmetros: {params}')
                print(f'{model_name} - Hiperparâmetros: {params}, MSE: {mse}, MAE: {mae}, R2: {score}')

In [16]:
# Candidate estimators and the hyperparameter grid explored for each one.
# Each value is an (estimator, param_grid) tuple; an empty grid means a single
# run with the estimator's defaults. The tree-based estimators get a fixed
# random_state so that re-running the notebook reproduces the same metrics.
models_and_params = {
    "Linear Regression": (LinearRegression(), {}),
    "Decision Tree": (
        DecisionTreeRegressor(random_state=42),
        {"max_depth": [3, 5, 10], "min_samples_split": [2, 10]}
    ),
    "Random Forest": (
        RandomForestRegressor(random_state=42),
        {"n_estimators": [50, 100], "max_depth": [5, 10], "min_samples_split": [2, 5]}
    )
}

In [17]:
# Train and log every model/hyperparameter combination defined in models_and_params.
grid_search(models_and_params)



Linear Regression salvo em: ../models/Linear Regression_.pkl com hiperparâmetros: {}
Linear Regression - Hiperparâmetros: {}, MSE: 12.215358153037984, MAE: 2.627331747796991, R2: 0.9868074223289522




Decision Tree salvo em: ../models/Decision Tree_max_depth-3_min_samples_split-2.pkl com hiperparâmetros: {'max_depth': 3, 'min_samples_split': 2}
Decision Tree - Hiperparâmetros: {'max_depth': 3, 'min_samples_split': 2}, MSE: 32.38842101871362, MAE: 4.581575197652446, R2: 0.9650205295187593




Decision Tree salvo em: ../models/Decision Tree_max_depth-3_min_samples_split-10.pkl com hiperparâmetros: {'max_depth': 3, 'min_samples_split': 10}
Decision Tree - Hiperparâmetros: {'max_depth': 3, 'min_samples_split': 10}, MSE: 32.38842101871362, MAE: 4.581575197652446, R2: 0.9650205295187593




Decision Tree salvo em: ../models/Decision Tree_max_depth-5_min_samples_split-2.pkl com hiperparâmetros: {'max_depth': 5, 'min_samples_split': 2}
Decision Tree - Hiperparâmetros: {'max_depth': 5, 'min_samples_split': 2}, MSE: 16.124313894243375, MAE: 3.032703784170266, R2: 0.9825857530514358




Decision Tree salvo em: ../models/Decision Tree_max_depth-5_min_samples_split-10.pkl com hiperparâmetros: {'max_depth': 5, 'min_samples_split': 10}
Decision Tree - Hiperparâmetros: {'max_depth': 5, 'min_samples_split': 10}, MSE: 16.221489674382592, MAE: 3.042636420007397, R2: 0.9824808032815502




Decision Tree salvo em: ../models/Decision Tree_max_depth-10_min_samples_split-2.pkl com hiperparâmetros: {'max_depth': 10, 'min_samples_split': 2}
Decision Tree - Hiperparâmetros: {'max_depth': 10, 'min_samples_split': 2}, MSE: 20.730018289901338, MAE: 3.3959805360091817, R2: 0.9776115957481157




Decision Tree salvo em: ../models/Decision Tree_max_depth-10_min_samples_split-10.pkl com hiperparâmetros: {'max_depth': 10, 'min_samples_split': 10}
Decision Tree - Hiperparâmetros: {'max_depth': 10, 'min_samples_split': 10}, MSE: 18.843866755830682, MAE: 3.205999266435142, R2: 0.9796486379945115




Random Forest salvo em: ../models/Random Forest_n_estimators-50_max_depth-5_min_samples_split-2.pkl com hiperparâmetros: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 2}
Random Forest - Hiperparâmetros: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 2}, MSE: 14.630386478527823, MAE: 2.852260432667545, R2: 0.9841991935433




Random Forest salvo em: ../models/Random Forest_n_estimators-50_max_depth-5_min_samples_split-5.pkl com hiperparâmetros: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 5}
Random Forest - Hiperparâmetros: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 5}, MSE: 14.88135193879702, MAE: 2.889845707051707, R2: 0.9839281510338728




Random Forest salvo em: ../models/Random Forest_n_estimators-50_max_depth-10_min_samples_split-2.pkl com hiperparâmetros: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2}
Random Forest - Hiperparâmetros: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2}, MSE: 19.181733262555817, MAE: 3.2504115015384683, R2: 0.9792837424198936




Random Forest salvo em: ../models/Random Forest_n_estimators-50_max_depth-10_min_samples_split-5.pkl com hiperparâmetros: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 5}
Random Forest - Hiperparâmetros: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 5}, MSE: 18.403011059320235, MAE: 3.1649245437789957, R2: 0.9801247618170857




Random Forest salvo em: ../models/Random Forest_n_estimators-100_max_depth-5_min_samples_split-2.pkl com hiperparâmetros: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2}
Random Forest - Hiperparâmetros: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 2}, MSE: 15.01038565664444, MAE: 2.8858007217117323, R2: 0.9837887947150846




Random Forest salvo em: ../models/Random Forest_n_estimators-100_max_depth-5_min_samples_split-5.pkl com hiperparâmetros: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 5}
Random Forest - Hiperparâmetros: {'n_estimators': 100, 'max_depth': 5, 'min_samples_split': 5}, MSE: 15.04221063467273, MAE: 2.8967043012186573, R2: 0.9837544237626118




Random Forest salvo em: ../models/Random Forest_n_estimators-100_max_depth-10_min_samples_split-2.pkl com hiperparâmetros: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2}
Random Forest - Hiperparâmetros: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2}, MSE: 19.362915687942237, MAE: 3.2050385777158206, R2: 0.9790880655359584




Random Forest salvo em: ../models/Random Forest_n_estimators-100_max_depth-10_min_samples_split-5.pkl com hiperparâmetros: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5}
Random Forest - Hiperparâmetros: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5}, MSE: 18.028861731026506, MAE: 3.129632211747619, R2: 0.9805288428118667
