In [1]:
pip install numpy pandas scikit-learn mlflow matplotlib

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-3.0.5-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.5-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.5
[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Robust Integrated Covid Trainer (MLflow server fallback safe)

- MLflow 서버가 없으면 자동으로 file:/root/mlruns 로 폴백 (레지스트리 비활성)
- CSV/TSV/Excel 로더(구분자/인코딩 견고화 + 잘못된 경로 자동 보정)
- 최소 FE(시간/주기 + lag/roll/diff/pct) 자동 생성 -> 미래 피처 화이트리스트 통과
- 최근 N일 고정 평가 + (옵션)TS CV
- 동적 재귀 예측(H-step)
- 베스트 모델 저장(레지스트리 가능 시 등록, 불가 시 아티팩트 저장)

pip install numpy pandas scikit-learn mlflow matplotlib
# (optional) pip install xgboost
"""

import os, sys, glob, json, math, argparse, tempfile
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature


# ========================= MLflow Safe Init =========================

def setup_mlflow_or_fallback(experiment_name: str, preferred_uri: Optional[str] = None) -> bool:
    """
    1) preferred_uri (또는 환경변수 MLFLOW_TRACKING_URI)가 있으면 먼저 시도
    2) 연결/권한/유효성 실패 시 file:/root/mlruns 로 폴백
    반환값: registry_enabled (True면 레지스트리 사용 가능, False면 아티팩트 저장만)
    """
    uri = preferred_uri or os.getenv("MLFLOW_TRACKING_URI", "").strip() or None

    def try_uri(u: str) -> bool:
        try:
            mlflow.set_tracking_uri(u)
            # 간단한 호출로 연결성 점검
            _ = mlflow.get_experiment_by_name(experiment_name)
            mlflow.set_experiment(experiment_name)
            return True
        except Exception:
            return False

    # 1) 사용자가 지정한 URI 먼저 시도 (HTTP/HTTPS/File 모두 허용)
    if uri and try_uri(uri):
        # 레지스트리 사용 가능 여부 판정:
        # - HTTP(S)인 경우 대체로 가능, file: 인 경우 대체로 불가
        reg_enabled = uri.startswith("http")
        if not reg_enabled:
            print(f"[MLflow] Using '{uri}' (file store). Model registry disabled.")
        else:
            print(f"[MLflow] Connected to '{uri}'. Model registry enabled.")
        return reg_enabled

    # 2) 폴백: 로컬 파일 스토어
    local_dir = "/root/mlruns"
    os.makedirs(local_dir, exist_ok=True)
    local_uri = f"file:{local_dir}"
    mlflow.set_tracking_uri(local_uri)
    mlflow.set_experiment(experiment_name)
    print(f"[MLflow] Fallback to {local_uri}. Model registry disabled.")
    return False


# ========================= Robust Loader =========================

DEFAULT_CANDIDATES = [
    "/mnt/data/covid_processed.csv",
    "./covid_processed.csv",
    "./data/covid_processed.csv",
    "./dataset/covid_processed.csv",
]

def _read_head_bytes(path: str, nbytes: int = 2048) -> str:
    with open(path, "rb") as f:
        raw = f.read(nbytes)
    for enc in ("utf-8","utf-8-sig","cp949","euc-kr","latin1"):
        try:
            return raw.decode(enc, errors="ignore")
        except Exception:
            continue
    return raw.decode("utf-8", errors="ignore")

def _looks_like_json(text_head: str) -> bool:
    s = text_head.strip()
    return (s.startswith("{") and ":" in s) or (s.startswith("[") and "{" in s)

def resolve_table_path(path: str) -> str:
    """경로가 파일이 아니거나 JSON이어도 CSV/TSV/Excel을 최대한 찾아 반환."""
    if os.path.exists(path):
        if os.path.isdir(path):
            for pat in ("*.csv","*.tsv","*.xlsx","*.xls"):
                m = glob.glob(os.path.join(path, pat))
                if m: return m[0]
            raise FileNotFoundError(f"No table file found in directory: {path}")
        else:
            head = _read_head_bytes(path)
            if not _looks_like_json(head):
                return path
            # JSON이면 계속 폴백 진행

    for cand in DEFAULT_CANDIDATES:
        if os.path.exists(cand):
            head = _read_head_bytes(cand)
            if not _looks_like_json(head):
                print(f"[auto] Using fallback file: {cand}")
                return cand

    for pat in ("*.csv","*.tsv","*.xlsx","*.xls"):
        for m in glob.glob(pat):
            head = _read_head_bytes(m)
            if not _looks_like_json(head):
                print(f"[auto] Using discovered file: {m}")
                return m

    raise FileNotFoundError(
        f"Could not locate a valid table file. Tried '{path}' and fallbacks: {DEFAULT_CANDIDATES}"
    )

def read_table_robust(path: str) -> pd.DataFrame:
    ext = os.path.splitext(path)[1].lower()
    if ext in {".xlsx",".xls"}:
        return pd.read_excel(path)

    # auto-sep
    try:
        df = pd.read_csv(path, sep=None, engine="python")
        if len(df.columns) > 1: return df
    except Exception:
        pass

    # enc × sep 재시도
    for enc in ("utf-8","utf-8-sig","cp949","euc-kr","latin1"):
        for sep in ("\t",",",";","|"):
            try:
                df = pd.read_csv(path, sep=sep, engine="python", encoding=enc)
                if len(df.columns) > 1: return df
            except Exception:
                continue

    # 단일 컬럼 분해 시도
    try:
        df = pd.read_csv(path, engine="python", encoding="utf-8", header=None)
        if len(df.columns) == 1:
            series = df.iloc[:,0].astype(str)
            if series.str.contains("\t").any():
                return series.str.split("\t", expand=True)
            if series.str.contains(",").any():
                return series.str.split(",", expand=True)
    except Exception:
        pass

    head = _read_head_bytes(path)
    raise ValueError(f"Failed to parse table file. Head: {head[:200]} ...")


# ========================= Minimal FE =========================

DATE_FEATS = ["dow_sin","dow_cos","weekofyear","dayofyear","month_sin","month_cos"]

def add_time_features(df: pd.DataFrame, date_col="date") -> pd.DataFrame:
    df = df.copy()
    if date_col not in df.columns:
        for cand in ["Date","ds","DATE","날짜"]:
            if cand in df.columns:
                df.rename(columns={cand:"date"}, inplace=True)
                break
    if "date" not in df.columns:
        raise ValueError("Missing 'date' column after rename attempts.")
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    if df["date"].isna().all():
        raise ValueError("All 'date' values failed to parse. Check date format.")
    df["dow"] = df["date"].dt.dayofweek
    df["weekofyear"] = df["date"].dt.isocalendar().week.astype(int)
    df["dayofyear"] = df["date"].dt.dayofyear
    df["month"] = df["date"].dt.month
    df["dow_sin"] = np.sin(2*np.pi*df["dow"]/7);  df["dow_cos"] = np.cos(2*np.pi*df["dow"]/7)
    df["month_sin"] = np.sin(2*np.pi*df["month"]/12); df["month_cos"] = np.cos(2*np.pi*df["month"]/12)
    return df

def add_lag_roll(df: pd.DataFrame, target: str, lags=(1,7,14), rolls=(7,14,28)) -> pd.DataFrame:
    if target not in df.columns:
        raise ValueError(f"Target '{target}' not found.")
    df = df.sort_values("date").reset_index(drop=True)
    for l in lags:
        df[f"{target}_lag{l}"] = df[target].shift(l)
    for w in rolls:
        df[f"{target}_rollmean{w}"] = df[target].shift(1).rolling(w, min_periods=1).mean()
        df[f"{target}_rollstd{w}"]  = df[target].shift(1).rolling(w, min_periods=1).std()
    df[f"{target}_diff1"] = df[target].diff(1)
    df[f"{target}_pct"]   = df[target].pct_change().replace([np.inf,-np.inf], np.nan)
    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = df[num_cols].interpolate("linear", limit_direction="both")
    df = df.bfill().ffill()
    return df

def select_future_aware_features(df: pd.DataFrame, target: str) -> List[str]:
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    allowed = []
    for c in num_cols:
        if c in DATE_FEATS or c.startswith(f"{target}_"):
            allowed.append(c)
    allowed = [c for c in allowed if c not in {target,"y_next"} and c in df.columns]
    return sorted(list(dict.fromkeys(allowed)))


# ========================= Plots =========================

def _lineplot(dates, series, title, fname):
    fig = plt.figure(); plt.plot(dates, series)
    plt.title(title); plt.xlabel("date"); plt.ylabel("value")
    mlflow.log_figure(fig, fname); plt.close(fig)

def _dualplot(dates, y_true, y_pred, title, fname):
    fig = plt.figure()
    plt.plot(dates, y_true, label="actual")
    plt.plot(dates, y_pred, label="pred")
    plt.legend(); plt.title(title); plt.xlabel("date"); plt.ylabel("value")
    mlflow.log_figure(fig, fname); plt.close(fig)

def _residplot(y_true, y_pred, title, fname):
    res = np.array(y_true) - np.array(y_pred)
    fig = plt.figure(); plt.scatter(y_pred, res, s=8); plt.axhline(0)
    plt.title(title); plt.xlabel("pred"); plt.ylabel("residuals")
    mlflow.log_figure(fig, fname); plt.close(fig)


# ========================= Dynamic Forecast =========================

def _roll_stats(seq: List[float], w: int) -> Tuple[float,float]:
    arr = np.array(seq[-w:], dtype=float)
    return float(np.mean(arr)), float(np.std(arr, ddof=0))

def _date_feats(ts: pd.Timestamp) -> Dict[str,float]:
    dow = ts.dayofweek
    weekofyear = int(ts.isocalendar().week)
    dayofyear = int(ts.timetuple().tm_yday)
    month = int(ts.month)
    return {
        "dow_sin": math.sin(2*math.pi*dow/7.0),
        "dow_cos": math.cos(2*math.pi*dow/7.0),
        "weekofyear": float(weekofyear),
        "dayofyear": float(dayofyear),
        "month_sin": math.sin(2*math.pi*month/12.0),
        "month_cos": math.cos(2*math.pi*month/12.0),
    }

def recursive_forecast_dynamic(df_feat: pd.DataFrame, target: str,
                               feat_list: List[str], model, horizon: int,
                               lookbacks: List[int], rolls: List[int]) -> pd.DataFrame:
    df = df_feat.sort_values("date").copy()
    last_date = pd.to_datetime(df["date"].max())
    hist = df[target].astype(float).tolist()
    preds, dates = [], []
    for i in range(1, horizon+1):
        nd = last_date + pd.Timedelta(days=i)
        row: Dict[str,float] = {}
        row.update(_date_feats(nd))
        for l in lookbacks:
            row[f"{target}_lag{l}"] = float(hist[-l]) if len(hist) >= l else float(hist[-1])
        for w in rolls:
            m,s = _roll_stats(hist, min(len(hist), w))
            row[f"{target}_rollmean{w}"] = m
            row[f"{target}_rollstd{w}"] = s
        if len(hist) >= 2:
            diff1 = hist[-1] - hist[-2]
            base = hist[-2] if hist[-2] != 0 else 1e-9
            pct = (hist[-1] - base) / max(abs(base), 1e-9)
        else:
            diff1, pct = 0.0, 0.0
        row[f"{target}_diff1"] = float(diff1)
        row[f"{target}_pct"]   = float(pct)
        x = np.array([[row.get(c,0.0) for c in feat_list]], dtype=np.float32)
        yhat = float(model.predict(x)[0])
        preds.append(yhat); dates.append(nd); hist.append(yhat)
    return pd.DataFrame({"date": dates, "yhat": preds})


# ========================= Metrics =========================

def _metrics(y_true, y_pred) -> Dict[str,float]:
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae  = float(mean_absolute_error(y_true, y_pred))
    r2   = float(r2_score(y_true, y_pred))
    mape = float(np.mean(np.abs((y_true - y_pred)/np.maximum(1e-9, np.abs(y_true))))*100.0)
    return {"rmse": rmse, "mae": mae, "r2": r2, "mape": mape}


# ========================= Trainer =========================

@dataclass
class TrainConfig:
    experiment_name: str = "covid_model_training_integrated"
    target_col: str = "new_cases"
    test_days: int = 60
    horizon: int = 30
    lookbacks: List[int] = None
    rolls: List[int] = None
    do_time_series_cv: bool = True
    cv_splits: int = 3
    random_state: int = 42

    def __post_init__(self):
        if self.lookbacks is None: self.lookbacks = [1,7,14]
        if self.rolls is None:     self.rolls     = [7,14,28]

class IntegratedCovidTrainer:
    def __init__(self, cfg: Optional[TrainConfig] = None, tracking_uri: Optional[str] = None):
        self.cfg = cfg or TrainConfig()
        # 안전한 초기화 (서버 없으면 파일 모드로 자동 폴백)
        self.registry_enabled = setup_mlflow_or_fallback(self.cfg.experiment_name, tracking_uri)

        self.models: Dict[str,Any] = {
            "random_forest": RandomForestRegressor(random_state=self.cfg.random_state, n_estimators=400, max_depth=12, min_samples_leaf=2, n_jobs=-1),
            "gradient_boosting": GradientBoostingRegressor(random_state=self.cfg.random_state, n_estimators=300, learning_rate=0.05, max_depth=3),
            "linear_regression": LinearRegression(),
            "ridge_regression": Ridge(random_state=self.cfg.random_state),
        }
        try:
            from xgboost import XGBRegressor
            self.models["xgboost"] = XGBRegressor(
                n_estimators=600, max_depth=6, learning_rate=0.05, subsample=0.9,
                colsample_bytree=0.9, reg_lambda=1.0, tree_method="hist",
                random_state=self.cfg.random_state, n_jobs=-1
            )
        except Exception:
            pass

    def train(self, df_feat: pd.DataFrame, target: str) -> Dict[str,Any]:
        with mlflow.start_run(run_name="model_training_integrated"):
            if "date" not in df_feat.columns:
                raise ValueError("features_df must include 'date' column")

            feat_list = select_future_aware_features(df_feat, target)
            if not feat_list:
                raise ValueError("No future-aware features. FE must add lag/roll/diff/pct and date features.")
            mlflow.log_text("\n".join(feat_list), "features_used.txt")

            df = df_feat.sort_values("date").copy()
            df["y_next"] = df[target].shift(-1)
            df = df.dropna(subset=["y_next"]).reset_index(drop=True)
            X_all = df[feat_list].astype(np.float32)
            y_all = df["y_next"].astype(np.float32)
            dt_all = pd.to_datetime(df["date"]).reset_index(drop=True)

            cutoff = dt_all.max() - pd.Timedelta(days=self.cfg.test_days-1)
            train_mask = dt_all < cutoff
            test_mask  = ~train_mask
            X_train, X_test = X_all[train_mask].reset_index(drop=True), X_all[test_mask].reset_index(drop=True)
            y_train, y_test = y_all[train_mask].reset_index(drop=True), y_all[test_mask].reset_index(drop=True)
            dt_train, dt_test = dt_all[train_mask].reset_index(drop=True), dt_all[test_mask].reset_index(drop=True)

            mlflow.log_params({
                "target": target,
                "test_days": self.cfg.test_days,
                "horizon": self.cfg.horizon,
                "lookbacks": ",".join(map(str,self.cfg.lookbacks)),
                "rolls": ",".join(map(str,self.cfg.rolls)),
                "train_rows": len(X_train),
                "test_rows": len(X_test),
                "feature_count": len(feat_list),
                "models": list(self.models.keys()),
            })

            results: Dict[str,Any] = {}
            for name, model in self.models.items():
                try:
                    model.fit(X_train, y_train)
                    ytr = model.predict(X_train)
                    yte = model.predict(X_test)
                    m_tr = _metrics(y_train, ytr)
                    m_te = _metrics(y_test, yte)
                    for k,v in m_tr.items(): mlflow.log_metric(f"{name}_train_{k}", v)
                    for k,v in m_te.items(): mlflow.log_metric(f"{name}_test_{k}",  v)

                    if self.cfg.do_time_series_cv:
                        try:
                            tscv = TimeSeriesSplit(n_splits=self.cfg.cv_splits)
                            cv = cross_val_score(model, X_train, y_train, cv=tscv, scoring="neg_root_mean_squared_error")
                            mlflow.log_metric(f"{name}_cv_rmse_mean", float((-cv).mean()))
                            mlflow.log_metric(f"{name}_cv_rmse_std",  float((-cv).std()))
                        except Exception:
                            pass

                    _dualplot(dt_test, y_test, yte, f"{name}: actual vs pred (test)", f"{name}_test_pred.png")
                    _residplot(y_test, yte, f"{name}: residuals (test)", f"{name}_test_resid.png")

                    results[name] = {"model": model, "metrics": {"train": m_tr, "test": m_te}, "pred": {"train": ytr, "test": yte}}
                except Exception as e:
                    mlflow.log_text(str(e), f"{name}_error.txt")

            if not results:
                raise RuntimeError("No models trained successfully.")

            best = min(results, key=lambda n: results[n]["metrics"]["test"]["rmse"])
            best_model = results[best]["model"]
            mlflow.log_param("best_model_name", best)
            mlflow.log_metric("best_model_test_rmse", results[best]["metrics"]["test"]["rmse"])
            mlflow.log_metric("best_model_test_r2",   results[best]["metrics"]["test"]["r2"])

            # comparison
            comp = {}
            for n,r in results.items():
                comp[n] = {
                    "test_rmse": r["metrics"]["test"]["rmse"],
                    "test_mae":  r["metrics"]["test"]["mae"],
                    "test_r2":   r["metrics"]["test"]["r2"],
                    "train_rmse": r["metrics"]["train"]["rmse"],
                    "train_r2":   r["metrics"]["train"]["r2"],
                }
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                json.dump(comp, f, indent=2)
                mlflow.log_artifact(f.name, "analysis/model_comparison.json"); tmp=f.name
            os.remove(tmp)

            # predictions
            dfp = pd.DataFrame({
                "split": ["train"]*len(y_train) + ["test"]*len(y_test),
                "actual": np.concatenate([y_train.values, y_test.values]),
                "predicted": np.concatenate([results[best]["pred"]["train"], results[best]["pred"]["test"]]),
            })
            dfp["residual"] = dfp["actual"] - dfp["predicted"]
            dfp["abs_residual"] = np.abs(dfp["residual"])
            with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
                dfp.to_csv(f.name, index=False)
                mlflow.log_artifact(f.name, "results/predictions_detailed.csv"); tmp=f.name
            os.remove(tmp)

            # importance
            try:
                if hasattr(best_model,"feature_importances_"):
                    imp = pd.DataFrame({"feature": X_train.columns, "importance": best_model.feature_importances_}) \
                          .sort_values("importance", ascending=False)
                    with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
                        imp.to_csv(f.name, index=False)
                        mlflow.log_artifact(f.name, "analysis/feature_importance.csv"); tmp=f.name
                    os.remove(tmp)
            except Exception:
                pass

            # samples
            tr = X_train.head(20).copy(); tr["target"] = y_train.head(20).values; tr["split"]="train"
            te = X_test.head(10).copy();  te["target"] = y_test.head(10).values;  te["split"]="test"
            samp = pd.concat([tr,te], ignore_index=True)
            with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
                samp.to_csv(f.name, index=False)
                mlflow.log_artifact(f.name, "data/training_samples.csv"); tmp=f.name
            os.remove(tmp)

            # register or artifact-only
            try:
                sig = infer_signature(X_train, best_model.predict(X_train))
                if self.registry_enabled:
                    info = mlflow.sklearn.log_model(
                        sk_model=best_model, name="best_model",
                        signature=sig, input_example=X_train.head(3),
                        registered_model_name="covid_prediction_model",
                        serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
                    )
                    mlflow.log_text(info.model_uri, "model_uri.txt")
                else:
                    info = mlflow.sklearn.log_model(
                        sk_model=best_model, name="best_model",
                        signature=sig, input_example=X_train.head(3),
                    )
                    mlflow.log_text(info.model_uri, "model_uri_artifact_only.txt")
            except Exception as e2:
                mlflow.log_text(f"model_save_failed: {e2}", "model_save_failed.txt")

            # forecast
            try:
                df_fore = recursive_forecast_dynamic(df_feat, target, list(X_train.columns),
                                                     best_model, self.cfg.horizon,
                                                     self.cfg.lookbacks, self.cfg.rolls)
                with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
                    df_fore.to_csv(f.name, index=False)
                    mlflow.log_artifact(f.name, "forecast/forecast_future.csv"); tmp=f.name
                os.remove(tmp)
                _lineplot(df_fore["date"], df_fore["yhat"], f"{best} forecast (future)", f"{best}_forecast.png")
            except Exception as e:
                mlflow.log_text(str(e), "forecast_error.txt")

            # summary
            summ = {
                "timestamp": datetime.now().isoformat(),
                "total_models_trained": len(results),
                "best_model": best,
                "best_performance": results[best]["metrics"]["test"],
            }
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
                json.dump(summ, f, indent=2)
                mlflow.log_artifact(f.name, "reports/training_summary.json"); tmp=f.name
            os.remove(tmp)

            return {"best_model": best, "metrics": results[best]["metrics"]}


# ========================= CLI =========================

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--features_path", type=str, required=True, help="CSV/TSV/Excel 파일 경로 또는 디렉터리")
    ap.add_argument("--target", type=str, default="new_cases")
    ap.add_argument("--tracking_uri", type=str, default=None, help="예: http://mlflow:8080 (없으면 자동 폴백)")
    ap.add_argument("--test_days", type=int, default=60)
    ap.add_argument("--horizon", type=int, default=30)
    args = ap.parse_args()

    # 0) 경로 자동 복구
    table_path = resolve_table_path(args.features_path)
    print(f"[info] Using table file: {table_path}")

    # 1) 로드
    df = read_table_robust(table_path)

    # 2) date/target 확인 및 FE 보강
    df = add_time_features(df, "date")
    if args.target not in df.columns:
        raise ValueError(f"Target '{args.target}' not found. Columns: {list(df.columns)[:30]}")
    df = add_lag_roll(df, args.target, lags=(1,7,14), rolls=(7,14,28))

    # 3) 학습
    cfg = TrainConfig(target_col=args.target, test_days=args.test_days, horizon=args.horizon)
    trainer = IntegratedCovidTrainer(cfg, tracking_uri=args.tracking_uri)
    res = trainer.train(df, target=args.target)
    print("DONE. Best:", res["best_model"], "Test metrics:", res["metrics"]["test"])


if __name__ == "__main__":
    main()


[auto] Using fallback file: ./covid_processed.csv
[info] Using table file: ./covid_processed.csv


2025/09/26 08:09:54 INFO mlflow.tracking.fluent: Experiment with name 'covid_model_training_integrated' does not exist. Creating a new experiment.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



[MLflow] Fallback to file:/root/mlruns. Model registry disabled.


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


DONE. Best: random_forest Test metrics: {'rmse': 0.001485120418831939, 'mae': 0.0014851204188319391, 'r2': 0.0, 'mape': 148512046.0834014}


