In [1]:
import pathlib
import pickle

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import scipy.sparse
import seaborn as sns
import xgboost as xgb
from prefect import flow, task
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
def load_data(path="dataset/flight_dataset.csv"):
    """Load the flight dataset and derive the total flight duration in minutes.

    Parameters
    ----------
    path : str or path-like, default "dataset/flight_dataset.csv"
        CSV file to read. It must provide the columns ``Airline``, ``Source``,
        ``Destination``, ``Total_Stops``, ``Duration_hours`` and
        ``Duration_min``; any other column is discarded.

    Returns
    -------
    pandas.DataFrame
        The six columns listed above, in that order, followed by ``duration``
        (``Duration_hours * 60 + Duration_min``). Note that the returned
        ``Duration_hours`` column has itself been multiplied by 60, i.e. it
        holds minutes, not hours.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    categorical = ["Airline", "Source", "Destination"]
    numerical = ["Total_Stops", "Duration_hours", "Duration_min"]

    raw = pd.read_csv(path)
    # Work on an explicit copy: assigning into a column subset of `raw` is
    # the chained-assignment pattern pandas flags with SettingWithCopyWarning.
    df = raw[categorical + numerical].copy()

    # Kept for compatibility with the previous output: the hours column is
    # overwritten with its value in minutes before being summed.
    df["Duration_hours"] = df["Duration_hours"] * 60
    df["duration"] = df["Duration_hours"] + df["Duration_min"]
    return df

In [9]:
def data_transformation(df, train_size=8000, return_dv=False):
    """Vectorize the features and split them positionally into train/test sets.

    The target is the ``duration`` column. ``Duration_hours``,
    ``Duration_min`` and ``duration`` are removed from the features; every
    remaining column is converted to a record dict and encoded with a
    ``DictVectorizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``duration``, ``Duration_hours``, ``Duration_min``
        and the feature columns.
    train_size : int, default 8000
        Number of leading rows used for training; the remaining rows form
        the test split. Rows are not shuffled.
    return_dv : bool, default False
        When true, the fitted ``DictVectorizer`` is appended to the returned
        tuple so it can be persisted next to the model.

    Returns
    -------
    tuple
        ``(train_df, test_df, y_train, y_test)``, or
        ``(train_df, test_df, y_train, y_test, dv)`` when ``return_dv`` is
        true. The first two items are the vectorizer's sparse output, the
        next two are slices of the target array.

    Raises
    ------
    ValueError
        If the training split contains no rows.
    """
    target = df["duration"].to_numpy()
    features = df.drop(columns=["Duration_hours", "Duration_min", "duration"])
    records = features.to_dict(orient="records")

    train_records = records[:train_size]
    test_records = records[train_size:]
    if not train_records:
        raise ValueError("training split is empty; nothing to fit the vectorizer on")

    # Fit on the training rows only so that the feature vocabulary is not
    # learned from the held-out rows.
    dv = DictVectorizer()
    train_df = dv.fit_transform(train_records)
    if test_records:
        test_df = dv.transform(test_records)
    else:
        # DictVectorizer rejects an empty sample sequence, so build the empty
        # split by slicing; it keeps the training feature width.
        test_df = train_df[:0]

    y_train = target[:train_size]
    y_test = target[train_size:]

    if return_dv:
        return train_df, test_df, y_train, y_test, dv
    return train_df, test_df, y_train, y_test




In [11]:
def train_model(
    X_train: scipy.sparse.csr_matrix,
    X_val: scipy.sparse.csr_matrix,
    y_train: np.ndarray,
    y_val: np.ndarray,
    dv: DictVectorizer,
) -> None:
    """Train an XGBoost regressor with fixed hyperparameters and log it to MLflow.

    Everything happens inside a single MLflow run: the hyperparameters are
    logged, the booster is trained with early stopping against the validation
    set, the validation RMSE is logged as the ``rmse`` metric, the vectorizer
    is pickled to ``models/preprocessor.b`` and logged as an artifact, and the
    booster is logged as an MLflow XGBoost model under ``models_mlflow``.

    Parameters
    ----------
    X_train, X_val
        Feature matrices for training and validation.
    y_train, y_val
        Target values matching ``X_train`` and ``X_val``.
    dv
        The vectorizer to persist alongside the model.

    Returns
    -------
    None
    """
    with mlflow.start_run():
        train = xgb.DMatrix(X_train, label=y_train)
        valid = xgb.DMatrix(X_val, label=y_val)

        best_params = {
            "learning_rate": 0.09585355369315604,
            "max_depth": 30,
            "min_child_weight": 1.060597050922164,
            # "reg:linear" is the deprecated alias of this squared-error
            # objective; the current name avoids the deprecation path.
            "objective": "reg:squarederror",
            "reg_alpha": 0.018060244040060163,
            "reg_lambda": 0.011658731377413597,
            "seed": 42,
        }

        mlflow.log_params(best_params)

        booster = xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=20,
        )

        y_pred = booster.predict(valid)
        # Take the square root explicitly instead of passing `squared=False`,
        # a keyword that recent scikit-learn releases no longer accept.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mlflow.log_metric("rmse", rmse)

        pathlib.Path("models").mkdir(exist_ok=True)
        with open("models/preprocessor.b", "wb") as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")