In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
import pickle
import mlflow
import xgboost as xgb
import pathlib
#from prefect import flow, task

In [3]:
def load_data(path="dataset/flight_dataset.csv"):
    """Load the flight dataset and derive the total flight duration in minutes.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``"dataset/flight_dataset.csv"``,
        the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The columns ``Airline``, ``Source``, ``Destination``, ``Total_Stops``,
        ``Duration_hours`` and ``Duration_min`` followed by a new ``duration``
        column equal to ``hours * 60 + minutes``.

        NOTE: as in the previous implementation, the returned
        ``Duration_hours`` column has already been multiplied by 60, i.e. it
        holds minutes despite its name.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the file.
    """
    categorical = ["Airline", "Source", "Destination"]
    numerical = ["Total_Stops", "Duration_hours", "Duration_min"]

    flights = pd.read_csv(path)[categorical + numerical]

    # ``assign`` returns a new frame instead of writing into the column
    # selection above, which avoids pandas' SettingWithCopyWarning.  Later
    # keyword arguments see the columns produced by earlier ones, so
    # ``duration`` is computed from the already-converted hours.
    return flights.assign(
        Duration_hours=lambda frame: frame["Duration_hours"] * 60,
        duration=lambda frame: frame["Duration_hours"] + frame["Duration_min"],
    )

In [6]:
def data_transformation(df, n_train=8000):
    """Split the frame into train/test parts and vectorise the features.

    The split is positional: the first ``n_train`` rows become the training
    set and the remaining rows the test set.  No shuffling is performed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``duration`` target column, the helper columns
        ``Duration_hours`` and ``Duration_min`` (both dropped here), and the
        feature columns.
    n_train : int, optional
        Number of leading rows used for training.  Defaults to 8000, the
        value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(train_df, test_df, y_train, y_test, dv)`` where ``train_df`` and
        ``test_df`` are the outputs of the ``DictVectorizer``, ``y_train`` and
        ``y_test`` are the matching slices of the ``duration`` values, and
        ``dv`` is the fitted ``DictVectorizer``.

    Raises
    ------
    ValueError
        If ``n_train`` would leave the training or the test part empty.
    """
    n_rows = len(df)
    if not 0 < n_train < n_rows:
        raise ValueError(
            f"n_train must be between 1 and {n_rows - 1} for a frame with "
            f"{n_rows} rows, got {n_train}"
        )

    target = df["duration"].values
    features = df.drop(["Duration_hours", "Duration_min", "duration"], axis=1)
    records = features.to_dict(orient="records")

    # Fit the vectoriser on the training rows only so that nothing from the
    # held-out rows influences the learned feature space; the test rows are
    # merely transformed with the already-fitted vectoriser.
    dv = DictVectorizer()
    train_df = dv.fit_transform(records[:n_train])
    test_df = dv.transform(records[n_train:])

    y_train = target[:n_train]
    y_test = target[n_train:]
    return train_df, test_df, y_train, y_test, dv




In [11]:
def train_model(
    X_train,
    X_val,
    y_train,
    y_val,
    dv
) -> None:
    """Train an XGBoost regressor with fixed hyperparameters and log it to MLflow.

    Everything happens inside a single MLflow run: the hyperparameters, the
    validation RMSE, the pickled preprocessor and the trained booster are all
    logged to that run.

    Parameters
    ----------
    X_train, X_val
        Feature matrices accepted by ``xgb.DMatrix``.
    y_train, y_val
        Target values matching ``X_train`` / ``X_val``.
    dv
        The fitted preprocessor; it is pickled to ``models/preprocessor.b``
        and logged as an artifact.

    Returns
    -------
    None
    """

    with mlflow.start_run():
        train = xgb.DMatrix(X_train, label=y_train)
        valid = xgb.DMatrix(X_val, label=y_val)

        best_params = {
            "learning_rate": 0.16968989909872087,
            "max_depth": 25,
            "min_child_weight": 5.591678840975327,
            # "reg:linear" is the deprecated alias of this objective.
            "objective": "reg:squarederror",
            "reg_alpha": 0.11973660565878817,
            "reg_lambda": 0.020803099001553724,
            "seed": 42,
        }

        mlflow.log_params(best_params)

        booster = xgb.train(
            params=best_params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=5,
        )

        # ``xgb.train`` hands back the booster from the last round, which is
        # past the best one when early stopping kicked in.  Score with the
        # best iteration so the logged metric is the best validation RMSE.
        y_pred = booster.predict(
            valid, iteration_range=(0, booster.best_iteration + 1)
        )
        # Take the square root explicitly: the ``squared=False`` keyword of
        # ``mean_squared_error`` is deprecated in recent scikit-learn releases.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mlflow.log_metric("rmse", rmse)

        pathlib.Path("models").mkdir(exist_ok=True)
        with open("models/preprocessor.b", "wb") as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")
    return None

In [13]:
def main_flow():
    """Run the pipeline end to end: configure MLflow, load, transform, train."""
    # MLflow bookkeeping: local SQLite tracking store and experiment name.
    mlflow.set_tracking_uri("sqlite:///mlflow.db")
    mlflow.set_experiment("Flight prediction time")

    flights = load_data()

    # data_transformation returns (X_train, X_test, y_train, y_test, dv),
    # which is exactly the positional order train_model expects.
    prepared = data_transformation(flights)
    train_model(*prepared)


# Entry point: run the full pipeline when this code is executed as the main
# program (the captured training log below shows the cell triggering a run).
if __name__ == "__main__":
    main_flow()

[0]	validation-rmse:462.59652
[1]	validation-rmse:419.41864
[2]	validation-rmse:386.83512
[3]	validation-rmse:362.64709
[4]	validation-rmse:344.96112
[5]	validation-rmse:332.21034
[6]	validation-rmse:323.10859
[7]	validation-rmse:316.67384
[8]	validation-rmse:312.14459
[9]	validation-rmse:308.97998
[10]	validation-rmse:306.76888
[11]	validation-rmse:305.22975
[12]	validation-rmse:304.16134
[13]	validation-rmse:303.41721
[14]	validation-rmse:302.92305
[15]	validation-rmse:302.58464
[16]	validation-rmse:302.35317
[17]	validation-rmse:302.16594
[18]	validation-rmse:302.03975
[19]	validation-rmse:301.94045
[20]	validation-rmse:301.86893
[21]	validation-rmse:301.81737
[22]	validation-rmse:301.77959
[23]	validation-rmse:301.77473
[24]	validation-rmse:301.72869
[25]	validation-rmse:301.69611
[26]	validation-rmse:301.67513
[27]	validation-rmse:301.68706
[28]	validation-rmse:301.70255
[29]	validation-rmse:301.71751
[30]	validation-rmse:301.72523
[31]	validation-rmse:301.73002




In [None]:
# Close whatever MLflow run is still active in this kernel.
# NOTE(review): train_model already wraps its run in a `with` block, so this
# is presumably a manual cleanup after an interrupted execution — confirm.
mlflow.end_run() 