In [13]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
import os
import pickle
from sklearn.pipeline import make_pipeline
import mlflow
import uuid


In [14]:
output_file = 'prediction.csv'
input_file = "dataset/flight_dataset.csv"

RUN_ID="16aa4ec2992e4def9a579a09802f7d54"


In [21]:
def load_data(input_file):
    df= pd.read_csv(input_file)
    categorical = ["Airline", "Source", "Destination"]
    numerical = ["Total_Stops","Duration_hours","Duration_min"]
    df = df[categorical + numerical]
    ride_list = generate_id(df)
    df['ride_id'] = ride_list
    return df

In [22]:
def preprocess_data_to_dict(df):
    df.Duration_hours = df.Duration_hours *60
    df["duration"] = df["Duration_hours"] + df["Duration_min"]
    target = df["duration"].values
    df = df.drop(["Duration_hours", "Duration_min", "duration"], axis = 1)
    df = df.to_dict(orient = "records")
    return df

In [23]:
def generate_id(df):
    ride_list = []
    for i in range(len(df)):
        ride_id = str(uuid.uuid4())
        ride_list.append(ride_id)
    return ride_list

In [24]:
def load_model(RUN_ID):
    logged_model = f's3://mlop-zoomcamp-adebayo/3/{RUN_ID}/artifacts/model'
    # Load model as a PyFuncModel.
    model = mlflow.pyfunc.load_model(logged_model)
    return model

In [27]:
def apply_model(RUN_ID, output_file, input_file):
    data = load_data(input_file)
    dict_df = preprocess_data_to_dict(data)
    model = load_model(RUN_ID)

    pred = model.predict(dict_df)
    df_result  = pd.DataFrame()

    df_result['ride_id'] =  data['ride_id'] 
    df_result['Airline'] = data['Airline']
    df_result['Source'] = data['Source']
    df_result['Destination'] = data['Destination']
    df_result['Total_Stops'] = data['Total_Stops']
    df_result['duration'] = data['duration']
    df_result['predicted_duration'] = pred
    df_result['Loss'] = df_result['predicted_duration'] - df_result['duration']
    df_result['model_version'] = RUN_ID

    df_result.to_csv(output_file)


In [28]:
apply_model(RUN_ID, output_file, input_file)

Downloading artifacts: 100%|██████████| 5/5 [00:01<00:00,  3.71it/s]


In [39]:
df_result

Unnamed: 0,ride_id,Airline,Source,Destination,Total_Stops,duration,predicted_duration,Loss,model_version
0,fb08e8ee-cdec-4834-80d7-4838005fb696,IndiGo,Banglore,New Delhi,0,170,114.946582,-55.053418,16aa4ec2992e4def9a579a09802f7d54
1,b8f9d39b-be9c-4e8a-8f48-efc43d6ddfe9,Air India,Kolkata,Banglore,2,445,1332.554601,887.554601,16aa4ec2992e4def9a579a09802f7d54
2,94bb1059-ca08-4c63-bddf-947df36a5651,Jet Airways,Delhi,Cochin,2,1140,1292.153894,152.153894,16aa4ec2992e4def9a579a09802f7d54
3,e3b894f9-4e95-4060-a98a-e34028d706c1,IndiGo,Kolkata,Banglore,1,325,623.563443,298.563443,16aa4ec2992e4def9a579a09802f7d54
4,d23de8f4-81fc-4d22-a08d-92c415d0def7,IndiGo,Banglore,New Delhi,1,285,559.176629,274.176629,16aa4ec2992e4def9a579a09802f7d54
...,...,...,...,...,...,...,...,...,...
10678,e1a2c279-a0f2-4520-a2a6-e574c22deb08,Air Asia,Kolkata,Banglore,0,150,189.645022,39.645022,16aa4ec2992e4def9a579a09802f7d54
10679,a06a7431-d5ef-42b1-a76a-2ee3d53d64ee,Air India,Kolkata,Banglore,0,155,444.094507,289.094507,16aa4ec2992e4def9a579a09802f7d54
10680,2c809607-7f98-4d0b-b3a0-be4dd1f15cbb,Jet Airways,Banglore,Delhi,0,180,314.816597,134.816597,16aa4ec2992e4def9a579a09802f7d54
10681,16595207-9c68-4262-860c-4890ee1dd908,Vistara,Banglore,New Delhi,0,160,394.658176,234.658176,16aa4ec2992e4def9a579a09802f7d54


NameError: name 'output_file' is not defined