In [69]:
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.bash import BashOperator

from datetime import datetime, timedelta
import sys
import importlib.util
from utils import *
from constants import *

In [70]:
def module_from_file(module_name, file_path):
    """Load a Python source file as a module object, given its path.

    The file is executed once and the resulting module is returned. The
    module is NOT inserted into ``sys.modules``; the caller keeps the only
    reference.

    Args:
        module_name: Name to give the module (becomes ``module.__name__``).
        file_path: Path of the source file to execute.

    Returns:
        The freshly executed module object.

    Raises:
        ImportError: If no import spec/loader can be built for ``file_path``
            (for example, the file has an unrecognised extension).
        OSError: Propagated from the loader if the file cannot be read.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # spec_from_file_location returns None when it cannot pick a loader;
    # fail with a clear message instead of an AttributeError on None.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Cannot load module {module_name!r} from {file_path!r}"
        )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

In [71]:
# Load the inference pipeline's helper modules directly from their file paths.
utils = module_from_file("utils", "/home/Assignment/03_inference_pipeline/scripts/utils.py")
# Each module is loaded under its own name so that `constants.__name__`
# is "constants" rather than a duplicate of "utils".
constants = module_from_file("constants", "/home/Assignment/03_inference_pipeline/scripts/constants.py")

In [72]:
# Short lowercase aliases for the settings read from the `constants` module.

# SQLite database location (directory prefix + file name).
db_path = constants.DB_PATH
db_file_name = constants.DB_FILE_NAME

# MLflow-related settings (database file and tracking URI).
db_file_mlflow = constants.DB_FILE_MLFLOW
tracking_uri = constants.TRACKING_URI

# Location used for script output files.
scripts_output = constants.SCRIPTS_OUTPUT

# Which registered model to use, and at which stage.
model_name = constants.MODEL_NAME
model_stage = constants.STAGE

# Feature lists passed to the encoding and validation steps.
one_hot_encoded_features = constants.ONE_HOT_ENCODED_FEATURES
features_to_encode = constants.FEATURES_TO_ENCODE

In [54]:
# Display the database directory prefix taken from constants.DB_PATH.
db_path


'/home/airflow/dags/Lead_scoring_data_pipeline/'

In [55]:
# Display the database file name taken from constants.DB_FILE_NAME.
db_file_name


'lead_scoring_data_cleaning.db'

In [56]:
# Display the MLflow database file path taken from constants.DB_FILE_MLFLOW.
db_file_mlflow


'/home/Assignment/02_training_pipeline/scripts/Lead_scoring_mlflow_production.db'

In [57]:
# Display the model name, model stage and tracking URI together as a tuple.
model_name,model_stage,tracking_uri

('LightGBM', 'production', 'http://0.0.0.0:6006')

In [58]:
# Display the feature list taken from constants.ONE_HOT_ENCODED_FEATURES.
one_hot_encoded_features

['total_leads_droppped',
 'city_tier',
 'first_platform_c_Level8',
 'first_platform_c_others',
 'first_platform_c_Level2',
 'first_utm_medium_c_others',
 'first_utm_medium_c_Level13',
 'first_utm_medium_c_Level0',
 'first_platform_c_Level7',
 'first_platform_c_Level0']

In [59]:
# Display the column list taken from constants.FEATURES_TO_ENCODE.
features_to_encode

['first_platform_c', 'first_utm_medium_c', 'first_utm_source_c']

In [60]:
def load_data(file_path_list):
    """Read each CSV path into a DataFrame and return them as a list.

    The first column of every file is used as the DataFrame index. The
    returned list has one entry per input path, in the same order.
    """
    return [pd.read_csv(csv_path, index_col=0) for csv_path in file_path_list]

def __read_input_data(db_path, db_file_name, table_name):
    """Read a whole SQLite table into a DataFrame.

    Args:
        db_path: Directory prefix of the database; concatenated directly with
            ``db_file_name``, so it must already end with a path separator.
        db_file_name: File name of the SQLite database.
        table_name: Table to read. It is spliced into the SQL text as-is
            (table names cannot be bound as parameters), so it must come from
            trusted code, never from user input.

    Returns:
        DataFrame with every row of the table, minus the ``level_0`` and
        ``index`` columns when they are present.
    """
    cnx = sqlite3.connect(db_path + db_file_name)
    try:
        df = pd.read_sql('select * from ' + table_name, cnx)
    finally:
        # Always release the connection, even when the query fails.
        cnx.close()
    # Drop the bookkeeping columns if they exist; missing ones are ignored.
    df = df.drop(columns=['level_0', 'index'], errors='ignore')
    # NOTE: this message is a fixed string; it does not reflect db_file_name.
    print("Data has been extracted successfully from lead_scoring_model_experimentation.")
    return df

def __save_data_to_db(db_path, db_file_name, input_data, table):
    """Write a DataFrame to a SQLite table, replacing any existing table.

    Args:
        db_path: Directory prefix of the database; concatenated directly with
            ``db_file_name``, so it must already end with a path separator.
        db_file_name: File name of the SQLite database.
        input_data: Object exposing ``to_sql`` (a pandas DataFrame in this
            notebook) whose contents are written.
        table: Name of the destination table; an existing table with this
            name is replaced.
    """
    cnx = sqlite3.connect(db_path + db_file_name)
    try:
        input_data.to_sql(name=table, con=cnx, if_exists='replace')
        print('input_data has been saved successfully to table ' + table)
    finally:
        # Always release the connection, even when the write fails.
        cnx.close()

In [61]:
# Run the feature-encoding step with the configured database and feature lists.
# NOTE(review): encode_features is not defined in this notebook (presumably it
# comes from the `from utils import *` at the top); the recorded output shows it
# reading from the database and saving a `features` table.
encode_features(db_path, db_file_name, one_hot_encoded_features, features_to_encode)

Data has been extracted successfully from lead_scoring_model_experimentation.
input_data has been saved successfully to table features


In [62]:
# Read the `features` table back and record its column names as a plain list.
input_data = __read_input_data(db_path, db_file_name, 'features')
source_cols = list(input_data.columns)

Data has been extracted successfully from lead_scoring_model_experimentation.


In [63]:
# Display the column names read from the `features` table.
source_cols

['first_platform_c',
 'first_utm_medium_c',
 'first_utm_source_c',
 'total_leads_droppped',
 'city_tier',
 'first_platform_c_Level8',
 'first_platform_c_others',
 'first_platform_c_Level2',
 'first_utm_medium_c_others',
 'first_utm_medium_c_Level13',
 'first_utm_medium_c_Level0',
 'first_platform_c_Level7',
 'first_platform_c_Level0']

In [65]:
# Run the prediction step for the configured model name, stage and tracking URI.
# NOTE(review): get_models_prediction is not defined in this notebook (presumably
# it comes from the `from utils import *` at the top); the recorded output shows
# it saving results to a `predicted_output` table.
get_models_prediction(db_path, db_file_name, model_name, model_stage, tracking_uri)

Data has been extracted successfully from lead_scoring_model_experimentation.
[1 0 0 ... 1 1 1]
input_data has been saved successfully to table predicted_output


'Predictions are done and save in Final_Predictions Table'

In [66]:
# Build and show a "models:/<name>/<stage>" URI for the model used above
# (presumably the MLflow model-registry URI format; confirm against utils).
model_name = 'LightGBM'
model_stage = 'production'
model_uri = f"models:/{model_name}/{model_stage}"
print(model_uri)

models:/LightGBM/production


In [73]:
# Run the prediction-ratio check.
# NOTE(review): prediction_ratio_check is not defined in this notebook (presumably
# it comes from the `from utils import *` at the top); the recorded output shows it
# generating a prediction_distribution_<timestamp>.txt file.
prediction_ratio_check(db_path, db_file_name, scripts_output)

Data has been extracted successfully from lead_scoring_model_experimentation.
Output file has been generated successfully /home/Assignment/03_inference_pipeline/scripts/prediction_distribution_20241106115025.txt


In [74]:
# Run the input-feature check against the one-hot encoded feature list.
# NOTE(review): input_features_check is not defined in this notebook (presumably
# it comes from the `from utils import *` at the top); confirm exactly which
# columns it compares, since the recorded output reports missing model inputs.
input_features_check(db_path, db_file_name, one_hot_encoded_features)

Data has been extracted successfully from lead_scoring_model_experimentation.
Some of the models inputs are missing


In [75]:
# Inspect the raw `features` table, including any index columns stored with it.
cnx = sqlite3.connect(DB_PATH + DB_FILE_NAME)
try:
    df = pd.read_sql("SELECT * FROM features", cnx)
finally:
    # Release the connection so the notebook does not keep the database open.
    cnx.close()
print(df.shape)
df.head(10)

(238139, 14)


Unnamed: 0,index,first_platform_c,first_utm_medium_c,first_utm_source_c,total_leads_droppped,city_tier,first_platform_c_Level8,first_platform_c_others,first_platform_c_Level2,first_utm_medium_c_others,first_utm_medium_c_Level13,first_utm_medium_c_Level0,first_platform_c_Level7,first_platform_c_Level0
0,0,0,0,0,1.0,1.0,0,0,0,0,0,0,0,1
1,1,0,0,0,1.0,2.0,0,0,0,0,0,1,0,0
2,2,0,0,0,1.0,1.0,0,0,0,0,0,1,0,0
3,3,0,0,0,2.0,1.0,0,0,0,0,0,0,0,0
4,4,0,0,0,1.0,1.0,0,0,0,0,0,1,0,0
5,5,0,0,0,2.0,1.0,0,0,0,0,0,0,0,1
6,6,0,0,0,3.0,1.0,0,0,0,0,0,1,0,0
7,7,0,0,0,2.0,1.0,0,0,0,0,0,1,0,0
8,8,0,0,0,1.0,1.0,0,0,0,0,0,0,0,1
9,9,0,0,0,1.0,2.0,0,0,0,1,0,0,0,0
