In [1]:
import os
import sys
import pickle
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder, OneHotEncoder

from xgboost import XGBRFClassifier

In [2]:
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.dirname(CURRENT_DIR)
MODELS_PATH = os.path.join(PROJECT_DIR, 'models')
ENCODER_PATH = os.path.join(MODELS_PATH, 'encoders')
PIPELINE_PATH = os.path.join(MODELS_PATH, 'pipelines')
LOGS_PATH = os.path.join(MODELS_PATH, 'logs')
TRAIN_DATA_PATH = os.path.join(PROJECT_DIR, 
                              'data', 'raw', 'carInsurance_train.csv')

CATEG_PATH = os.path.join(PROJECT_DIR, 'references', 'categorical_columns.txt')
CONTI_PATH = os.path.join(PROJECT_DIR, 'references', 'continous_columns.txt')

PIPELINE_NAME = 'data-pipeline-v1.0'

In [3]:
# adding system path
sys.path.insert(0, PROJECT_DIR)

In [4]:
# %% Helper Function
def get_content(txt_file):
    contents = []
    with open(txt_file) as file:
        for line in file:
            contents.append(line.strip())
            
    return contents

# Function to save a trained model
def save_model(model, model_name, folderPath):
    filename = os.path.join(folderPath, f"{model_name}.pkl")
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

In [5]:
# import internal function
from src.data import process_pipeline

In [6]:
df = pd.read_csv(TRAIN_DATA_PATH)
df = process_pipeline.process_data(df)
df.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,...,Outcome,CallStart,CallEnd,CarInsurance,HasCommuncation,SinLastContactMonth,CosLastContactMonth,CallDuration,CallCategory,Outcome_Simplify
0,1,32,management,single,tertiary,0,1218,1,0,telephone,...,failure,49520,49590,0,1,0.5,0.866025,70.0,Afternoon,0
1,2,32,blue-collar,married,primary,0,1156,1,0,No Communication,...,failure,53343,53528,0,0,0.5,-0.866025,185.0,Afternoon,0
2,3,29,management,single,tertiary,0,637,1,0,cellular,...,failure,59424,59764,1,1,1.224647e-16,-1.0,340.0,Afternoon,0
3,4,25,student,single,primary,0,373,1,0,cellular,...,failure,43603,44422,1,1,0.5,-0.866025,819.0,Afternoon,0
4,5,30,management,married,tertiary,0,2694,0,0,cellular,...,failure,52544,52736,0,1,1.224647e-16,-1.0,192.0,Afternoon,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Id                   4000 non-null   int64   
 1   Age                  4000 non-null   int64   
 2   Job                  4000 non-null   category
 3   Marital              4000 non-null   category
 4   Education            4000 non-null   category
 5   Default              4000 non-null   int64   
 6   Balance              4000 non-null   int64   
 7   HHInsurance          4000 non-null   int64   
 8   CarLoan              4000 non-null   int64   
 9   Communication        4000 non-null   category
 10  LastContactDay       4000 non-null   int64   
 11  LastContactMonth     4000 non-null   int64   
 12  NoOfContacts         4000 non-null   int64   
 13  DaysPassed           4000 non-null   int64   
 14  PrevAttempts         4000 non-null   int64   
 15  Outcome              

In [8]:
X = df.drop(columns=['CarInsurance'])
y = df['CarInsurance']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
# make transformer
ct = make_column_transformer(
    (SimpleImputer(missing_values=np.nan, strategy='mean'), make_column_selector(dtype_include=np.number)),
    (OrdinalEncoder(), make_column_selector(dtype_include='category'))
)

pipe_ct = make_pipeline(
    ct,
    StandardScaler()
)

# --- Data Pipeline Checkpoint --- #
save_model(pipe_ct.fit(X_train), PIPELINE_NAME, PIPELINE_PATH)

# create the data
columns = X.columns.to_list()
X_train = ct.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=columns)

clf = XGBRFClassifier().fit(X_train, y_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):


In [10]:
selector = SelectFromModel(clf, prefit=True)
selector.transform(X_train)



array([[  0.,   1., 716.,   0.,   1.,   1.],
       [  1.,   1., 220.,   0.,   1.,   0.],
       [  1.,   1., 194.,   0.,   1.,   0.],
       ...,
       [  0.,   1.,  40.,   0.,   1.,   0.],
       [  1.,   1.,  77.,   0.,   1.,   0.],
       [  0.,   1., 275.,   0.,   1.,   0.]])

In [11]:
feature_names = np.array(X.columns.to_list())
feature_names[selector.get_support()]

array(['Education', 'DaysPassed', 'CallStart', 'CallEnd', 'CallDuration',
       'CallCategory'], dtype='<U19')