In [30]:
import os
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder

In [31]:
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.dirname(CURRENT_DIR)
MODELS_PATH = os.path.join(PROJECT_DIR, 'models')
ENCODER_PATH = os.path.join(MODELS_PATH, 'encoders')
TRAIN_DATA_PATH = os.path.join(PROJECT_DIR,
                               'data', 'raw', 'carInsurance_train.csv')

CATEG_PATH = os.path.join(PROJECT_DIR, 'references', 'categorical_columns.txt')
CONTI_PATH = os.path.join(PROJECT_DIR, 'references', 'continous_columns.txt')

PROJECT_NAME = '2.1-ie-Linear-SVC-model'
MODEL_NAME = 'LinearSVC-v1.0'

In [32]:
df = pd.read_csv(TRAIN_DATA_PATH)
df.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance
0,1,32,management,single,tertiary,0,1218,1,0,telephone,28,jan,2,-1,0,,13:45:20,13:46:30,0
1,2,32,blue-collar,married,primary,0,1156,1,0,,26,may,5,-1,0,,14:49:03,14:52:08,0
2,3,29,management,single,tertiary,0,637,1,0,cellular,3,jun,1,119,1,failure,16:30:24,16:36:04,1
3,4,25,student,single,primary,0,373,1,0,cellular,11,may,2,-1,0,,12:06:43,12:20:22,1
4,5,30,management,married,tertiary,0,2694,0,0,cellular,3,jun,1,-1,0,,14:35:44,14:38:56,0


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                4000 non-null   int64 
 1   Age               4000 non-null   int64 
 2   Job               3981 non-null   object
 3   Marital           4000 non-null   object
 4   Education         3831 non-null   object
 5   Default           4000 non-null   int64 
 6   Balance           4000 non-null   int64 
 7   HHInsurance       4000 non-null   int64 
 8   CarLoan           4000 non-null   int64 
 9   Communication     3098 non-null   object
 10  LastContactDay    4000 non-null   int64 
 11  LastContactMonth  4000 non-null   object
 12  NoOfContacts      4000 non-null   int64 
 13  DaysPassed        4000 non-null   int64 
 14  PrevAttempts      4000 non-null   int64 
 15  Outcome           958 non-null    object
 16  CallStart         4000 non-null   object
 17  CallEnd       

In [34]:
def feature_contact(data):
    data['CallStart'] = pd.to_datetime(data['CallStart'], format='%H:%M:%S')
    data['CallEnd'] = pd.to_datetime(data['CallEnd'], format='%H:%M:%S')

    # Calculate call duration and create a new column 'CallDuration'
    data['CallDuration'] = data['CallEnd'] - data['CallStart']
    data['CallDuration'] = data['CallDuration'].dt.total_seconds()
    return data

def feature_education(data):
    data['Education'] = data['Education'].fillna('No Education')
    return data

In [42]:
# Numeric feature engineering
numeric_engineer = make_pipeline(
    # contact
    FunctionTransformer(feature_contact, validate=False)
)

# Categorical feature engineering
categorical_engineer = make_pipeline(
    # change to category dataframe
    FunctionTransformer(feature_education, validate=False)
)

# combine pipeline
featureEngineer = ColumnTransformer(
    transformers=[
        ("num", numeric_engineer, ['CallStart', 'CallEnd']),
        ("cat", categorical_engineer, ['Education'])
    ]
)

In [44]:
dft = featureEngineer.fit_transform(df)

In [46]:
pd.DataFrame(dft)

Unnamed: 0,0,1,2,3
0,1900-01-01 13:45:20,1900-01-01 13:46:30,70.0,tertiary
1,1900-01-01 14:49:03,1900-01-01 14:52:08,185.0,primary
2,1900-01-01 16:30:24,1900-01-01 16:36:04,340.0,tertiary
3,1900-01-01 12:06:43,1900-01-01 12:20:22,819.0,primary
4,1900-01-01 14:35:44,1900-01-01 14:38:56,192.0,tertiary
...,...,...,...,...
3995,1900-01-01 17:46:28,1900-01-01 17:50:57,269.0,tertiary
3996,1900-01-01 14:49:16,1900-01-01 14:51:21,125.0,secondary
3997,1900-01-01 12:19:03,1900-01-01 12:23:53,290.0,secondary
3998,1900-01-01 11:27:35,1900-01-01 11:29:14,99.0,tertiary


In [36]:
# %% Helper Function
def get_content(txt_file):
    contents = []
    with open(txt_file) as file:
        for line in file:
            contents.append(line.strip())
            
    return contents

# Get list of categorical & continous variable
categ = get_content(CATEG_PATH)
conti = get_content(CONTI_PATH)

In [37]:
def to_categorical(df):
    if isinstance(df, pd.DataFrame):
        for col in df:
            df[col] = df[col].astype("category")
    else:
        df = df.astype("category")

    return df

def to_numeric(df):
    if isinstance(df, pd.DataFrame):
        for col in df:
            df[col] = df[col].astype("int")
    else:
        df = df.astype("int")

    return df

In [38]:
# Numeric feature processing
numeric_transformer = make_pipeline(
    FunctionTransformer(to_numeric, validate=False)
)

# Categorical feature processing
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_transformer = make_pipeline(
    # change to category dataframe
    FunctionTransformer(to_categorical, validate=False)
)

# combine pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, conti),
        ("cat", categorical_transformer, categ)
    ]
)

In [39]:
pipelineData = make_pipeline(
    featureEngineer, preprocessor
)

In [40]:
X_trans = pipelineData.fit_transform(df)

ValueError: A given column is not a column of the dataframe