In [21]:
import os
import sys
import pandas as pd
import numpy as np

import tensorflow as tf

In [22]:
# The project root is taken to be the parent of the current working directory.
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.dirname(CURRENT_DIR)


def _in_project(*parts):
    """Return the path obtained by joining `parts` underneath PROJECT_DIR."""
    return os.path.join(PROJECT_DIR, *parts)


# Model artifacts: encoders and logs both live inside the models directory.
MODELS_PATH = _in_project('models')
ENCODER_PATH = _in_project('models', 'encoders')
LOGS_PATH = _in_project('models', 'logs')

# NOTE(review): the constant is named TEST_* but points at the *train* CSV --
# confirm this is intended.
TEST_DATA_PATH = _in_project('data', 'raw', 'carInsurance_train.csv')

# Text files under references/ (presumably one column name per line -- verify).
# NOTE(review): 'continous' [sic] is the file name exactly as referenced here.
CATEG_PATH = _in_project('references', 'categorical_columns.txt')
CONTI_PATH = _in_project('references', 'continous_columns.txt')

# Labels for this notebook and its model (not used in the cells visible here).
PROJECT_NAME = '2.1-ie-Linear-SVC-model'
MODEL_NAME = 'LinearSVC-v1.0'

In [23]:
# Put the project root at the front of the module search path so the
# project's `src` package can be imported by the cells below.
# The guard keeps this cell idempotent: re-running it no longer stacks a
# duplicate PROJECT_DIR entry at the front of sys.path each time.
if sys.path[:1] != [PROJECT_DIR]:
    sys.path.insert(0, PROJECT_DIR)

In [24]:
# %% Helper Function
def get_content(txt_file):
    """Read a text file and return its lines stripped of surrounding whitespace.

    Parameters
    ----------
    txt_file
        Path of the file to read; passed directly to ``open``.

    Returns
    -------
    list of str
        One entry per line, in file order. Blank lines are kept and appear
        as empty strings.
    """
    with open(txt_file) as handle:
        return [raw_line.strip() for raw_line in handle]

In [25]:
# import internal function
from src.data import process_pipeline, encoder_pipeline, feature_selection_pipeline

In [26]:
# Read the CSV at TEST_DATA_PATH and hand it straight to the project's
# processing step; only the processed frame is kept.
df = process_pipeline.process_data(pd.read_csv(TEST_DATA_PATH))

# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance,CallDuration
0,1,32,management,single,tertiary,0,1218,1,0,1,28,jan,2,-1,0,0,1900-01-01 13:45:20,1900-01-01 13:46:30,0,70.0
1,2,32,blue-collar,married,primary,0,1156,1,0,0,26,may,5,-1,0,0,1900-01-01 14:49:03,1900-01-01 14:52:08,0,185.0
2,3,29,management,single,tertiary,0,637,1,0,1,3,jun,1,119,1,0,1900-01-01 16:30:24,1900-01-01 16:36:04,1,340.0
3,4,25,student,single,primary,0,373,1,0,1,11,may,2,-1,0,0,1900-01-01 12:06:43,1900-01-01 12:20:22,1,819.0
4,5,30,management,married,tertiary,0,2694,0,0,1,3,jun,1,-1,0,0,1900-01-01 14:35:44,1900-01-01 14:38:56,0,192.0


In [27]:
# Replace both call-timestamp columns with their pd.to_numeric conversion.
# NOTE(review): the values later show up as large negative int64 numbers,
# presumably nanoseconds relative to the Unix epoch for 1900-01-01 -- verify.
for ts_col in ('CallStart', 'CallEnd'):
    df[ts_col] = pd.to_numeric(df[ts_col])

In [28]:
# Print a per-column summary (non-null counts and dtypes) of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Id                4000 non-null   int64  
 1   Age               4000 non-null   int64  
 2   Job               4000 non-null   object 
 3   Marital           4000 non-null   object 
 4   Education         4000 non-null   object 
 5   Default           4000 non-null   int64  
 6   Balance           4000 non-null   int64  
 7   HHInsurance       4000 non-null   int64  
 8   CarLoan           4000 non-null   int64  
 9   Communication     4000 non-null   int64  
 10  LastContactDay    4000 non-null   int64  
 11  LastContactMonth  4000 non-null   object 
 12  NoOfContacts      4000 non-null   int64  
 13  DaysPassed        4000 non-null   int64  
 14  PrevAttempts      4000 non-null   int64  
 15  Outcome           4000 non-null   int64  
 16  CallStart         4000 non-null   int64  


In [30]:
# Build a dataset with one element per row, keyed by column name.
df_slices = tf.data.Dataset.from_tensor_slices(dict(df))

# Show only the first element, one "name : value" line per column.
for first_row in df_slices.take(1):
    for column_name in first_row:
        print(f'{column_name} : {first_row[column_name]}')

Id : 1
Age : 32
Job : b'management'
Marital : b'single'
Education : b'tertiary'
Default : 0
Balance : 1218
HHInsurance : 1
CarLoan : 0
Communication : 1
LastContactDay : 28
LastContactMonth : b'jan'
NoOfContacts : 2
DaysPassed : -1
PrevAttempts : 0
Outcome : 0
CallStart : -2208939280000000000
CallEnd : -2208939210000000000
CarInsurance : 0
CallDuration : 70.0


In [35]:
# Display the dataset built earlier in this notebook (`df_slices`) rather than
# converting the whole frame into tensors a second time; its repr shows the
# same element_spec.
# NOTE(review): assumes `df` was not modified after `df_slices` was created --
# confirm, since the execution counts skip from 30 to 35.
df_slices

<TensorSliceDataset element_spec={'Id': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Age': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Job': TensorSpec(shape=(), dtype=tf.string, name=None), 'Marital': TensorSpec(shape=(), dtype=tf.string, name=None), 'Education': TensorSpec(shape=(), dtype=tf.string, name=None), 'Default': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Balance': TensorSpec(shape=(), dtype=tf.int64, name=None), 'HHInsurance': TensorSpec(shape=(), dtype=tf.int64, name=None), 'CarLoan': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Communication': TensorSpec(shape=(), dtype=tf.int64, name=None), 'LastContactDay': TensorSpec(shape=(), dtype=tf.int64, name=None), 'LastContactMonth': TensorSpec(shape=(), dtype=tf.string, name=None), 'NoOfContacts': TensorSpec(shape=(), dtype=tf.int64, name=None), 'DaysPassed': TensorSpec(shape=(), dtype=tf.int64, name=None), 'PrevAttempts': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Outcome': TensorSpec(shape=(), dt