In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import ml_utils as mt

In [None]:
# Load the loan training data; the path is relative to the notebook's working directory.
ld_train=pd.read_csv("./loan_data_train.csv")

In [None]:
#Explore individual columns in the data to figure out rationale behind the decisions taken here 
# [except the ones present in ignore group]

# Ignore : 'Interest.Rate', 'Amount.Funded.By.Investors','ID'

# Amount.Requested : convert to numeric , impute missing values 
# Open.CREDIT.Lines: convert to numeric , impute missing values 
# Revolving.CREDIT.Balance : convert to numeric , impute missing values 


# Loan.Length : impute missing , create dummies 
# Loan.Purpose : impute missing , create dummies 
# Home.Ownership : impute missing , create dummies 
# State : impute missing , create dummies 

# Debt.To.Income.Ratio : custom func , impute missing 
# FICO.Range : custom func , impute missing 
# Employment.Length : custom func , impute missing 

# Monthly.Income : impute missing 
# Inquiries.in.the.Last.6.Months : impute missing 

## Custom functions for columns that contain numeric information but require string processing to extract that information

In [None]:
# Inspect the first raw values to see what string clean-up the column needs
# before it can be treated as numeric.
ld_train['Debt.To.Income.Ratio'].head()

In [None]:
def dtr(orig_col):
    """Turn percentage strings (e.g. '14.9%') into numbers.

    Parameters
    ----------
    orig_col : pandas.Series
        String column whose values may carry a '%' sign.

    Returns
    -------
    pandas.Series
        Numeric column; values that cannot be parsed (including missing
        values) come back as NaN.
    """
    without_percent_sign = orig_col.str.replace('%', '')
    return pd.to_numeric(without_percent_sign, errors='coerce')
    

In [None]:
# Frequency of every raw FICO.Range value; dropna=False also counts missing values.
ld_train['FICO.Range'].value_counts(dropna=False)

In [None]:
def fico(orig_col):
    """Replace 'low-high' range strings (e.g. '735-739') with their midpoint.

    Parameters
    ----------
    orig_col : pandas.Series
        String column holding ranges written as '<low>-<high>'.

    Returns
    -------
    pandas.Series
        Midpoint ``0.5 * (low + high)`` of each range. A value is NaN when
        either bound is missing or not numeric (missing values, strings
        without a '-').
    """
    # When no value in the column contains '-', expand=True yields only
    # column 0 and a plain k[1] lookup raises KeyError. Reindexing pins the
    # frame to exactly the two bound columns, adding an all-NaN upper bound
    # in that case.
    bounds = orig_col.str.split('-', expand=True).reindex(columns=[0, 1])

    lower = pd.to_numeric(bounds[0], errors='coerce')
    upper = pd.to_numeric(bounds[1], errors='coerce')

    return 0.5 * (lower + upper)
    

In [None]:
# Frequency of every raw Employment.Length value; dropna=False also counts missing values.
ld_train['Employment.Length'].value_counts(dropna=False)

In [None]:
def el(orig_col):
    """Convert employment-length labels into a number of years.

    '10+ years' becomes 10 and '< 1 year' becomes 0; for every other label
    the words 'years' / 'year' are dropped and what remains is parsed as a
    number.

    Parameters
    ----------
    orig_col : pandas.Series
        String column of employment-length labels.

    Returns
    -------
    pandas.Series
        Numeric column; labels that cannot be parsed (including missing
        values) come back as NaN.
    """
    # Order matters: the two special labels are rewritten first, and 'years'
    # is removed before 'year' so no stray 's' is left behind.
    years_as_text = (
        orig_col.str.replace('10+ years', '10', regex=False)
        .str.replace('< 1 year', '0', regex=False)
        .str.replace('years', '')
        .str.replace('year', '')
    )
    return pd.to_numeric(years_as_text, errors='coerce')


All you need to do to use the `DataPipe` class in `ml_utils` is to group these column names into lists and pass them to the appropriate arguments of the class. The rest is handled internally. Do go through the source code of `ml_utils` if you want to make changes, experiment with writing your own versions of these steps, or add something more.

Note that equivalent classes do exist in `sklearn` as well; `ml_utils` simply exists as an exercise in writing custom classes and to bring in customizability wherever it's not available in the `sklearn` implementations.

In [None]:
# Categorical columns for which dummy variables need to be created.
cat_to_dummies_cols = ['Loan.Length', 'Loan.Purpose', 'State', 'Home.Ownership']

# Columns that contain numeric info but only require simple typecasting.
cat_to_num_cols = ['Amount.Requested', 'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance']

# Columns that are already numeric and contain numeric information.
simple_num_cols = ['Monthly.Income', 'Inquiries.in.the.Last.6.Months']

# Columns that require custom processing to extract numeric information,
# each mapped to the function that does the extraction.
custom_func_dict_cols = {
    'Debt.To.Income.Ratio': dtr,
    'FICO.Range': fico,
    'Employment.Length': el,
}

In [None]:
# Assemble the preprocessing pipeline from the column groups defined for the loan data.
ld_pipe = mt.DataPipe(
    cat_to_dummies=cat_to_dummies_cols,
    cat_to_num=cat_to_num_cols,
    simple_num=simple_num_cols,
    custom_func_dict=custom_func_dict_cols,
)

In [None]:
# Fit the pipeline on the training data.
ld_pipe.fit(ld_train)

In [None]:
# Apply the fitted pipeline to the training data.
x_train=ld_pipe.transform(ld_train)

In [None]:
# Read the test data from the notebook's working directory, the same way
# loan_data_train.csv is loaded, so the notebook is not tied to one machine's
# absolute path.
ld_test=pd.read_csv("./loan_data_test.csv")

In [None]:
# Transform the test data with the pipeline that was fitted on ld_train;
# the pipeline is not refitted on the test data.
x_test=ld_pipe.transform(ld_test)

In [None]:
# Dimensions of the transformed test data; compare with x_train.shape.
x_test.shape

In [None]:
# Dimensions of the transformed training data; compare with x_test.shape.
x_train.shape

In [None]:
# Preview only the first rows instead of rendering the whole transformed frame.
x_test.head()

Exercise: create a data processing pipeline using `ml_utils` for the dataset `insurance.csv`.

# Example 2

This file is too big to upload to GitHub; you can download it from here: [download the consumer complaints CSV](https://www.dropbox.com/scl/fi/tj1ynemyyld6g8ojos5jj/Consumer_Complaints_train.csv?rlkey=q717ubsel1worxuqrf07zfe6b&dl=1)

In [None]:
# Load the consumer complaints training data; the path is relative to the
# notebook's working directory, so the downloaded file must be placed there.
cd_train=pd.read_csv(r'./Consumer_Complaints_train.csv')

In [None]:
# Preview the first rows of the raw complaints data.
cd_train.head()

In [None]:
# List the column names; these are the names used in the column groups below.
cd_train.columns

In [None]:
# Number of distinct values in each column.
cd_train.nunique()

In [None]:
# Date columns handed to DataPipe's date_diffs and date_components arguments.
date_diff_cols = ['Date received', 'Date sent to company']
date_component_cols = ['Date received', 'Date sent to company']

# Free-text column handed to DataPipe's text_feat argument.
text_feat_cols = ['Consumer complaint narrative']

# Categorical columns handed to DataPipe's cat_to_dummies argument.
cat_to_dummy_cols = [
    'Product',
    'Sub-product',
    'Issue',
    'Sub-issue',
    'Company public response',
    'Company',
    'State',
    'ZIP code',
    'Tags',
    'Consumer consent provided?',
    'Submitted via',
    'Company response to consumer',
    'Timely response?',
]

In [None]:
# Assemble the preprocessing pipeline for the complaints data from its column groups.
cd_data_pipe = mt.DataPipe(
    cat_to_dummies=cat_to_dummy_cols,
    date_components=date_component_cols,
    text_feat=text_feat_cols,
    date_diffs=date_diff_cols,
)

In [None]:
# It's a sizable dataset and we are also creating tons of features, so this might take a while
# [both fit and transform; it took my machine 10 mins for each].
# If your system gives an out-of-memory error, try running the notebook on Google Colab. You will have to
# upload your data file to Colab as well; it won't be able to access the data file on your local machine.

# Do go through the ml_utils text feature class; it has some defaults you might want to change.
cd_data_pipe.fit(cd_train)

In [None]:
# Apply the fitted pipeline to the complaints training data.
# NOTE: this rebinds x_train, replacing the transformed loan data from Example 1.
x_train=cd_data_pipe.transform(cd_train)

In [None]:
# Dimensions of the transformed complaints data.
x_train.shape

# Saving Python objects to disk [especially worth doing for the ones that take a long time to create]
https://www.datacamp.com/tutorial/pickle-python-tutorial

# Example 3

In [None]:
# Load the Example 3 train and test files; both paths are relative to the
# notebook's working directory.
bd_train=pd.read_csv(r'./bd_train.csv')
bd_test=pd.read_csv(r'./bd_test.csv')

In [None]:
# Preview the first rows of the raw training data.
bd_train.head()

In [None]:
# Ignore : REF_NO, post_code, post_area. Target : Revenue.Grid
# post_code and post_area have been ignored because none of their categories had a sufficient
# number of observations

# children : custom function
# age_band : custom function 
# family_income : custom function

# create dummies : 'status' , 'occupation' , 'occupation_partner' , 'home_status', 'self_employed',
# 'self_employed_partner','TVarea','gender','region'

# simple_numeric : 'year_last_moved','Average.Credit.Card.Transaction', 'Balance.Transfer',
#       'Term.Deposit', 'Life.Insurance', 'Medical.Insurance',
#       'Average.A.C.Balance', 'Personal.Loan', 'Investment.in.Mutual.Fund',
#       'Investment.Tax.Saving.Bond', 'Home.Loan', 'Online.Purchase.Amount','Investment.in.Commudity',
#       'Investment.in.Equity', 'Investment.in.Derivative',
#       'Portfolio.Balance'



In [None]:
def children_to_num(col):
    """Convert the children-count labels into numbers.

    'Zero' is read as 0 and '4+' as 4; everything else is parsed as written.

    Parameters
    ----------
    col : pandas.Series
        String column of children-count labels.

    Returns
    -------
    pandas.Series
        Numeric column; labels that cannot be parsed (including missing
        values) come back as NaN.
    """
    digits_only = col.str.replace('Zero', '0').str.replace('4+', '4', regex=False)
    return pd.to_numeric(digits_only, errors='coerce')

In [None]:
def ab_to_num(col):
    """Replace age-band labels ('low-high' or '71+') with a representative age.

    Parameters
    ----------
    col : pandas.Series
        String column of age bands written as '<low>-<high>', with the
        open-ended top band written as '71+'.

    Returns
    -------
    pandas.Series
        Midpoint ``0.5 * (low + high)`` of each band; '71+' is treated as
        '71-71' and therefore maps to 71. A value is NaN when either bound
        is missing or not numeric (missing values, labels without a '-').
    """
    # Give the open-ended top band two equal bounds so it parses like the rest.
    bands = col.str.replace('71+', '71-71', regex=False)

    # When no label contains '-', expand=True yields only column 0 and a plain
    # k[1] lookup raises KeyError. Reindexing pins the frame to exactly the
    # two bound columns, adding an all-NaN upper bound in that case.
    bounds = bands.str.split('-', expand=True).reindex(columns=[0, 1])

    lower = pd.to_numeric(bounds[0], errors='coerce')
    upper = pd.to_numeric(bounds[1], errors='coerce')

    return 0.5 * (lower + upper)

In [None]:
def fi_to_num(col):
    """Replace family-income band labels with a single representative amount.

    Parameters
    ----------
    col : pandas.Series
        Column of income-band labels.

    Returns
    -------
    pandas.Series
        Numeric column; labels that are not in the lookup table and cannot
        be parsed as numbers come back as NaN.
    """
    # Closed bands map to their midpoint; '>=35,000' maps to its lower bound.
    # NOTE(review): '< 4,000' maps to its upper bound (4000), not a midpoint —
    # confirm this is intended.
    band_to_amount = {
        '< 4,000': 4000,
        '< 8,000, >= 4,000': 6000,
        '<10,000, >= 8,000': 9000,
        '<12,500, >=10,000': 11250,
        '<15,000, >=12,500': 13750,
        '<17,500, >=15,000': 16250,
        '<20,000, >=17,500': 18750,
        '<22,500, >=20,000': 21250,
        '<25,000, >=22,500': 23750,
        '<27,500, >=25,000': 26250,
        '<30,000, >=27,500': 28750,
        '>=35,000': 35000,
    }
    amounts = col.replace(band_to_amount)
    return pd.to_numeric(amounts, errors='coerce')

In [None]:
# Columns handed to DataPipe's simple_num argument.
simple_numeric_cols = [
    'year_last_moved',
    'Average.Credit.Card.Transaction',
    'Balance.Transfer',
    'Term.Deposit',
    'Life.Insurance',
    'Medical.Insurance',
    'Average.A.C.Balance',
    'Personal.Loan',
    'Investment.in.Mutual.Fund',
    'Investment.Tax.Saving.Bond',
    'Home.Loan',
    'Online.Purchase.Amount',
    'Investment.in.Commudity',
    'Investment.in.Equity',
    'Investment.in.Derivative',
    'Portfolio.Balance',
]

# Categorical columns for which dummy variables need to be created.
# NOTE: this rebinds cat_to_dummies_cols, which Example 1 also defines.
cat_to_dummies_cols = [
    'status',
    'occupation',
    'occupation_partner',
    'home_status',
    'self_employed',
    'self_employed_partner',
    'TVarea',
    'gender',
    'region',
]

# Columns that need a custom function, each mapped to that function.
custom_function_cols = {
    'children': children_to_num,
    'age_band': ab_to_num,
    'family_income': fi_to_num,
}

In [None]:
# Assemble the preprocessing pipeline for Example 3 from its column groups.
data_pipe = mt.DataPipe(
    simple_num=simple_numeric_cols,
    cat_to_dummies=cat_to_dummies_cols,
    custom_func_dict=custom_function_cols,
)

In [None]:
# Fit the pipeline on the training data.
data_pipe.fit(bd_train)

In [None]:
# Transform both sets with the pipeline that was fitted on bd_train only.
# NOTE: this rebinds x_train and x_test from the earlier examples.
x_train=data_pipe.transform(bd_train)
x_test=data_pipe.transform(bd_test)