In [1]:
import pandas as pd
import numpy as np
import load_data
from sklearn.model_selection import train_test_split
from ipynb.fs.full.Get_Base_Data_00 import Time

# Raise pandas' display limits so frames are shown with up to 999 rows / 999 columns
# instead of being truncated in cell output.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [2]:
# Pre-process individual columns

# Convert date columns to the number of calendar years between them and a base date
def dates_to_days(df, col_list, base_date = '2018-01-01'):
    """Replace each date column with its absolute year distance to ``base_date``.

    Despite the function name, the result is expressed in years, not days:
    only the ``.year`` components are compared, so 2017-12-31 and 2018-01-01
    count as one year apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``col_list``. It is left
        untouched; the work is done on a copy.
    col_list : list of str
        Columns whose values ``pd.to_datetime`` can parse.
    base_date : str or datetime-like, default '2018-01-01'
        Reference date the years are measured against.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every column of ``col_list`` holds
        ``abs(base_date.year - column.year)``. Unparseable-as-missing values
        (NaT) stay missing.
    """
    base_year = pd.to_datetime(base_date).year

    # Work on a copy: no scratch column is written into the caller's frame and
    # an existing column that happens to be called 'base_dt' is not clobbered.
    out = df.copy()

    # dict.fromkeys drops repeated names (keeping order) so a column is never
    # converted twice.
    for col in dict.fromkeys(col_list):
        years = pd.to_datetime(out[col]).dt.year
        out[col] = (base_year - years).abs()

    return out

# # Change Children Status to binary #Maybe just add NA as category
# def change_children_status(df, col_list, replace_list):
#     for col in col_list:
#         df[col].replace(replace_list, 'yes', inplace=True)
#         df[col].fillna('no', inplace=True)
#     return df
# children_list = ['customer_children', 'customer_children_y', 'customer_children_x']
# replace_list = ['mature', 'young', 'onebaby', 'adolescent', 'preschool', 'grownup']


# Categorize area code by 1000s
def bin_area_code(df, col='customer_postal_code_m1', new_col='area_cat'):
    """Add a categorical column that buckets a numeric postal code into bands of 1000.

    The bands are left-closed: [0, 1000), [1000, 2000), ..., [9000, 10000),
    labelled '0_area_code', '1000_area_code', ..., '9000_area_code'.
    Values outside [0, 10000) fall in no band and become missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. The new column is added to this frame in place.
    col : str, default 'customer_postal_code_m1'
        Name of the numeric postal-code column to bin.
    new_col : str, default 'area_cat'
        Name of the categorical column to create (overwritten if present).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``new_col`` added.
    """
    # One list of edges drives both the bins and their labels, so the two
    # cannot drift apart.
    edges = list(range(0, 10001, 1000))
    labels = ["{}_area_code".format(lower) for lower in edges[:-1]]

    # right=False makes each band include its lower edge and exclude the upper.
    df[new_col] = pd.cut(df[col], edges, right=False, labels=labels)

    return df

# Get absolute differences between time points: 1 -> 2, 2 -> 3 and 1 -> 3
def get_differences(df, col_list):
    """Add pairwise difference columns for each (first, second, last) column triple.

    For a triple ``[c1, c2, c3]`` three columns are written to ``df`` in place,
    all named after the last entry of the triple:

    * ``<c3>_1`` = c2 - c1
    * ``<c3>_2`` = c3 - c2
    * ``<c3>_3`` = c3 - c1

    Missing differences (either operand missing) are replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding every column referenced in ``col_list``; modified in place.
    col_list : list of sequences
        Each entry provides the three column names at positions 0, 1 and 2.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.
    """
    for triple in col_list:
        first, second, last = triple[0], triple[1], triple[2]
        targets = [f'{last}_{step}' for step in (1, 2, 3)]
        operands = [(first, second), (second, last), (first, last)]

        # Write all three raw differences first ...
        for target, (earlier, later) in zip(targets, operands):
            df[target] = df[later] - df[earlier]

        # ... then blank out the gaps.
        for target in targets:
            df[target] = df[target].fillna(0)

    return df

    
# Turn missing values into their own category
def categorize_na(df, col_list):
    """Replace missing values in the given columns with the string "Unknown".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns in ``col_list``; modified in place.
    col_list : list of str
        Columns whose missing entries should become "Unknown".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    placeholder = "Unknown"

    for name in col_list:
        filled = df[name].fillna(placeholder)
        df[name] = filled

    return df


# Drop features
def drop_features(col_list, data_raw=None):
    """Drop the requested columns plus every column that duplicates an earlier one.

    A column counts as a duplicate when all of its values equal those of a
    column further to the left; the left-most occurrence is kept.

    Parameters
    ----------
    col_list : list of str
        Columns to drop unconditionally. The list itself is not modified.
    data_raw : pandas.DataFrame, optional
        Frame to clean. When omitted, a module-level variable named
        ``data_raw`` is used (the behaviour of the original one-argument
        version).

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed and duplicated columns.

    Raises
    ------
    NameError
        If ``data_raw`` is not passed and no module-level ``data_raw`` exists.
    """
    if data_raw is None:
        # Backward compatibility with callers that relied on a global frame.
        data_raw = globals().get('data_raw')
        if data_raw is None:
            raise NameError(
                "name 'data_raw' is not defined: pass the frame as "
                "drop_features(col_list, data_raw=...)"
            )

    # Transposing turns columns into rows, so row-wise duplicated() flags
    # columns whose values repeat an earlier column.
    dup_mask = data_raw.T.duplicated()
    dup_cols_list = dup_mask[dup_mask].index.tolist()

    # Build a fresh, de-duplicated list instead of extending the caller's list.
    to_drop = list(dict.fromkeys([*col_list, *dup_cols_list]))

    data = data_raw.drop(to_drop, axis=1)

    print(f'Raw merged data: {data_raw.shape}')
    print(f'Duplicated columns dropped: {data.shape}')

    return data

In [3]:
def _preprocess_features(X, list_dates, list_balances, col_list, label):
    """Apply the five feature pre-processing steps to one data split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame of the split (without the target column).
    list_dates : list of str
        Date columns passed to ``dates_to_days``.
    list_balances : list of list of str
        Column triples passed to ``get_differences``.
    col_list : list of str
        Columns passed to ``categorize_na``.
    label : str
        Name of the split, used only in the progress messages
        (e.g. 'training', 'validation').

    Returns
    -------
    pandas.DataFrame
        The processed feature frame, without 'customer_postal_code_m1'.
    """
    print(f'Start preproecssing {label} data.\n')

    #1. Change dates to number of years
    print(f'1. Change dates to number of years.')
    X = dates_to_days(X, list_dates, base_date = '2018-01-01')

    #2. Categorize area code by 1000s
    print(f'2. Categorize area code by 1000s.')
    X = bin_area_code(X)

    #3. Differences between time points 1 -> 2, 2 -> 3, 1 -> 3
    print(f'3. Get differences of balances between timepoints.')
    X = get_differences(X, list_balances)

    #4. Change NA to category
    print(f'4. Change NA values to a categorical value, Unknown.')
    X = categorize_na(X, col_list)

    #5. The raw postal code is replaced by its binned version from step 2
    print(f'5. Drop customer_postal_code_m1.\n')
    X = X.drop('customer_postal_code_m1', axis=1)

    print(f'Finished preprocess of {label} data.\n')

    return X


def main():
    """Load the merged data, split it, pre-process both splits and export them.

    Steps:
    1. Load the data files through ``load_data`` and take a copy of the
       'data_merged' frame.
    2. Split features/target 80/20 into train and validation sets with a
       fixed ``random_state`` so the split is reproducible.
    3. Run the same pre-processing on both splits.
    4. Write X_train.csv, X_val.csv, y_train.csv and y_val.csv to ``../data/``.

    Start and end are reported through the project's ``Time`` helper.
    """
    time = Time()
    time.print_start()

    #Load Data
    mypath = "../data/"
    mydata = load_data.get_file_names(mypath)
    data_files = load_data.load_copy_data(mydata, mypath)
    data_raw = data_files['data_merged'].copy()

    #Construct test, train set
    X = data_raw.drop('target',axis=1)
    y = data_raw['target']

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

    # Column groups used by the individual pre-processing steps
    list_dates = ['customer_since_all_m1', 'customer_since_bank_m1', 'customer_birth_date_m1']

    list_balances = [['bal_insurance_21_m1', 'bal_insurance_21_m2', 'bal_insurance_21'],
                     ['bal_insurance_23_m1', 'bal_insurance_23_m2', 'bal_insurance_23'],
                     ['bal_personal_loan_m1', 'bal_personal_loan_m2', 'bal_personal_loan'],
                     ['bal_mortgage_loan_m1', 'bal_mortgage_loan_m2', 'bal_mortgage_loan'],
                     ['bal_current_account_m1', 'bal_current_account_m2', 'bal_current_account'],
                     ['bal_pension_saving_m1', 'bal_pension_saving_m2', 'bal_pension_saving'],
                     ['bal_savings_account_m1', 'bal_savings_account_m2', 'bal_savings_account'],
                    ]

    col_list = ['customer_education_m1',
                'customer_children_m1', 'customer_children_m2', 'customer_children',
                'customer_relationship_m1', 'customer_relationship_m2', 'customer_relationship',
               ]

    # Both splits go through the identical pipeline; only the label in the
    # progress messages differs.
    print()  # blank line separating the loader's output from the progress log
    X_train = _preprocess_features(X_train, list_dates, list_balances, col_list, label='training')
    X_val = _preprocess_features(X_val, list_dates, list_balances, col_list, label='validation')

    # Export X_train, X_val, y_train, y_val
    print(f'Export data to {mypath} as X_train.csv, X_val.csv, y_train.csv, y_val.csv')
    X_train.to_csv(mypath + 'X_train.csv', encoding='utf-8',index=False)
    X_val.to_csv(mypath + 'X_val.csv', encoding='utf-8',index=False)
    y_train.to_csv(mypath + 'y_train.csv', encoding='utf-8',index=False)
    y_val.to_csv(mypath + 'y_val.csv', encoding='utf-8',index=False)

    time.print_end()

In [4]:
# Run the pipeline only when this code is executed directly (e.g. as a notebook
# cell or script), not when its functions are imported from elsewhere.
if __name__ == '__main__':
    main()

--------Start Script--------
--------Start Time: 2022-04-05 15:15:58-------

file name: train_month_1
file name: data_merged
file name: train_month_2
file name: test_month_1
file name: test_month_3
file name: test_month_2
file name: train_month_3_with_target

Start preproecssing training data.

1. Change dates to number of years.
2. Categorize area code by 1000s.
3. Get differences of balances between timepoints.
4. Change NA values to a categorical value, Unknown.
5. Drop customer_postal_code_m1.

Finished preprocess of training data.

Start preproecssing validation data.

1. Change dates to number of years.
2. Categorize area code by 1000s.
3. Get differences of balances between timepoints.
4. Categorize area code by 1000s.
5. Drop customer_postal_code_m1.

Finished preprocess of training data.

Export data to ../data/ as X_train.csv, X_val.csv, y_train.csv, y_val.csv
Total 86399 [sec]
-----End Time : 2022-04-05 15:16:02 ---------
-----END SCRIPT------
