In [1]:
import pandas as pd
import numpy as np
import load_data
from sklearn.model_selection import train_test_split

# Let pandas print up to 999 rows and 999 columns before truncating output.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [2]:
# Load data
# Directory (relative to this notebook) that holds the input files.
mypath = "../data/"
# NOTE(review): get_file_names / load_copy_data live in the project-local
# load_data module; presumably they list the files under mypath and load each
# one -- confirm against that module.
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)

# data_files is indexed by file name; take a copy of the merged data set so
# later changes to `data` do not touch the object stored in data_files.
data = data_files['data_merged'].copy()

file name: X_train_des
file name: y_test_des
file name: X_test_des
file name: train_month_1
file name: data_merged
file name: train_month_2
file name: test_month_1
file name: test_month_3
file name: test_month_2
file name: y_train_des
file name: train_month_3_with_target


In [3]:
# Build the training and validation sets:
# separate the label from the features, then hold out 20% of the rows
# for validation (fixed random_state so the split is reproducible).
y = data['target']
X = data.drop(columns='target')

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Pre-process individual columns

# Convert date columns to the number of years between each date and a base date
def dates_to_days(df, col_list, base_date = '2018-01-01'):
    """Replace date columns with their distance in calendar years from a base date.

    Despite the function name, the result is a difference in calendar years,
    ``abs(base_date.year - date.year)``, not a number of days.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the date columns. It is not modified; the converted
        data is returned in a copy.
    col_list : list of str
        Names of the columns to convert. Their values must be parseable by
        ``pd.to_datetime``.
    base_date : str, default '2018-01-01'
        Reference date (anything ``pd.to_datetime`` accepts); only its year
        is used.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every column in ``col_list`` holds the
        absolute year difference. Missing dates stay missing (NaN).
    """
    # Work on a copy so the caller's frame keeps its original date columns
    # and never receives a temporary helper column.
    df = df.copy()
    base_year = pd.to_datetime(base_date).year

    # Iterate over the col_list argument (the previous version read a
    # notebook-level global instead, ignoring what the caller passed in).
    for col in col_list:
        df[col] = (base_year - pd.to_datetime(df[col]).dt.year).abs()

    return df

# # Change Children Status to binary #Maybe just add NA as category
# def change_children_status(df, col_list, replace_list):
#     for col in col_list:
#         df[col].replace(replace_list, 'yes', inplace=True)
#         df[col].fillna('no', inplace=True)
#     return df
# children_list = ['customer_children', 'customer_children_y', 'customer_children_x']
# replace_list = ['mature', 'young', 'onebaby', 'adolescent', 'preschool', 'grownup']


# Categorize Area Code by 1000s
def bin_area_code(df):
    """Add an ``area_cat`` column that buckets postal codes into blocks of 1000.

    Reads ``df['customer_postal_code_x']`` and assigns each value to a
    left-closed interval ``[k, k + 1000)`` for k = 0, 1000, ..., 9000, labelled
    ``"<k>_area_code"``. Values outside ``[0, 10000)`` end up as missing.

    The column is added to ``df`` in place and the same frame is returned.
    """
    bin_width = 1000
    bin_edges = range(0, 10005, bin_width)
    bin_names = ["{}_area_code".format(lower) for lower in range(0, 10000, bin_width)]

    df['area_cat'] = pd.cut(
        df['customer_postal_code_x'],
        bin_edges,
        labels=bin_names,
        right=False,  # intervals are closed on the left: 1000 falls in the 1000 bucket
    )
    return df

# Get % Change from time point 1 to 2, 2 to 3, 1 to 3
def get_percent_change(df, col_list):
    """Add percentage-change features between three time points.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns named in ``col_list``. The new columns are
        added to this frame in place.
    col_list : list of [str, str, str]
        One entry per quantity: the column names for time point 1, 2 and 3,
        in that order. The third name is also used as the prefix of the new
        columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` with three extra columns per entry:
        ``<third>_1`` (change from point 1 to 2), ``<third>_2`` (2 to 3) and
        ``<third>_3`` (1 to 3), each expressed in percent.

    Notes
    -----
    Whenever the change is undefined -- a missing value, or a baseline of
    zero -- the result is 0. A zero baseline with a non-zero later value
    would otherwise yield +/-inf, which ``fillna`` does not touch and which
    most estimators reject.
    """

    def percentage_change(col1, col2):
        raw = ((col2 - col1) / col1) * 100
        # Division by a zero baseline gives +/-inf (or NaN for 0/0); treat
        # every undefined change the same way and report it as 0.
        return raw.replace([np.inf, -np.inf], np.nan).fillna(0)

    for col in col_list:
        first, second, third = col[0], col[1], col[2]

        df['{}_1'.format(third)] = percentage_change(df[first], df[second])
        df['{}_2'.format(third)] = percentage_change(df[second], df[third])
        df['{}_3'.format(third)] = percentage_change(df[first], df[third])

    return df

    
# Change NA to category
def categorize_na(df, col_list):
    """Turn missing values into an explicit "Unknown" category.

    For every column named in ``col_list`` the missing entries are replaced
    by the string "Unknown". The columns are overwritten in ``df`` itself and
    the same frame is returned.
    """
    placeholder = "Unknown"

    for name in col_list:
        filled = df[name].fillna(placeholder)
        df[name] = filled

    return df
    

In [5]:
### Pre-process training data ###

# --- Inputs for the individual steps (also used for the validation set) ---

# Step 1: date columns that are replaced by a year difference to the base date
list_dates = ['customer_since_all_x', 'customer_since_bank_x', 'customer_birth_date_x']

# Step 3: one [time point 1, time point 2, time point 3] column triple per
# balance type; the "_x" / "_y" / unsuffixed columns hold the three time points
list_balances = [
    ['{}_x'.format(balance), '{}_y'.format(balance), balance]
    for balance in (
        'bal_insurance_21',
        'bal_insurance_23',
        'bal_personal_loan',
        'bal_mortgage_loan',
        'bal_current_account',
        'bal_pension_saving',
        'bal_savings_account',
    )
]

# Step 4: categorical columns whose missing values become their own category
col_list = [
    'customer_education_x',
    'customer_children_x', 'customer_children_y', 'customer_children',
    'customer_relationship_x', 'customer_relationship_y', 'customer_relationship',
]

# --- Run the steps in order ---
X_train = (
    X_train
    .pipe(dates_to_days, list_dates, base_date='2018-01-01')  # 1. dates -> year difference
    .pipe(bin_area_code)                                      # 2. area code in bins of 1000
    .pipe(get_percent_change, list_balances)                  # 3. % change 1->2, 2->3, 1->3
    .pipe(categorize_na, col_list)                            # 4. NA -> category
)

In [6]:
# Spot-check that each pre-processing step behaved as intended.
# Un-comment the probe for the step you want to inspect; only the last
# expression of the cell is displayed.

#1. Date columns should now hold year differences instead of dates
#X_train['customer_since_all_x'].unique() #Changed to years inbetween
# X_train['customer_since_bank_x'].unique()
# X_train['customer_birth_date_x'].unique()

#2. Area codes should be categorized in bins of 1000
# X_train['area_cat'].unique()

#3. % change columns for time point 1 to 2, 2 to 3, 1 to 3 (one sample row)
# print(X_train['bal_insurance_21_1'][43567])
# print(X_train['bal_insurance_21_2'][43567])
# print(X_train['bal_insurance_21_3'][43567])

#4. Missing values should show up as their own category
X_train['customer_children_x'].unique()

array(['no', 'mature', 'grownup', 'Unknown', 'adolescent', 'preschool',
       'young', 'onebaby', 'yes'], dtype=object)

In [7]:
### Pre-process validation data ###

# Apply the same four steps, with the same column lists, as for the training data.
X_val = (
    X_val
    .pipe(dates_to_days, list_dates, base_date='2018-01-01')  # 1. dates -> year difference
    .pipe(bin_area_code)                                      # 2. area code in bins of 1000
    .pipe(get_percent_change, list_balances)                  # 3. % change 1->2, 2->3, 1->3
    .pipe(categorize_na, col_list)                            # 4. NA -> category
)

In [8]:
# Write the four splits to ../data/ as UTF-8 CSV files named after the variables.
for split_name, split_data in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('y_train', y_train),
    ('y_val', y_val),
):
    split_data.to_csv('../data/' + split_name + '.csv', encoding='utf-8')