In [1]:
import pandas as pd
import numpy as np
import load_data
from sklearn.model_selection import train_test_split
from ipynb.fs.full.Get_Base_Data_00 import Time
from datetime import date

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

# Load Data

In [2]:
#Load Data
mypath = "../data/team_data/data/"
mydata = load_data.get_file_names(mypath)
df = load_data.load_copy_data(mydata, mypath)

# Dataset with Spatial Columns
mig_train = df['df_full_train'].copy()
mig_val = df['df_full_test'].copy() #raw data is called test but its validation

file name: test_full_R
file name: X_train_RAW
file name: X_train_R
file name: y_val_R
file name: train_full_R
file name: y_val_RAW
file name: train_month_1
file name: train_month_2
file name: X_val_R
file name: df_full_train
file name: X_val_RAW
file name: y_train_R
file name: df_full_test
file name: test_month_1
file name: test_month_3
file name: test_month_2
file name: y_train_RAW
file name: train_month_3_with_target


# Merge Raw Data

In [3]:
def merge_df(left_df, right_df, merge_on, how, suffixes=(None,None)):
    df_merge = left_df.merge(right_df, on=[merge_on], how=how, suffixes=suffixes)
    print(f"Shape of dataframe: {df_merge.shape}")
    return df_merge

In [4]:
#Merge data with client_id as key

#Merge to make training data/validation data
df_merged = merge_df(df['train_month_1'], df['train_month_2'], 'client_id', 'outer', ('_m1', '_m2'))
df_merged = merge_df(df_merged, df['train_month_3_with_target'], 'client_id', 'outer')

#Merge to make test data
df_test_merged = merge_df(df['test_month_1'], df['test_month_2'], 'client_id', 'outer', ('_m1', '_m2'))
df_test_merged = merge_df(df_test_merged, df['test_month_3'], 'client_id', 'outer')

Shape of dataframe: (63697, 77)
Shape of dataframe: (63697, 116)
Shape of dataframe: (27300, 77)
Shape of dataframe: (27300, 115)


# Drop Features

In [5]:
# THE ONLY ADJUSTEMENTS THAT I MADE (JULIE)
# SO COPY THIS CELL TO YOUR NOTEBOOK AND PASTE IT RIGHT AFTER MERGING THE 3 DATASETS

#Make new feature: customer_commitment
customer_commitment= ['customer_occupation_code','customer_children', 'customer_relationship']

#apply to train/validation dataset
df_merged['customer_commitment'] = df_merged[customer_commitment].apply(lambda x: x.count(), axis=1)

#apply to test dataset
df_test_merged['customer_commitment'] = df_test_merged[customer_commitment].apply(lambda x: x.count(), axis=1)

#Drop features that are too highly correlate
drop_features_m1 = ['homebanking_active_m1', 'has_homebanking_m1',
       'has_insurance_21_m1', 'has_insurance_23_m1',
       'has_life_insurance_fixed_cap_m1',
       'has_life_insurance_decreasing_cap_m1',
       'has_fire_car_other_insurance_m1', 'has_personal_loan_m1',
       'has_mortgage_loan_m1', 'has_current_account_m1',
       'has_pension_saving_m1', 'has_savings_account_m1',
       'has_savings_account_starter_m1', 'has_current_account_starter_m1',
       'customer_gender_m1', 
       'customer_occupation_code_m1',
       'customer_self_employed_m1', 
       'customer_children_m1', 'customer_relationship_m1']

# will be used later so dont remove: 
#'customer_education_m1','customer_postal_code_m1','customer_birth_date_m1',
#'customer_since_all_m1','customer_since_bank_m1',

drop_features_m2 = ['homebanking_active_m2', 'has_homebanking_m2',
       'has_insurance_21_m2', 'has_insurance_23_m2',
       'has_life_insurance_fixed_cap_m2',
       'has_life_insurance_decreasing_cap_m2',
       'has_fire_car_other_insurance_m2', 'has_personal_loan_m2',
       'has_mortgage_loan_m2', 'has_current_account_m2',
       'has_pension_saving_m2', 'has_savings_account_m2',
       'has_savings_account_starter_m2', 'has_current_account_starter_m2',
        'customer_since_all_m2', 'customer_since_bank_m2',
       'customer_gender_m2', 'customer_birth_date_m2',
       'customer_postal_code_m2', 'customer_occupation_code_m2',
       'customer_self_employed_m2', 'customer_education_m2',
       'customer_children_m2', 'customer_relationship_m2']

#following features are already used (m1)
drop_features_m3 = ['has_homebanking','customer_education','customer_postal_code','customer_birth_date',
                   'customer_since_all', 'customer_since_bank']

#apply to train/validation dataset
df_merged = df_merged.drop(drop_features_m1, axis=1)
df_merged = df_merged.drop(drop_features_m2, axis=1)
df_merged = df_merged.drop(drop_features_m3, axis=1)  

#apply to test dataset
df_test_merged = df_test_merged.drop(drop_features_m1, axis=1)
df_test_merged = df_test_merged.drop(drop_features_m2, axis=1)
df_test_merged = df_test_merged.drop(drop_features_m3, axis=1)  

In [6]:
#Find and Drop duplicated features
def drop_dup_features(col_list, df):
    drop_features = col_list
    
    # Duplicated Columns to drop
    dup_cols = df.T.duplicated().reset_index()
    dup_cols_list = dup_cols.loc[dup_cols[0], 'index'].tolist()
    drop_features.extend(dup_cols_list)

    data = df.drop(drop_features, axis=1)
    
    print(f'Dropped features: {drop_features}')
    print(f'Raw merged data: {df.shape}')
    print(f'Duplicated columns dropped: {data.shape}')
    
    return data

In [7]:
#Drop duplicated features and customer education since it is missing 73% of data

#apply to train/validation dataset
df_merged = drop_dup_features(['customer_education_m1'], df_merged)

#apply to test dataset
df_test_merged = drop_dup_features(['customer_education_m1'], df_test_merged)

Dropped features: ['customer_education_m1']
Raw merged data: (63697, 68)
Duplicated columns dropped: (63697, 67)
Dropped features: ['customer_education_m1']
Raw merged data: (27300, 67)
Duplicated columns dropped: (27300, 66)


# Construct Train, Validation Dataset

In [8]:
#Construct train set, validation set
train, val = train_test_split(df_merged, test_size=0.2, random_state=0)

# Define Preprocess Steps

In [9]:
# Pre-process individual columns

# # Change dates to inbetween years
# def dates_to_days(df, col_list, base_date = '2018-01-01'):
#     #Convert date columns into datetime format
#     df['base_dt'] = pd.to_datetime(base_date)
#     df[col_list] = df[col_list].apply(pd.to_datetime)

#     for col in col_list:
#         df[col] = abs(df['base_dt'].dt.year - df[col].dt.year)

#     #Drop columns (base_dt)
#     df = df.drop('base_dt', axis=1)

#     return df
def age(birthdate):
    # Get today's date object
#     today = date.today()
    today = pd.to_datetime('2018-01-01')
    
    # A bool that represents if today's day/month precedes the birth day/month
    one_or_zero = ((today.month, today.day) < (birthdate.month, birthdate.day))
    
    # Calculate the difference in years from the date object's components
    year_difference = today.year - birthdate.year
    age = year_difference - one_or_zero
    
    return age

# Change dates to inbetween years
def dates_to_years(df, col_list, base_date = '2018-01-01'):
    #Convert date columns into datetime format
    df['base_dt'] = pd.to_datetime(base_date)
    df[col_list] = df[col_list].apply(pd.to_datetime)

    for col in col_list:
        df[col] = df[col].apply(lambda x : age(x))
        
    #Drop columns (base_dt)
    df = df.drop('base_dt', axis=1)

    return df

# Categorize Area Code by 1000s
def bin_area_code(df):
    # Bin area codes by 1000s 
    labels = ["{}_area_code".format(i) for i in range(0, 10000, 1000)]
    df['area_cat'] = pd.cut(df['customer_postal_code_m1'], range(0, 10005, 1000), right=False, labels=labels)
    
    return df

# Get % Change from time point 1 to 2, 2 to 3, 1 to 3
def get_differences(df, col_list):

    def difference(col1,col2):
        return col2-col1

    for col in col_list:
        df['{}_1'.format(col[2])] = difference(df[col[0]],df[col[1]]) 
        df['{}_2'.format(col[2])] = difference(df[col[1]],df[col[2]]) 
        df['{}_3'.format(col[2])] = difference(df[col[0]],df[col[2]]) 

        df['{}_1'.format(col[2])] = df['{}_1'.format(col[2])].fillna(0)
        df['{}_2'.format(col[2])] = df['{}_2'.format(col[2])].fillna(0)
        df['{}_3'.format(col[2])] = df['{}_3'.format(col[2])].fillna(0)    

    return df

# Bin Age
def bin_age(df, colname, upperLim):
    
    # Get all data below upper age limit : 100 years old in our case
    df_mask = df[colname]  < upperLim
    filtered_df = df[df_mask]
    
    # since lower bound is not included even when i set include_lowest=True, I will lower the lowerbound
    binInterval = pd.IntervalIndex.from_tuples([(17,20), (20,26), (26,52), (52,73), (73,99)])
    filtered_df['age_cat'] = pd.cut(filtered_df[colname], bins = binInterval, include_lowest=True)
    filtered_df['age_cat'] = filtered_df['age_cat'].astype(str)
    
    return filtered_df# Define Preprocess Steps


# Impute missing values
def impute_col(df, dtype, collist):
    # numeric columns replace NAs with mean
    if dtype == "numeric":
        for col in collist:
            df[col] = df[col].fillna(df[col].mean())
    # categorical columns replace NAs with mode
    elif dtype == "categorical":
        for col in collist:
            df[col] = df[col].fillna(df[col].mode()[0])
    else:
        print("Please define dtype as numeric or categorical")
    
    return df

# Preprocess Training Data

In [10]:
### Pre-process training data ###
print(f'\nStart preproecssing training data.\n')

#1. Change dates to inbetween years
print(f'1. Change dates to number of years.')
list_dates = ['customer_since_all_m1', 'customer_since_bank_m1', 'customer_birth_date_m1']
train = dates_to_years(train, list_dates, base_date = '2018-01-01')

#2. Categorize Area Code by 1000s
print(f'2. Categorize area code by 1000s.')
train = bin_area_code(train)

#3. Get % Change from time point 1 to 2, 2 to 3, 1 to 3
print(f'3. Get differences of balances between timepoints.')
list_balances = [['bal_insurance_21_m1', 'bal_insurance_21_m2', 'bal_insurance_21'],
                 ['bal_insurance_23_m1', 'bal_insurance_23_m2', 'bal_insurance_23'],
                 ['bal_personal_loan_m1', 'bal_personal_loan_m2', 'bal_personal_loan'],
                 ['bal_mortgage_loan_m1', 'bal_mortgage_loan_m2', 'bal_mortgage_loan'],
                 ['bal_current_account_m1', 'bal_current_account_m2', 'bal_current_account'],
                 ['bal_pension_saving_m1', 'bal_pension_saving_m2', 'bal_pension_saving'],
                 ['bal_savings_account_m1', 'bal_savings_account_m2', 'bal_savings_account'],
                ]

train = get_differences(train, list_balances)

#4. Add Spatial column (taken from miguel's dataset)
print(f'4. Add Spatial Column.\n')
train["Spatialclusters"] = mig_train["Spatialclusters"]

#5. Bin Ages
print(f'5. Bin Ages.\n') #here some rows drop so do everything impt above
train = bin_age(train, 'customer_birth_date_m1', 100)

#6. Fill in missing values
print(f'6. Impute Columns.\n')
# X_train = impute_col(X_train, 'numeric', collist) # no columns to be imputed
imputelist = ['customer_since_all_m1', 'customer_since_bank_m1',
           'customer_occupation_code', 'customer_children',
           'customer_relationship']
train = impute_col(train, 'categorical', imputelist)

print(f'Finished preprocess of training data.\n')



Start preproecssing training data.

1. Change dates to number of years.
2. Categorize area code by 1000s.
3. Get differences of balances between timepoints.
4. Add Spatial Column.

5. Bin Ages.

6. Impute Columns.

Finished preprocess of training data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['age_cat'] = pd.cut(filtered_df[colname], bins = binInterval, include_lowest=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['age_cat'] = filtered_df['age_cat'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])
A valu

# Preprocess Validation Data

In [11]:
### Pre-process validation data ###
print(f'Start preproecssing validation data.\n')

#1. Change dates to inbetween years
print(f'1. Change dates to number of years.')
val = dates_to_years(val, list_dates, base_date = '2018-01-01')

#2. Categorize Area Code by 1000s
print(f'2. Categorize area code by 1000s.')
val = bin_area_code(val)

#3. Get % Change from time point 1 to 2, 2 to 3, 1 to 3
print(f'3. Get differences of balances between timepoints.')
val = get_differences(val, list_balances)

#4. Add Spatial column (taken from miguel's dataset)
print(f'4. Add Spatial Column.\n')
val["Spatialclusters"] = mig_val["Spatialclusters"]

#5. Bin Ages
print(f'5. Bin Ages.\n')
val = bin_age(val, 'customer_birth_date_m1', 100)

#6. Fill in missing values
print(f'6. Impute Columns.\n')
# X_val = impute_col(X_val, 'numeric', collist)
val = impute_col(val, 'categorical', imputelist)

print(f'Finished preprocess of validation data.\n')

Start preproecssing validation data.

1. Change dates to number of years.
2. Categorize area code by 1000s.
3. Get differences of balances between timepoints.
4. Add Spatial Column.

5. Bin Ages.

6. Impute Columns.

Finished preprocess of validation data.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['age_cat'] = pd.cut(filtered_df[colname], bins = binInterval, include_lowest=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['age_cat'] = filtered_df['age_cat'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mode()[0])
A valu

# Preprocess Test Data

In [14]:
### Pre-process test data ###
print(f'Start preproecssing test data.\n')

#1. Change dates to inbetween years
print(f'1. Change dates to number of years.')
test = dates_to_years(df_test_merged, list_dates, base_date = '2018-01-01')

#2. Categorize Area Code by 1000s
print(f'2. Categorize area code by 1000s.')
test = bin_area_code(test)

#3. Get % Change from time point 1 to 2, 2 to 3, 1 to 3
print(f'3. Get differences of balances between timepoints.')
test = get_differences(test, list_balances)

#4. Add Spatial column (taken from miguel's dataset)
# print(f'4. Add Spatial Column.\n') #Miguel will add binnings
# test["Spatialclusters"] = mig_TEST["Spatialclusters"]

#5. Bin Ages
# print(f'5. Bin Ages.\n') #Miguel will add binnings
# test = bin_age(test, 'customer_birth_date_m1', 100)

#6. Fill in missing values
print(f'6. Impute Columns.\n')
# test = impute_col(test, 'numeric', collist)
test = impute_col(test, 'categorical', imputelist)

print(f'Finished preprocess of test data.\n')

Start preproecssing test data.

1. Change dates to number of years.
2. Categorize area code by 1000s.
3. Get differences of balances between timepoints.
6. Impute Columns.

Finished preprocess of validation data.



# Export Files

In [15]:
# Export X_train, X_val, y_train, y_val
print('Export data to /Downloads/ as train_final.csv, validation_final.csv, and test_final.csv.')
train.to_csv('/Users/rurikoimai/Downloads/train_final.csv', encoding='utf-8',index=False)
val.to_csv('/Users/rurikoimai/Downloads/validation_final.csv', encoding='utf-8',index=False)
test.to_csv('/Users/rurikoimai/Downloads/test_final.csv', encoding='utf-8',index=False)

Export data to /Downloads/ as train_final.csv, validation_final.csv, and test_final.csv.


# Check Processes (if things work as intended)...

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Construct mini dataset to check processes
dp = [["no",1,2,3,4,5],["yay",6,7,8,9,10],["yay",11,12,np.nan,14,15],["yay",16,17,18,19,20],[np.nan,21,22,23,24,25]]
df_split_test = pd.DataFrame(dp)
df_split_test

In [None]:
# Check reproducibility of train, val split 
X_train, X_val = train_test_split(df_split_test, test_size=0.2, random_state=0)
print(X_train)
print(X_val)

In [None]:
# check for numerical imputation: correct result should be (3+8+18+23)/4 = 13.0
impute_col(df_split_test, 'numeric', [3])

In [None]:
# check for categorical imputation: correct result should be "yay"
impute_col(df_split_test, 'categorical', [0])

In [16]:
train.head() #, X_val

Unnamed: 0,client_id,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_birth_date_m1,customer_postal_code_m1,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,homebanking_active,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_gender,customer_occupation_code,customer_self_employed,customer_children,customer_relationship,target,customer_commitment,area_cat,bal_insurance_21_1,bal_insurance_21_2,bal_insurance_21_3,bal_insurance_23_1,bal_insurance_23_2,bal_insurance_23_3,bal_personal_loan_1,bal_personal_loan_2,bal_personal_loan_3,bal_mortgage_loan_1,bal_mortgage_loan_2,bal_mortgage_loan_3,bal_current_account_1,bal_current_account_2,bal_current_account_3,bal_pension_saving_1,bal_pension_saving_2,bal_pension_saving_3,bal_savings_account_1,bal_savings_account_2,bal_savings_account_3,Spatialclusters,age_cat
45889,e5b6c98384d572c48a395e14d51fea32,0,0,0,0,0,0,0,0,0,20250,0,0,2.0,2.0,37.0,37.0,75,4100,0,0,0,0,0,0,0,0,0,20250,0,0,2.0,2.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,20250,0,0,2.0,2.0,1,9.0,0,no,couple,0,3,4000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[-0.074,0.08)","(73, 99]"
11145,dac57bbe7aceea320ecb282fbb3053d1,0,0,0,0,1810,8340,0,2800,0,15680,0,0,2.0,2.0,23.0,23.0,66,6250,0,0,0,0,1810,8090,0,3550,0,17960,0,0,2.0,2.0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,1810,7840,0,3520,0,15510,0,0,2.0,2.0,2,9.0,1,no,single,0,2,6000_area_code,0,0,0,0,0,0,-250,-250,-500,0,0,0,750,-30,720,0,0,0,2280,-2450,-170,"[-0.23,-0.074)","(52, 73]"
31185,e67d98937e84678ccc4b3fe2583ce4c5,0,0,0,0,220,0,0,2540,0,15280,0,0,1.0,1.0,25.0,8.0,65,6230,0,0,0,0,220,0,0,1670,0,15280,0,0,1.0,1.0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,220,0,0,2490,0,12280,0,0,1.0,1.0,1,9.0,0,no,couple,0,2,6000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,-870,820,-50,0,0,0,0,-3000,-3000,"[-0.23,-0.074)","(52, 73]"
23436,37a21e2841f17eb2c293209dd1c93651,0,0,0,0,0,0,0,0,0,5870,0,0,1.0,1.0,20.0,20.0,49,2275,0,0,0,0,0,0,0,0,0,5870,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5870,0,0,1.0,1.0,2,9.0,0,mature,couple,0,3,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[0.08,0.24)","(26, 52]"
49391,9059dda2fb582e4f804d98eb4c4e1a5d,0,0,0,0,90,2580,0,2520,4840,10190,0,0,2.0,1.0,36.0,36.0,57,4701,0,0,0,0,90,2440,0,2930,4880,9780,0,0,2.0,1.0,1,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,90,2300,0,2390,4930,10000,0,0,2.0,1.0,1,9.0,0,no,couple,0,3,4000_area_code,0,0,0,0,0,0,-140,-140,-280,0,0,0,410,-540,-130,40,50,90,-410,220,-190,"[0.08,0.24)","(52, 73]"


In [17]:
mig_train.head()

Unnamed: 0.1,Unnamed: 0,target,client_id,homebanking_active_m1,has_homebanking_m1,has_insurance_21_m1,has_insurance_23_m1,has_life_insurance_fixed_cap_m1,has_life_insurance_decreasing_cap_m1,has_fire_car_other_insurance_m1,has_personal_loan_m1,has_mortgage_loan_m1,has_current_account_m1,has_pension_saving_m1,has_savings_account_m1,has_savings_account_starter_m1,has_current_account_starter_m1,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_gender_m1,customer_birth_date_m1,customer_postal_code_m1,customer_occupation_code_m1,customer_self_employed_m1,customer_children_m1,customer_relationship_m1,homebanking_active_m2,has_homebanking_m2,has_insurance_21_m2,has_insurance_23_m2,has_life_insurance_fixed_cap_m2,has_life_insurance_decreasing_cap_m2,has_fire_car_other_insurance_m2,has_personal_loan_m2,has_mortgage_loan_m2,has_current_account_m2,has_pension_saving_m2,has_savings_account_m2,has_current_account_starter_m2,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,customer_self_employed_m2,customer_children_m2,customer_relationship_m2,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_self_employed,customer_children,customer_relationship,CODPOSS,COMMUNE,LAT,LONG,Latitude,Longitude,fit_spatial,Spatialclusters
0,1,0,e5b6c98384d572c48a395e14d51fea32,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,20250,0,0,2,2,1981-01,1981-01,1,1942-10,4100,9.0,0,no,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0,0,0.0,0,0,20250,0,0,2,2,0,no,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,20250,0,0,2,2,0,no,couple,4100,SERAING,50.6,5.5,50.6,5.5,0.227937,"[0.08,0.24)"
1,2,0,dac57bbe7aceea320ecb282fbb3053d1,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0.0,0.0,1810,8340,0.0,2800,0,15680,0,0,2,2,1994-09,1994-09,2,1951-11,6250,9.0,1,no,,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0.0,0.0,1810,8090,0.0,3550,0,17960,0,0,2,2,1,no,,0,1,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0.0,0.0,1810,7840,0.0,3520,0,15510,0,0,2,2,1,,single,6250,AISEAU-PRESLES,50.4,4.583333,50.4,4.583333,0.29226,"[0.24,0.42]"
2,3,0,e67d98937e84678ccc4b3fe2583ce4c5,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0.0,0.0,220,0,0.0,2540,0,15280,0,0,1,1,1992-03,2009-04,1,1952-11,6230,,0,no,couple,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0.0,0.0,220,0,0.0,1670,0,15280,0,0,1,1,0,no,couple,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0.0,0.0,220,0,0.0,2490,0,12280,0,0,1,1,0,no,couple,6230,PONT-A-CELLES,50.5,4.35,50.5,4.35,0.287407,"[0.24,0.42]"
3,4,0,37a21e2841f17eb2c293209dd1c93651,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,5870,0,0,1,1,1997-03,1997-03,2,1968-05,2275,9.0,0,mature,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0,0,0.0,0,0,5870,0,0,1,1,0,mature,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,5870,0,0,1,1,0,mature,couple,2275,LILLE,51.233333,4.816667,51.233333,4.816667,-0.158288,"[-0.23,-0.074)"
4,5,0,9059dda2fb582e4f804d98eb4c4e1a5d,1,1,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0.0,0.0,90,2580,0.0,2520,4840,10190,0,0,2,1,1981-03,1981-03,1,1960-05,4701,9.0,0,grownup,couple,1,1,0,0,0,0,1,1,0,1,1,1,0,0,0,0.0,0.0,90,2440,0.0,2930,4880,9780,0,0,2,1,0,grownup,couple,1,1,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0.0,0.0,90,2300,0.0,2390,4930,10000,0,0,2,1,0,no,couple,4701,,,,,,0.098374,"[0.08,0.24)"


In [18]:
val.head()

Unnamed: 0,client_id,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_birth_date_m1,customer_postal_code_m1,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,homebanking_active,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_gender,customer_occupation_code,customer_self_employed,customer_children,customer_relationship,target,customer_commitment,area_cat,bal_insurance_21_1,bal_insurance_21_2,bal_insurance_21_3,bal_insurance_23_1,bal_insurance_23_2,bal_insurance_23_3,bal_personal_loan_1,bal_personal_loan_2,bal_personal_loan_3,bal_mortgage_loan_1,bal_mortgage_loan_2,bal_mortgage_loan_3,bal_current_account_1,bal_current_account_2,bal_current_account_3,bal_pension_saving_1,bal_pension_saving_2,bal_pension_saving_3,bal_savings_account_1,bal_savings_account_2,bal_savings_account_3,Spatialclusters,age_cat
50650,5a69d35f52c1b944ea938723ef5d8ab8,0,0,0,0,0,0,0,0,0,14830,0,0,1.0,1.0,8.0,8.0,58,8900,0,0,0,0,0,0,0,0,0,14830,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,14830,0,0,1.0,1.0,2,9.0,0,no,couple,0,1,8000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(52, 73]"
52809,b89cef910c72af5dc3ea44fccac6d976,0,0,0,0,0,0,18550,9540,0,0,0,0,1.0,1.0,17.0,17.0,50,2610,0,0,0,0,0,0,18370,10380,0,0,0,0,1.0,1.0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,18190,10350,0,0,0,0,1.0,1.0,1,9.0,0,adolescent,couple,0,3,2000_area_code,0,0,0,0,0,0,0,0,0,-180,-180,-360,840,-30,810,0,0,0,0,0,0,,"(26, 52]"
4861,c4044a6fbd0d7f4998ac12ea0abaf072,0,0,0,0,0,0,0,0,0,10340,0,0,1.0,1.0,4.0,4.0,63,6200,0,0,0,0,0,0,0,0,0,10340,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10340,0,0,1.0,1.0,2,9.0,0,no,couple,0,1,6000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[-0.23,-0.074)","(52, 73]"
22866,b798a4df52ad7d1d550ae0dc6089f134,0,0,0,0,800,0,0,0,0,37490,0,0,1.0,1.0,21.0,18.0,79,3590,0,0,0,0,800,0,0,0,0,37490,0,0,1.0,1.0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,800,0,0,0,0,37490,0,0,1.0,1.0,1,9.0,0,no,couple,0,3,3000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(73, 99]"
62496,ffb93657600876adef088c352dfc23be,0,0,0,0,1360,0,36590,2730,34040,30060,0,0,2.0,1.0,25.0,25.0,54,5000,0,0,0,0,1420,0,36210,3760,34330,25060,0,0,2.0,1.0,1,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,1420,0,35820,1950,34700,23060,0,0,2.0,1.0,2,9.0,0,no,couple,0,1,5000_area_code,0,0,0,0,0,0,0,0,0,-380,-390,-770,1030,-1810,-780,290,370,660,-5000,-2000,-7000,,"(52, 73]"


In [19]:
mig_val.head()

Unnamed: 0.1,Unnamed: 0,target,client_id,homebanking_active_m1,has_homebanking_m1,has_insurance_21_m1,has_insurance_23_m1,has_life_insurance_fixed_cap_m1,has_life_insurance_decreasing_cap_m1,has_fire_car_other_insurance_m1,has_personal_loan_m1,has_mortgage_loan_m1,has_current_account_m1,has_pension_saving_m1,has_savings_account_m1,has_savings_account_starter_m1,has_current_account_starter_m1,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_gender_m1,customer_birth_date_m1,customer_postal_code_m1,customer_occupation_code_m1,customer_self_employed_m1,customer_children_m1,customer_relationship_m1,homebanking_active_m2,has_homebanking_m2,has_insurance_21_m2,has_insurance_23_m2,has_life_insurance_fixed_cap_m2,has_life_insurance_decreasing_cap_m2,has_fire_car_other_insurance_m2,has_personal_loan_m2,has_mortgage_loan_m2,has_current_account_m2,has_pension_saving_m2,has_savings_account_m2,has_current_account_starter_m2,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,customer_self_employed_m2,customer_children_m2,customer_relationship_m2,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_self_employed,customer_children,customer_relationship,CODPOSS,COMMUNE,LAT,LONG,fit_spatial,Spatialclusters
0,1,0,5a69d35f52c1b944ea938723ef5d8ab8,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,14830,0,0,1,1,2009-12,2009-12,2,1959-04,8900,9.0,0,mature,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,0.0,0,0,14830,0,0,1,1,0,mature,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,14830,0,0,1,1,0,,,8900,IEPER,50.85,2.883333,-0.181607,"[-0.23,-0.074)"
1,2,0,b89cef910c72af5dc3ea44fccac6d976,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0.0,0,0,18550.0,9540,0,0,0,0,1,1,2001-01,2001-01,1,1967-02,2610,9.0,0,adolescent,couple,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0.0,0,0,18370.0,10380,0,0,0,0,1,1,0,adolescent,couple,1,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0.0,0,0,18190.0,10350,0,0,0,0,1,1,0,adolescent,couple,2610,,,,0.003099,"[-0.074,0.08)"
2,3,0,c4044a6fbd0d7f4998ac12ea0abaf072,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,10340,0,0,1,1,2013-11,2013-11,2,1954-04,6200,9.0,0,,,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,0.0,0,0,10340,0,0,1,1,0,,,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,10340,0,0,1,1,0,,,6200,CHATELET,50.4,4.516667,0.284887,"[0.24,0.42]"
3,4,0,b798a4df52ad7d1d550ae0dc6089f134,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0.0,800,0,0.0,0,0,37490,0,0,1,1,1996-12,1999-09,1,1938-10,3590,9.0,0,no,couple,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0.0,800,0,0.0,0,0,37490,0,0,1,1,0,no,couple,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0.0,800,0,0.0,0,0,37490,0,0,1,1,0,no,couple,3590,DIEPENBEEK,50.9,5.416667,0.083249,"[0.08,0.24)"
4,5,0,ffb93657600876adef088c352dfc23be,1,1,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0.0,1360,0,36590.0,2730,34040,30060,0,0,2,1,1992-06,1992-06,2,1964-01,5000,9.0,0,,,1,1,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0.0,1420,0,36210.0,3760,34330,25060,0,0,2,1,0,,,1,1,0,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0.0,1420,0,35820.0,1950,34700,23060,0,0,2,1,0,,,5000,NAMUR,50.45,4.85,0.303357,"[0.24,0.42]"


In [20]:
train.tail()

Unnamed: 0,client_id,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_birth_date_m1,customer_postal_code_m1,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,homebanking_active,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_gender,customer_occupation_code,customer_self_employed,customer_children,customer_relationship,target,customer_commitment,area_cat,bal_insurance_21_1,bal_insurance_21_2,bal_insurance_21_3,bal_insurance_23_1,bal_insurance_23_2,bal_insurance_23_3,bal_personal_loan_1,bal_personal_loan_2,bal_personal_loan_3,bal_mortgage_loan_1,bal_mortgage_loan_2,bal_mortgage_loan_3,bal_current_account_1,bal_current_account_2,bal_current_account_3,bal_pension_saving_1,bal_pension_saving_2,bal_pension_saving_3,bal_savings_account_1,bal_savings_account_2,bal_savings_account_3,Spatialclusters,age_cat
45891,9c1eeab2c7613fcde296ec0520baa2f5,0,0,0,0,180,0,0,640,0,43400,0,0,2.0,1.0,20.0,20.0,85,9120,0,0,0,0,180,0,0,780,0,44150,0,0,2.0,1.0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,180,0,0,580,0,44900,0,0,2.0,1.0,1,9.0,0,no,single,0,2,9000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,140,-200,-60,0,0,0,750,750,1500,"[-0.23,-0.074)","(73, 99]"
52416,389d1bf6c04c3ccfaf400fe0cbd300c2,0,0,0,0,0,0,0,0,0,11460,0,0,1.0,1.0,15.0,15.0,78,2840,0,0,0,0,0,0,0,0,0,11460,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,11460,0,0,1.0,1.0,1,9.0,0,no,single,0,2,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(73, 99]"
42613,a6298522e256fe6f9c5b05cc9fd04ca2,0,0,0,480,0,0,0,2580,0,10200,0,0,2.0,1.0,17.0,16.0,42,4500,0,0,0,480,0,0,0,1230,0,10780,0,0,2.0,1.0,1,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0,480,0,0,0,850,0,10280,0,0,2.0,1.0,2,9.0,0,adolescent,couple,0,2,4000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,-1350,-380,-1730,0,0,0,580,-500,80,"[-0.23,-0.074)","(26, 52]"
43567,8a02a3969b8da13c88dbffca568969a7,6270,0,0,0,0,0,0,0,0,25150,0,0,1.0,1.0,16.0,16.0,40,9230,6280,0,0,0,0,0,0,0,0,25150,0,0,1.0,1.0,0,1,0,0,0,0,0,0,0,0,1,0,0,6430,0,0,0,0,0,0,0,0,25150,0,0,1.0,1.0,2,9.0,1,no,single,0,2,9000_area_code,10,150,160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[-0.23,-0.074)","(26, 52]"
2732,3a0dcc5c4c2c7c36d4cfb08f5766a53c,0,0,0,0,0,0,0,0,0,10420,0,0,1.0,1.0,18.0,18.0,41,2860,0,0,0,0,0,0,0,0,0,10520,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,10620,0,0,1.0,1.0,2,9.0,0,young,couple,0,3,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,100,100,200,"[0.08,0.24)","(26, 52]"


In [21]:
mig_train.tail()

Unnamed: 0.1,Unnamed: 0,target,client_id,homebanking_active_m1,has_homebanking_m1,has_insurance_21_m1,has_insurance_23_m1,has_life_insurance_fixed_cap_m1,has_life_insurance_decreasing_cap_m1,has_fire_car_other_insurance_m1,has_personal_loan_m1,has_mortgage_loan_m1,has_current_account_m1,has_pension_saving_m1,has_savings_account_m1,has_savings_account_starter_m1,has_current_account_starter_m1,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_gender_m1,customer_birth_date_m1,customer_postal_code_m1,customer_occupation_code_m1,customer_self_employed_m1,customer_children_m1,customer_relationship_m1,homebanking_active_m2,has_homebanking_m2,has_insurance_21_m2,has_insurance_23_m2,has_life_insurance_fixed_cap_m2,has_life_insurance_decreasing_cap_m2,has_fire_car_other_insurance_m2,has_personal_loan_m2,has_mortgage_loan_m2,has_current_account_m2,has_pension_saving_m2,has_savings_account_m2,has_current_account_starter_m2,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,customer_self_employed_m2,customer_children_m2,customer_relationship_m2,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_self_employed,customer_children,customer_relationship,CODPOSS,COMMUNE,LAT,LONG,Latitude,Longitude,fit_spatial,Spatialclusters
50952,50953,0,9c1eeab2c7613fcde296ec0520baa2f5,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0.0,0.0,180,0,0.0,640,0,43400,0,0,2,1,1997-10,1997-10,1,1932-11,9120,9.0,0,,single,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0.0,0.0,180,0,0.0,780,0,44150,0,0,2,1,0,,single,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0.0,0.0,180,0,0.0,580,0,44900,0,0,2,1,0,,single,9120,BEVEREN,51.2,4.25,51.2,4.25,-0.067207,"[-0.074,0.08)"
50953,50954,0,389d1bf6c04c3ccfaf400fe0cbd300c2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,11460,0,0,1,1,2002-11,2002-11,1,1939-11,2840,9.0,0,,single,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0,0,0.0,0,0,11460,0,0,1,1,0,,single,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,11460,0,0,1,1,0,,single,2840,RUMST,51.066667,4.416667,51.066667,4.416667,0.049328,"[-0.074,0.08)"
50954,50955,0,a6298522e256fe6f9c5b05cc9fd04ca2,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0.0,480.0,0,0,0.0,2580,0,10200,0,0,2,1,2001-01,2001-07,2,1975-07,4500,,0,adolescent,couple,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0,0.0,480.0,0,0,0.0,1230,0,10780,0,0,2,1,0,adolescent,couple,1,1,0,0,0,1,1,0,0,1,0,1,0,0,0,0,0.0,480.0,0,0,0.0,850,0,10280,0,0,2,1,0,adolescent,couple,4500,HUY,50.516667,5.233333,50.516667,5.233333,0.304953,"[0.24,0.42]"
50955,50956,0,8a02a3969b8da13c88dbffca568969a7,0,1,1,0,0,0,0,0,0,0,0,1,0,0,6270,0,0.0,0.0,0,0,0.0,0,0,25150,0,0,1,1,2001-06,2001-12,2,1977-03,9230,9.0,1,,single,0,1,1,0,0,0,0,0,0,0,0,1,0,6280,0,0.0,0.0,0,0,0.0,0,0,25150,0,0,1,1,1,,single,0,1,1,0,0,0,0,0,0,0,0,1,0,0,6430,0,0.0,0.0,0,0,0.0,0,0,25150,0,0,1,1,1,,single,9230,WETTEREN,51.0,3.883333,51.0,3.883333,-0.144622,"[-0.23,-0.074)"
50956,50957,0,3a0dcc5c4c2c7c36d4cfb08f5766a53c,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,10420,0,0,1,1,2000-01,2000-01,2,1976-10,2860,9.0,0,young,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0,0,0.0,0,0,10520,0,0,1,1,0,young,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0.0,0,0,0.0,0,0,10620,0,0,1,1,0,young,couple,2860,SINT-KATELIJNE-WAVER,51.066667,4.533333,51.066667,4.533333,0.059275,"[-0.074,0.08)"


In [22]:
val.tail()

Unnamed: 0,client_id,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_birth_date_m1,customer_postal_code_m1,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,homebanking_active,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_gender,customer_occupation_code,customer_self_employed,customer_children,customer_relationship,target,customer_commitment,area_cat,bal_insurance_21_1,bal_insurance_21_2,bal_insurance_21_3,bal_insurance_23_1,bal_insurance_23_2,bal_insurance_23_3,bal_personal_loan_1,bal_personal_loan_2,bal_personal_loan_3,bal_mortgage_loan_1,bal_mortgage_loan_2,bal_mortgage_loan_3,bal_current_account_1,bal_current_account_2,bal_current_account_3,bal_pension_saving_1,bal_pension_saving_2,bal_pension_saving_3,bal_savings_account_1,bal_savings_account_2,bal_savings_account_3,Spatialclusters,age_cat
14294,402dc50b8db3399ac064c7e3e271123f,0,0,0,0,0,0,0,0,0,6600,0,0,1.0,1.0,19.0,19.0,48,9031,0,0,0,0,0,0,0,0,0,6600,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,6600,0,0,1.0,1.0,2,9.0,0,adolescent,couple,0,3,9000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(26, 52]"
4953,0b8025fef0fd26f4d7187331b5c5b119,0,0,0,0,0,0,0,0,0,29910,0,0,1.0,1.0,20.0,20.0,65,2100,0,0,0,0,0,0,0,0,0,29710,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,29710,0,0,1.0,1.0,2,9.0,0,no,single,0,2,2000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-200,0,-200,"[-0.23,-0.074)","(52, 73]"
36328,d54fe1183d5a8b9deb2a0d6b8c8e165d,0,0,0,0,0,0,0,0,0,12940,0,0,1.0,1.0,10.0,10.0,44,9070,0,0,0,0,0,0,0,0,0,12940,0,0,1.0,1.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,12940,0,0,1.0,1.0,1,9.0,1,young,couple,0,3,9000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(26, 52]"
19929,41c4dd415ff1d090154f70096b4a11af,0,0,0,0,0,0,0,0,0,30980,0,0,1.0,1.0,37.0,37.0,70,8340,0,0,0,0,0,0,0,0,0,30980,0,0,1.0,1.0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,30980,0,0,1.0,1.0,1,9.0,0,no,couple,0,3,8000_area_code,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,"(52, 73]"
22269,87ea57e04dabe9262fc20a62c68ad87d,860,0,0,42810,150,0,38440,2450,0,10420,0,0,1.0,1.0,9.0,9.0,41,1060,860,0,0,42810,150,0,37890,4280,0,6420,0,0,1.0,1.0,1,1,0,0,1,1,0,1,1,0,1,0,0,860,0,0,42810,150,0,37340,3870,0,6420,0,0,1.0,1.0,2,7.0,0,no,couple,0,1,1000_area_code,0,0,0,0,0,0,0,0,0,-550,-550,-1100,1830,-410,1420,0,0,0,-4000,0,-4000,,"(26, 52]"


In [23]:
mig_val.tail()

Unnamed: 0.1,Unnamed: 0,target,client_id,homebanking_active_m1,has_homebanking_m1,has_insurance_21_m1,has_insurance_23_m1,has_life_insurance_fixed_cap_m1,has_life_insurance_decreasing_cap_m1,has_fire_car_other_insurance_m1,has_personal_loan_m1,has_mortgage_loan_m1,has_current_account_m1,has_pension_saving_m1,has_savings_account_m1,has_savings_account_starter_m1,has_current_account_starter_m1,bal_insurance_21_m1,bal_insurance_23_m1,cap_life_insurance_fixed_cap_m1,cap_life_insurance_decreasing_cap_m1,prem_fire_car_other_insurance_m1,bal_personal_loan_m1,bal_mortgage_loan_m1,bal_current_account_m1,bal_pension_saving_m1,bal_savings_account_m1,bal_savings_account_starter_m1,bal_current_account_starter_m1,visits_distinct_so_m1,visits_distinct_so_areas_m1,customer_since_all_m1,customer_since_bank_m1,customer_gender_m1,customer_birth_date_m1,customer_postal_code_m1,customer_occupation_code_m1,customer_self_employed_m1,customer_children_m1,customer_relationship_m1,homebanking_active_m2,has_homebanking_m2,has_insurance_21_m2,has_insurance_23_m2,has_life_insurance_fixed_cap_m2,has_life_insurance_decreasing_cap_m2,has_fire_car_other_insurance_m2,has_personal_loan_m2,has_mortgage_loan_m2,has_current_account_m2,has_pension_saving_m2,has_savings_account_m2,has_current_account_starter_m2,bal_insurance_21_m2,bal_insurance_23_m2,cap_life_insurance_fixed_cap_m2,cap_life_insurance_decreasing_cap_m2,prem_fire_car_other_insurance_m2,bal_personal_loan_m2,bal_mortgage_loan_m2,bal_current_account_m2,bal_pension_saving_m2,bal_savings_account_m2,bal_savings_account_starter_m2,bal_current_account_starter_m2,visits_distinct_so_m2,visits_distinct_so_areas_m2,customer_self_employed_m2,customer_children_m2,customer_relationship_m2,homebanking_active,has_homebanking,has_insurance_21,has_insurance_23,has_life_insurance_fixed_cap,has_life_insurance_decreasing_cap,has_fire_car_other_insurance,has_personal_loan,has_mortgage_loan,has_current_account,has_pension_saving,has_savings_account,has_savings_account_starter,has_current_account_starter,bal_insurance_21,bal_insurance_23,cap_life_insurance_fixed_cap,cap_life_insurance_decreasing_cap,prem_fire_car_other_insurance,bal_personal_loan,bal_mortgage_loan,bal_current_account,bal_pension_saving,bal_savings_account,bal_savings_account_starter,bal_current_account_starter,visits_distinct_so,visits_distinct_so_areas,customer_self_employed,customer_children,customer_relationship,CODPOSS,COMMUNE,LAT,LONG,fit_spatial,Spatialclusters
12735,12736,0,402dc50b8db3399ac064c7e3e271123f,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,6600,0,0,1,1,1999-01,1999-01,2,1969-12,9031,9.0,0,young,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,0.0,0,0,6600,0,0,1,1,0,young,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,6600,0,0,1,1,0,adolescent,couple,9031,,,,-0.203585,"[-0.23,-0.074)"
12736,12737,0,0b8025fef0fd26f4d7187331b5c5b119,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,29910,0,0,1,1,1997-03,1997-03,2,1952-10,2100,9.0,0,,single,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,0.0,0,0,29710,0,0,1,1,0,,single,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,29710,0,0,1,1,0,,single,2100,,,,-0.016163,"[-0.074,0.08)"
12737,12738,0,d54fe1183d5a8b9deb2a0d6b8c8e165d,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,12940,0,0,1,1,2007-05,2007-05,1,1973-09,9070,9.0,1,young,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,0,0,0.0,0,0,12940,0,0,1,1,1,young,couple,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,12940,0,0,1,1,1,young,couple,9070,DESTELBERGEN,51.05,3.8,-0.180769,"[-0.23,-0.074)"
12738,12739,0,41c4dd415ff1d090154f70096b4a11af,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,30980,0,0,1,1,1981-01,1981-01,1,1947-03,8340,9.0,0,no,couple,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0.0,0,0,0.0,0,0,30980,0,0,1,1,0,no,couple,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0.0,0,0,0.0,0,0,30980,0,0,1,1,0,no,couple,8340,DAMME,51.25,3.283333,-0.134623,"[-0.23,-0.074)"
12739,12740,0,87ea57e04dabe9262fc20a62c68ad87d,1,1,1,0,0,1,1,0,1,1,0,1,0,0,860,0,0,42810.0,150,0,38440.0,2450,0,10420,0,0,1,1,2008-08,2008-08,2,1976-12,1060,7.0,0,,,1,1,1,0,0,1,1,0,1,1,0,1,0,860,0,0,42810.0,150,0,37890.0,4280,0,6420,0,0,1,1,0,,,1,1,1,0,0,1,1,0,1,1,0,1,0,0,860,0,0,42810.0,150,0,37340.0,3870,0,6420,0,0,1,1,0,,,1060,SINT-GILLIS,50.816667,4.333333,0.229463,"[0.08,0.24)"


In [None]:
train['customer_since_all_m1'].head() #mode

In [None]:
train['customer_since_bank_m1'].head() #mode

In [None]:
train['customer_occupation_code'].head() #mode

In [None]:
train['customer_children'].head() #mode

In [None]:
train['customer_relationship'].head() #mode

In [None]:
# see which columns need to be imputed
train.columns[X_train.isnull().any()]

In [None]:
val.columns[X_val.isnull().any()]

In [13]:
df_test_merged.columns[df_test_merged.isnull().any()]

Index(['customer_since_all_m1', 'customer_since_bank_m1',
       'customer_occupation_code', 'customer_children',
       'customer_relationship'],
      dtype='object')

In [None]:
from datetime import date
def age(birthdate):
    # Get today's date object
#     today = date.today()
    today = pd.to_datetime('2018-01-01')
    
    # A bool that represents if today's day/month precedes the birth day/month
    one_or_zero = ((today.month, today.day) < (birthdate.month, birthdate.day))
    
    # Calculate the difference in years from the date object's components
    year_difference = today.year - birthdate.year
    age = year_difference - one_or_zero
    
    return age
     
# Example age check:
print(age(date(1996, 7, 28)))

In [None]:
dp = [['2000-01-01','1990-07-15'],['1996-07-28','1998-03-15']]
example = pd.DataFrame(dp)
example

In [None]:
# Change dates to inbetween years
def dates_to_years(df, col_list, base_date = '2018-01-01'):
    #Convert date columns into datetime format
    df['base_dt'] = pd.to_datetime(base_date)
    df[col_list] = df[col_list].apply(pd.to_datetime)

    for col in col_list:
        df[col] = df[col].apply(lambda x : age(x))
        
    #Drop columns (base_dt)
    df = df.drop('base_dt', axis=1)

    return df
# Example age check:
df_age_test = dates_to_years(example, [0,1], base_date = '2018-01-01')
print(df_age_test)

In [None]:
# Bin Age
def bin_age(df, colname, upperLim):
    
    # Get all data below upper age limit : 100 years old in our case
    df_mask = df[colname]  < upperLim
    filtered_df = df[df_mask]
    
    # since lower bound is not included even when i set include_lowest=True, I will lower the lowerbound
    binInterval = pd.IntervalIndex.from_tuples([(17,20), (20,26), (26,52), (52,73), (73,99)])
    filtered_df['age_cat'] = pd.cut(filtered_df[colname], bins = binInterval, include_lowest=True)
    
    return filtered_df# Define Preprocess Steps


In [None]:
dp = [[21],[27],[28],[53],[54],[74],[75],[99],[100]]
age_df = pd.DataFrame(dp)
age_df

In [None]:
bin_age(age_df, 0, 100)