In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

def apk(actual, predicted, k=7, default=0.0):
    # Since it is MAP@7, use up to 7
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Conditions for giving points are as follows:
        # The predicted value is in the correct answer ('p in actual')
        # If the predicted value is not duplicate ('p not in predicted[:i]')
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    # If the correct answer value is blank, 0.0 is unconditionally returned.
    if not actual:
        return default

    # Calculate the average precision by the number of correct answers (len(actual))
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    # Calculate the Average Precision for each customer from the list of list, 
    #actual and predicted values, and calculate the average through np.mean()
    return np.mean([apk(a, p, k, default) for a, p in zip(actual, predicted)]) 

In [2]:
np.random.seed(2018)
# Load data.
trn = pd.read_csv('../Train_Cleaned.csv')

tst = pd.read_csv('../Test_Cleaned.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
trn.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,...,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit
0,2015-01-28,1375586,N,ES,H,35,2015-01-12,0,6,1,...,0,0,0,0,0,0,0,0,0,0
1,2015-01-28,1050611,N,ES,V,23,2012-08-10,0,35,1,...,0,0,0,0,0,0,0,0,0,0
2,2015-01-28,1050612,N,ES,V,23,2012-08-10,0,35,1,...,0,0,0,0,0,0,0,0,0,0
3,2015-01-28,1050613,N,ES,H,22,2012-08-10,0,35,1,...,0,0,0,0,0,0,0,0,0,0
4,2015-01-28,1050614,N,ES,V,23,2012-08-10,0,35,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
trn.shape

(13647309, 46)

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
## Data preprocessing ##

# Store product variables separately.
prods = trn.columns[22:].tolist()

In [7]:
prods

['Saving_account',
 'Guarantees',
 'Cur_account',
 'Derivative_account',
 'Payroll_account',
 'Junior_account',
 'Particular_acct1',
 'Particular_acct2',
 'Particular_acct3',
 'Short_term_deposites',
 'Med_term_deposites',
 'Long_term_deposites',
 'e-account',
 'Funds',
 'Mortgage',
 'Pension',
 'Loans',
 'Taxes',
 'Credit_card',
 'Securities',
 'Home_account',
 'Payroll',
 'Pensions',
 'Direct_debit']

In [8]:
trn.isnull().sum()

Month_status_date         0
Customer_ID               0
Employee_Index            0
Customer_country          0
Sex                       0
Age                       0
Join_date                 0
New_customer              0
Relnshp_Mnths             0
Relnshp_flag              0
Last_date_Prim_Cust       0
Cust_type_beg_Mth         0
Cust_Reln_type_beg_mth    0
Residence_flag            0
Forigner_flag             0
Emp_spouse_flag           0
Channel_when_joined       0
Deceased_flag             0
Address_detail            0
Activity_flag             0
Gross_household_income    0
Segment                   0
Saving_account            0
Guarantees                0
Cur_account               0
Derivative_account        0
Payroll_account           0
Junior_account            0
Particular_acct1          0
Particular_acct2          0
Particular_acct3          0
Short_term_deposites      0
Med_term_deposites        0
Long_term_deposites       0
e-account                 0
Funds               

In [9]:
# Replace missing values of product variables with 0 in advance.
#trn[prods] = trn[prods].fillna(0.0).astype(np.int8)

In [10]:
trn['Join_date'].unique()

array(['2015-01-12', '2012-08-10', '2011-09-06', ..., '2016-05-25',
       '2016-05-01', '2016-05-15'], dtype=object)

In [11]:
# Remove customer data that doesn't have any of the 24 products.
no_product = trn[prods].sum(axis=1) == 0
trn = trn[~no_product]

In [12]:
trn.shape

(11091070, 46)

In [13]:
tst["Customer_ID"].nunique()

929615

In [14]:
tst.shape

(929615, 22)

In [15]:
tst.Month_status_date.unique()

array(['2016-06-28'], dtype=object)

In [16]:
# Integrate training data and test data. Product variables that are not in the test data are filled with zeros.
for col in trn.columns[22:]:
    tst[col] = 0
df = pd.concat([trn, tst], axis=0)

In [17]:
df.drop(['Last_date_Prim_Cust','Emp_spouse_flag'], axis = 1, inplace=True)

In [18]:
df[df['Join_date'] == 'UNKNOWN']

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit


In [19]:
len(df['Join_date'].unique())

6756

In [20]:
df.head(10)

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit
0,2015-01-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2015-01-28,1050611,N,ES,V,23,2012-08-10,0,35,1,1,I,S,S,KHE,N,CIUDAD REAL,0,35548.74,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2015-01-28,1050612,N,ES,V,23,2012-08-10,0,35,1,1,I,S,N,KHE,N,CIUDAD REAL,0,122179.11,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2015-01-28,1050613,N,ES,H,22,2012-08-10,0,35,1,1,I,S,N,KHD,N,ZARAGOZA,0,119775.54,03 - UNIVERSITARIO,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2015-01-28,1050614,N,ES,V,23,2012-08-10,0,35,1,1,A,S,N,KHE,N,ZARAGOZA,1,99950.28,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,2015-01-28,1050615,N,ES,H,23,2012-08-10,0,35,1,1,I,S,N,KHE,N,TOLEDO,0,22220.04,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,2015-01-28,1050616,N,ES,H,23,2012-08-10,0,35,1,1,I,S,N,KHE,N,LEON,0,295590.36,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,2015-01-28,1050617,N,ES,H,23,2012-08-10,0,35,1,1,A,S,N,KHE,N,ZARAGOZA,1,113316.66,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,2015-01-28,1050619,N,ES,H,24,2012-08-10,0,35,1,1,I,S,N,KHE,N,GIPUZKOA,0,80599.14,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,2015-01-28,1050620,N,ES,H,23,2012-08-10,0,35,1,1,I,S,N,KHE,N,CACERES,0,113194.98,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
df[df['Customer_ID'] == 1375586]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit
0,2015-01-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1047196,2015-02-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1681276,2015-03-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2299297,2015-04-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2714121,2015-05-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3343336,2015-06-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4581564,2015-07-28,1375586,N,ES,H,35,2015-01-12,0,6,1,1,A,S,N,KHL,N,MALAGA,1,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
5075380,2015-08-28,1375586,N,ES,H,36,2015-01-12,0,7,1,1,A,S,N,KHL,N,MALAGA,0,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6258167,2015-09-28,1375586,N,ES,H,36,2015-01-12,0,8,1,1,I,S,N,KHL,N,MALAGA,0,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6572487,2015-10-28,1375586,N,ES,H,36,2015-01-12,0,9,1,1,I,S,N,KHL,N,MALAGA,0,87218.1,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
df[df['Customer_ID'] == 658229]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit
507732,2015-01-28,658229,N,ES,V,33,2006-11-20,0,104,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
746776,2015-02-28,658229,N,ES,V,33,2006-11-20,0,104,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1390338,2015-03-28,658229,N,ES,V,33,2006-11-20,0,104,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2002953,2015-04-28,658229,N,ES,V,33,2006-11-20,0,104,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3009480,2015-05-28,658229,N,ES,V,33,2006-11-20,0,104,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3641162,2015-06-28,658229,N,ES,V,33,2006-11-20,0,104,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3785633,2015-07-28,658229,N,ES,V,33,2006-11-20,0,104,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4912328,2015-08-28,658229,N,ES,V,33,2006-11-20,0,105,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5601110,2015-09-28,658229,N,ES,V,33,2006-11-20,0,106,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
7163874,2015-10-28,658229,N,ES,V,33,2006-11-20,0,107,1,1,A,S,N,KFC,N,SEVILLA,1,115696.41,03 - UNIVERSITARIO,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1


In [23]:
# This is a list containing variables to be used for learning.
features = []

# Label-encode categorical variables through the .factorize() function.
categorical_cols = ['Employee_Index', 'Customer_country', 'Sex', 'Cust_Reln_type_beg_mth', 'Residence_flag',
                    'Forigner_flag', 'Channel_when_joined', 'Deceased_flag', 'Address_detail', 'Segment']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols

In [24]:
features

['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment']

In [25]:
df.shape

(12020685, 44)

In [26]:
df.loc[df.Relnshp_Mnths <0, "Relnshp_Mnths"] = 0

In [27]:
df['Relnshp_Mnths'].unique()

array([  6,  35,  34,   0,  33,  31,  21,  16,  27,   9,  22,  13,  29,
         8,  11,  10,  28,  24,   7,  25,  14,  12,  26,  23,   1,  18,
         4,   3,  17,  32,  20,  15,  30,  19, 157,  36,  40,  38,  37,
        39,   5,  47,  44,  42,  46,  45,  43,  41,  57,  48,  52,  49,
        50,  56,  58,  51,  55,  54,  53,  59,  62,  61,  60,  63,   2,
       139, 165, 118, 164,  94, 159, 143, 105, 151, 162, 137, 150, 128,
       122, 156, 119, 160,  79,  95, 132, 161,  98, 127,  72, 155, 108,
       163, 102, 148, 115, 146, 107,  81, 216, 135,  92, 121, 198, 134,
        93, 140, 110, 120, 147,  64,  77,  85,  99,  78, 100, 113, 154,
       166, 124, 141,  66, 117,  86, 193,  80, 144,  87, 126, 158, 101,
       116, 235,  88, 145, 103, 149, 109, 131,  97, 133,  68,  84, 232,
       125, 177, 112,  96,  69, 171, 142, 167, 104,  76,  82, 152,  70,
       138, 169,  65, 129, 190, 114, 111, 176, 153,  89, 136,  83, 123,
       187, 106, 231, 189, 217, 172, 199, 173, 174, 209, 180, 17

In [28]:
df['Cust_type_beg_Mth'].replace('P', 5, inplace=True)
df['Cust_type_beg_Mth'].replace('1', 1, inplace=True)
df['Cust_type_beg_Mth'].replace('2', 2, inplace=True)
df['Cust_type_beg_Mth'].replace('3', 3, inplace=True)
df['Cust_type_beg_Mth'].replace('4', 4, inplace=True)

In [29]:
df['Cust_type_beg_Mth'].unique()

array([1, 5, 3, 2, 4], dtype=int64)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12020685 entries, 0 to 929614
Data columns (total 44 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Month_status_date       object 
 1   Customer_ID             int64  
 2   Employee_Index          int64  
 3   Customer_country        int64  
 4   Sex                     int64  
 5   Age                     int64  
 6   Join_date               object 
 7   New_customer            int64  
 8   Relnshp_Mnths           int64  
 9   Relnshp_flag            int64  
 10  Cust_type_beg_Mth       int64  
 11  Cust_Reln_type_beg_mth  int64  
 12  Residence_flag          int64  
 13  Forigner_flag           int64  
 14  Channel_when_joined     int64  
 15  Deceased_flag           int64  
 16  Address_detail          int64  
 17  Activity_flag           int64  
 18  Gross_household_income  float64
 19  Segment                 int64  
 20  Saving_account          int64  
 21  Guarantees              int64  

In [31]:
# Substitute -99 for singular and missing values of numeric variables and convert them to integers.
#df['age'].replace(' NA', -99, inplace=True)
#df['age'] = df['age'].astype(np.int8)

#df['antiguedad'].replace('     NA', -99, inplace=True)
#df['antiguedad'] = df['antiguedad'].astype(np.int8)

#df['renta'].replace('         NA', -99, inplace=True)
#df['renta'].fillna(-99, inplace=True)
#df['renta'] = df['renta'].astype(float).astype(np.int8)

#df['indrel_1mes'].replace('P', 5, inplace=True)
#df['indrel_1mes'].fillna(-99, inplace=True)
#df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)

# Seek features for numeric variables to be used for learning.
features += ['Age','Relnshp_Mnths','Gross_household_income','New_customer','Relnshp_flag','Cust_type_beg_Mth','Activity_flag']


In [32]:
len(features)

17

In [33]:
features

['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag']

In the feature engineering stage, derived variables are created to be used for machine learning model training. In the baseline model, a total of 24 customer variables, 4 date-based derived variables, and 24 lag-1 variables are used.

The year and month information are extracted from fecha_alta, which means the date when the customer signed the first contract, and ult_fec_cli_1t, which means the date when the customer was last grade 1, respectively. In addition, there are various derived variables using date variables. For example, the difference between two date variables can be created as a derived variable.

Missing values ​​are temporarily replaced with -99. The machine learning model provided by Scikit-learn does not receive a missing value as an input value and an execution error occurs, but in the xgboost model, the missing value is also received as a normal input value. The missing data is recognized as one piece of information and used for model training, but I will set the missing value to -99.

In time series data, various derived variables can be created based on customer's past data. For example, whether a customer's age has changed in the last 3 months (i.e., has a birthday within 3 months) can be created as a binary variable, or information about a product purchased a month ago can be used as a variable, You can also calculate the average monthly salary for the last six months.

Starting from the conclusion, the lag variable indicating whether or not a financial product was held N months ago served as a good derivative. It is to use as current customer data whether it was held 1 month ago, 2 months ago, or 3 months ago for 24 financial product variables. In the baseline model, we will use the lag-1 to lag-5 variables that retrieve information from 1 month ago to 5 months ago.

In [34]:
# (Feature Engineering) Extract year and month information from two date variables.
df['Join_date_month'] = df['Join_date'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['Join_date_year'] = df['Join_date'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['Join_date_month', 'Join_date_year']

In [35]:
#df['Last_date_Prim_Cust_month'] = df['Last_date_Prim_Cust'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
#df['Last_date_Prim_Cust_year'] = df['Last_date_Prim_Cust'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
#features += ['Last_date_Prim_Cust_month', 'Last_date_Prim_Cust_year']

In [36]:
#All missing values of other variables are replaced with -99.
df.fillna(-99, inplace=True)

# (Feature Engineering) Generate lag-1 data.

# This is a function that converts a date to a number. 2015-01-28 is converted to 1, 2016-06-28 is converted to 18
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")] 
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date

# Convert the date to a number and store it in int_date
df['int_date'] = df['Month_status_date'].map(date_to_int).astype(np.int8)

In [37]:
df.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date
0,2015-01-28,1375586,0,0,0,35,2015-01-12,0,6,1,1,0,0,0,0,0,0,1,87218.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2015,1
1,2015-01-28,1050611,0,0,1,23,2012-08-10,0,35,1,1,1,0,1,1,0,1,0,35548.74,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1
2,2015-01-28,1050612,0,0,1,23,2012-08-10,0,35,1,1,1,0,0,1,0,1,0,122179.11,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1
3,2015-01-28,1050613,0,0,0,22,2012-08-10,0,35,1,1,1,0,0,2,0,2,0,119775.54,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1
4,2015-01-28,1050614,0,0,1,23,2012-08-10,0,35,1,1,0,0,0,1,0,2,1,99950.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1


In [38]:
#Convert the date to a number, copy the data # to int_date, and add 12 to the int_date date to create a lag_12. 
#Add _prev to variable name and save
df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['Customer_ID', 'int_date'] else col for col in df.columns ]
df_lag['int_date'] += 12

In [39]:
#df_years = df.copy()
#df_years.columns = [col + '_years' if col not in ['ncodpers', 'int_date'] else col for col in df.columns ]
#df_years['int_date'] += 12

In [40]:
# Combine the original data and lag data based on ncodper and int_date. 
#Since the int_date of the Lag data is 1, the product information of the last month is inserted.
#df_trn = df.merge(df_lag, on=['Customer_ID','int_date'], how='left')

In [41]:
df_trn = df.merge(df_lag, on=['Customer_ID','int_date'], how='left')

In [42]:
df_trn.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
0,2015-01-28,1375586,0,0,0,35,2015-01-12,0,6,1,1,0,0,0,0,0,0,1,87218.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2015,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2015-01-28,1050611,0,0,1,23,2012-08-10,0,35,1,1,1,0,1,1,0,1,0,35548.74,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2015-01-28,1050612,0,0,1,23,2012-08-10,0,35,1,1,1,0,0,1,0,1,0,122179.11,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2015-01-28,1050613,0,0,0,22,2012-08-10,0,35,1,1,1,0,0,2,0,2,0,119775.54,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2015-01-28,1050614,0,0,1,23,2012-08-10,0,35,1,1,0,0,0,1,0,2,1,99950.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [43]:
# Remove unnecessary variables from memory for memory efficiency
del df, df_lag

In [44]:
# Replace with 0 in case the product information for the last month does not exist.
#for prod in prods:
#    prev = prod + '_prev'
#    df_trn[prev].fillna(0, inplace=True)

In [45]:
# Replace with 0 in case the product information for the last month does not exist.
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)

In [46]:
df_trn.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
0,2015-01-28,1375586,0,0,0,35,2015-01-12,0,6,1,1,0,0,0,0,0,0,1,87218.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2015,1,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,2015-01-28,1050611,0,0,1,23,2012-08-10,0,35,1,1,1,0,1,1,0,1,0,35548.74,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,2015-01-28,1050612,0,0,1,23,2012-08-10,0,35,1,1,1,0,0,1,0,1,0,122179.11,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,2015-01-28,1050613,0,0,0,22,2012-08-10,0,35,1,1,1,0,0,2,0,2,0,119775.54,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,2015-01-28,1050614,0,0,1,23,2012-08-10,0,35,1,1,0,0,0,1,0,2,1,99950.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [47]:
df_trn.fillna(-99, inplace=True)

In [48]:
df_trn.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
0,2015-01-28,1375586,0,0,0,35,2015-01-12,0,6,1,1,0,0,0,0,0,0,1,87218.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2015,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
1,2015-01-28,1050611,0,0,1,23,2012-08-10,0,35,1,1,1,0,1,1,0,1,0,35548.74,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
2,2015-01-28,1050612,0,0,1,23,2012-08-10,0,35,1,1,1,0,0,1,0,1,0,122179.11,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
3,2015-01-28,1050613,0,0,0,22,2012-08-10,0,35,1,1,1,0,0,2,0,2,0,119775.54,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
4,2015-01-28,1050614,0,0,1,23,2012-08-10,0,35,1,1,0,0,0,1,0,2,1,99950.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0


In [49]:
na_count={}

In [50]:
for col in df_trn.columns:
    na_count[col]=df_trn[col].isnull().sum()

In [51]:
def getKeysByValue(dictOfElements, valueToFind):
    listOfKeys = list()
    listOfItems = dictOfElements.items()
    for item  in listOfItems:
        if item[1] > valueToFind:
            listOfKeys.append(item[0])
    return  listOfKeys

In [52]:
listOfKeys = getKeysByValue(na_count, 0)

In [53]:
len(listOfKeys)

0

In [54]:
for i in listOfKeys:
    df_trn[i]=df_trn[i].fillna(-99)

In [55]:
na_count1={}

In [56]:
for col in df_trn.columns:
    na_count1[col]=df_trn[col].isnull().sum()

In [57]:

# Add the lag-1 variable.
features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]
#features += [prod + '_year' for prod in prods]

### After Baseline model, various feature engineering added

In [58]:
features

['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag',
 'Join_date_month',
 'Join_date_year',
 'Employee_Index_prev',
 'Customer_country_prev',
 'Sex_prev',
 'Cust_Reln_type_beg_mth_prev',
 'Residence_flag_prev',
 'Forigner_flag_prev',
 'Channel_when_joined_prev',
 'Deceased_flag_prev',
 'Address_detail_prev',
 'Segment_prev',
 'Age_prev',
 'Relnshp_Mnths_prev',
 'Gross_household_income_prev',
 'New_customer_prev',
 'Relnshp_flag_prev',
 'Cust_type_beg_Mth_prev',
 'Activity_flag_prev',
 'Join_date_month_prev',
 'Join_date_year_prev',
 'Saving_account_prev',
 'Guarantees_prev',
 'Cur_account_prev',
 'Derivative_account_prev',
 'Payroll_account_prev',
 'Junior_account_prev',
 'Particular_acct1_prev',
 'Particular_acct2_prev',
 'Particu

In [59]:
pd.set_option('display.max_columns', None)
df_trn.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
0,2015-01-28,1375586,0,0,0,35,2015-01-12,0,6,1,1,0,0,0,0,0,0,1,87218.1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2015,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
1,2015-01-28,1050611,0,0,1,23,2012-08-10,0,35,1,1,1,0,1,1,0,1,0,35548.74,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
2,2015-01-28,1050612,0,0,1,23,2012-08-10,0,35,1,1,1,0,0,1,0,1,0,122179.11,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
3,2015-01-28,1050613,0,0,0,22,2012-08-10,0,35,1,1,1,0,0,2,0,2,0,119775.54,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
4,2015-01-28,1050614,0,0,1,23,2012-08-10,0,35,1,1,0,0,0,1,0,2,1,99950.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2012,1,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0


-Cross validation

It is no exaggeration to say that cross-validation is the most important thing to achieve good results. It is very important to experiment with various ideas without limitation through the correct cross-validation process, and to check whether performance is improved.

A total of 1 year and 6 months worth of data (2015-01-28 ~ 2016-05-28) is provided as training data, and the test data to be predicted is future data (2016-06-28). In this case, it is common to separate the latest data (2016-05-28) into verification data and use the remaining data as training data even in the internal cross-validation process. In the baseline model, in order to simplify the model, data for a total of 4 months from 2016-01-28 to 2016-04-28 is used as training data, and 2016-05-28 data is used as verification data.

Separating data for cross validation
The part that is extracted to use only 2016 without using the entire training data may correspond to feature engineering.

In [60]:
df_trn['Month_status_date'].unique()

array(['2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28',
       '2015-05-28', '2015-06-28', '2015-07-28', '2015-08-28',
       '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28',
       '2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28',
       '2016-05-28', '2016-06-28'], dtype=object)

In [61]:
tst_final = df_trn.loc[df_trn['Month_status_date'] == '2016-06-28']

In [62]:
tst_final['Customer_ID'].nunique()

929615

In [63]:
## model training
# Separate data for training and testing for learning.
# Only 2016-01-28 ~ 2016-04-28 data is used for learning, and 2016-05-28 data is used for verification.
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn_final = df_trn[df_trn['Month_status_date'].isin(use_dates)]

In [64]:
trn_final.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
7628152,2016-01-28,1432296,0,0,1,20,2015-08-07,1,5,1,1,1,0,1,157,0,19,0,34745.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628153,2016-01-28,1432294,0,0,0,25,2015-08-07,1,5,1,1,1,0,0,157,0,33,1,184449.27,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628154,2016-01-28,1432292,0,0,1,23,2015-08-07,1,5,1,1,1,0,1,157,0,19,0,42478.02,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628155,2016-01-28,1432297,0,0,0,20,2015-08-07,1,5,1,1,1,0,0,157,0,3,0,89482.35,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628156,2016-01-28,1432280,0,0,1,20,2015-08-07,1,5,1,1,1,0,0,157,0,3,0,33026.76,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0


In [65]:
trn_final[trn_final['Customer_ID'] == 1432296]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
7628152,2016-01-28,1432296,0,0,1,20,2015-08-07,1,5,1,1,1,0,1,157,0,19,0,34745.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
8507330,2016-02-28,1432296,0,0,1,20,2015-08-07,0,6,1,1,1,0,1,157,0,19,0,34745.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,14,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
9014611,2016-03-28,1432296,0,0,1,20,2015-08-07,0,7,1,1,1,0,1,157,0,19,0,34745.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,15,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
10287364,2016-04-28,1432296,0,0,1,20,2015-08-07,0,8,1,1,1,0,1,157,0,19,0,34745.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,16,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
10864621,2016-05-28,1432296,0,0,1,20,2015-08-07,0,9,1,1,1,0,1,157,0,19,0,34745.28,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,17,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0


In [66]:
del df_trn

In [67]:
prods

['Saving_account',
 'Guarantees',
 'Cur_account',
 'Derivative_account',
 'Payroll_account',
 'Junior_account',
 'Particular_acct1',
 'Particular_acct2',
 'Particular_acct3',
 'Short_term_deposites',
 'Med_term_deposites',
 'Long_term_deposites',
 'e-account',
 'Funds',
 'Mortgage',
 'Pension',
 'Loans',
 'Taxes',
 'Credit_card',
 'Securities',
 'Home_account',
 'Payroll',
 'Pensions',
 'Direct_debit']

In [68]:
# Extract only the number of new purchases from the training data.
X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn_final[(trn_final[prod] == 1) & (trn_final[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)

In [69]:
prX

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
7628189,2016-01-28,1432276,0,0,0,20,2015-08-07,1,5,1,1,0,0,0,157,0,37,1,67813.86,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628190,2016-01-28,1432241,0,0,0,51,2015-08-07,1,5,1,1,0,0,1,37,0,18,1,45612.57,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628196,2016-01-28,1432234,0,0,0,23,2015-08-07,1,5,1,1,0,0,0,37,0,9,1,160099.56,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628202,2016-01-28,1432228,0,0,1,76,2015-08-07,1,5,1,1,0,0,1,12,0,9,1,87929.88,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7628203,2016-01-28,1432226,0,0,0,46,2015-08-07,1,5,1,1,0,0,0,37,0,0,1,95102.61,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11090837,2016-05-28,1167153,0,0,1,22,2013-08-19,0,33,1,1,0,0,0,1,0,9,1,118003.41,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2013,17,2015-05-28,0.0,0.0,1.0,22.0,2013-08-19,0.0,23.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,9.0,1.0,118003.41,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11090888,2016-05-28,1167058,0,0,1,70,2013-08-16,0,33,1,1,0,0,0,4,0,9,1,131214.48,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,8,2013,17,2015-05-28,0.0,0.0,1.0,69.0,2013-08-16,0.0,23.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,9.0,1.0,131214.48,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,2013.0
11090895,2016-05-28,1167040,0,0,1,41,2013-08-16,0,33,1,1,0,0,0,4,0,46,1,82447.02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2013,17,2015-05-28,0.0,0.0,1.0,41.0,2013-08-16,0.0,23.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,46.0,1.0,82447.02,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11090914,2016-05-28,1167089,0,0,1,29,2013-08-16,0,33,1,1,0,0,0,4,0,19,1,77847.87,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,8,2013,17,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0


In [70]:
prX[prX['Customer_ID'] == 1166355]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev


In [71]:
X

[        Month_status_date  Customer_ID  Employee_Index  Customer_country  Sex  \
 7684553        2016-01-28        53273               0                 0    1   
 
          Age   Join_date  New_customer  Relnshp_Mnths  Relnshp_flag  \
 7684553   43  2000-04-06             0            189             1   
 
          Cust_type_beg_Mth  Cust_Reln_type_beg_mth  Residence_flag  \
 7684553                  1                       0               0   
 
          Forigner_flag  Channel_when_joined  Deceased_flag  Address_detail  \
 7684553              0                    5              0               9   
 
          Activity_flag  Gross_household_income  Segment  Saving_account  \
 7684553              1                121920.3        0               1   
 
          Guarantees  Cur_account  Derivative_account  Payroll_account  \
 7684553           0            1                   0                0   
 
          Junior_account  Particular_acct1  Particular_acct2  Particular_acct3  

In [72]:
len(X)

24

In [73]:
Y

[array([0], dtype=int8),
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int8),
 array([2, 2, 2, ..., 2, 2, 2], dtype=int8),
 array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3], dtype=int8),
 array([4, 4, 4, ..., 4, 4, 4], dtype=int8),
 array([5, 5, 5, ..., 5, 5, 5], dtype=int8),
 arra

In [74]:
XY = pd.concat(X)

In [75]:
XY

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
7684553,2016-01-28,53273,0,0,1,43,2000-04-06,0,189,1,1,0,0,0,5,0,9,1,121920.30,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,4,2000,13,2015-01-28,0.0,0.0,1.0,43.0,2000-04-06,0.0,183.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,9.0,1.0,121920.30,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,2000.0
7658069,2016-01-28,1474324,0,0,1,43,2015-10-09,1,3,1,1,0,0,0,3,0,18,1,139070.97,3,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,10,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0
7715292,2016-01-28,118972,0,0,1,45,2000-05-23,0,188,1,1,0,0,0,5,0,18,1,172178.94,3,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,1,1,1,5,2000,13,2015-01-28,0.0,0.0,1.0,44.0,2000-05-23,0.0,182.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,18.0,1.0,172178.94,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,5.0,2000.0
7785416,2016-01-28,179350,0,0,0,43,2000-06-21,0,187,1,1,0,0,0,5,0,18,1,190182.24,3,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,6,2000,13,2015-01-28,0.0,0.0,0.0,43.0,2000-06-21,0.0,181.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,18.0,1.0,190182.24,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,2000.0
8014299,2016-01-28,906299,0,0,1,36,2011-03-09,0,58,1,1,0,0,0,4,0,18,1,84904.41,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,3,2011,13,2015-01-28,0.0,0.0,1.0,36.0,2011-03-09,0.0,52.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,18.0,1.0,84904.41,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2011.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11090837,2016-05-28,1167153,0,0,1,22,2013-08-19,0,33,1,1,0,0,0,1,0,9,1,118003.41,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2013,17,2015-05-28,0.0,0.0,1.0,22.0,2013-08-19,0.0,23.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,9.0,1.0,118003.41,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11090888,2016-05-28,1167058,0,0,1,70,2013-08-16,0,33,1,1,0,0,0,4,0,9,1,131214.48,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,8,2013,17,2015-05-28,0.0,0.0,1.0,69.0,2013-08-16,0.0,23.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,9.0,1.0,131214.48,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,2013.0
11090895,2016-05-28,1167040,0,0,1,41,2013-08-16,0,33,1,1,0,0,0,4,0,46,1,82447.02,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2013,17,2015-05-28,0.0,0.0,1.0,41.0,2013-08-16,0.0,23.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,46.0,1.0,82447.02,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11090914,2016-05-28,1167089,0,0,1,29,2013-08-16,0,33,1,1,0,0,0,4,0,19,1,77847.87,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,8,2013,17,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.00,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0


In [76]:
XY.shape

(1141331, 92)

In [77]:
XY[XY['Customer_ID'] == 1166385]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
8835936,2016-02-28,1166385,0,0,1,65,2013-08-14,0,30,1,1,0,0,0,4,0,38,1,79016.37,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2013,14,2015-02-28,0.0,0.0,1.0,64.0,2013-08-14,0.0,23.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,38.0,1.0,79016.37,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11090455,2016-05-28,1166385,0,0,1,65,2013-08-14,0,33,1,1,0,0,0,4,0,38,1,79016.37,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2013,17,2015-05-28,0.0,0.0,1.0,64.0,2013-08-14,0.0,23.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,38.0,1.0,79016.37,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0


In [78]:
Y = np.hstack(Y)

In [79]:
Y

array([ 0,  1,  1, ..., 23, 23, 23], dtype=int8)

In [80]:

XY['y'] = Y

# Separate training and verification data. 
vld_date = '2016-05-28'
XY_trn = XY[XY['Month_status_date'] != vld_date]
XY_vld = XY[XY['Month_status_date'] == vld_date]

In [81]:
XY_trn.head(10)

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev,y
7684553,2016-01-28,53273,0,0,1,43,2000-04-06,0,189,1,1,0,0,0,5,0,9,1,121920.3,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,4,2000,13,2015-01-28,0.0,0.0,1.0,43.0,2000-04-06,0.0,183.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,9.0,1.0,121920.3,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,2000.0,0
7658069,2016-01-28,1474324,0,0,1,43,2015-10-09,1,3,1,1,0,0,0,3,0,18,1,139070.97,3,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,1,10,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,1
7715292,2016-01-28,118972,0,0,1,45,2000-05-23,0,188,1,1,0,0,0,5,0,18,1,172178.94,3,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,1,1,1,5,2000,13,2015-01-28,0.0,0.0,1.0,44.0,2000-05-23,0.0,182.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,18.0,1.0,172178.94,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,5.0,2000.0,1
7785416,2016-01-28,179350,0,0,0,43,2000-06-21,0,187,1,1,0,0,0,5,0,18,1,190182.24,3,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,1,0,6,2000,13,2015-01-28,0.0,0.0,0.0,43.0,2000-06-21,0.0,181.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,18.0,1.0,190182.24,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,2000.0,1
8014299,2016-01-28,906299,0,0,1,36,2011-03-09,0,58,1,1,0,0,0,4,0,18,1,84904.41,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,3,2011,13,2015-01-28,0.0,0.0,1.0,36.0,2011-03-09,0.0,52.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,18.0,1.0,84904.41,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2011.0,1
8430395,2016-02-28,1474324,0,0,1,43,2015-10-09,1,4,1,1,0,0,0,3,0,18,1,139070.97,3,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,10,2015,14,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,1
8633500,2016-02-28,118972,0,0,1,45,2000-05-23,0,189,1,1,0,0,0,5,0,18,1,172178.94,3,0,1,0,0,1,0,0,1,0,0,0,0,1,1,0,0,0,1,1,1,0,1,1,1,5,2000,14,2015-02-28,0.0,0.0,1.0,44.0,2000-05-23,0.0,182.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,18.0,1.0,172178.94,3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,5.0,2000.0,1
8978820,2016-02-28,906299,0,0,1,36,2011-03-09,0,59,1,1,0,0,0,4,0,18,1,84904.41,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,3,2011,14,2015-02-28,0.0,0.0,1.0,36.0,2011-03-09,0.0,52.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,18.0,1.0,84904.41,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2011.0,1
9074987,2016-03-28,1474324,0,0,1,44,2015-10-09,1,5,1,1,0,0,0,3,0,18,1,139070.97,3,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,10,2015,15,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,1
9423720,2016-03-28,906299,0,0,1,36,2011-03-09,0,60,1,1,0,0,0,4,0,18,1,84904.41,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,1,3,2011,15,2015-03-28,0.0,0.0,1.0,36.0,2011-03-09,0.0,52.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,18.0,1.0,84904.41,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,3.0,2011.0,1


In [82]:
XY_trn[XY_trn['Customer_ID'] == 1432311]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev,y
7628180,2016-01-28,1432311,0,0,1,26,2015-08-07,1,5,1,1,1,0,0,157,0,36,1,62330.97,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,2
8507321,2016-02-28,1432311,0,0,1,26,2015-08-07,0,6,1,1,1,0,0,157,0,36,0,62330.97,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,14,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,2
9014642,2016-03-28,1432311,0,0,1,26,2015-08-07,0,7,1,1,0,0,0,157,0,36,0,62330.97,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,15,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,2
10287374,2016-04-28,1432311,0,0,1,26,2015-08-07,0,8,1,1,0,0,0,157,0,36,0,62330.97,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,16,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,2


In [83]:
XY_trn[XY_trn['Customer_ID'] == 1432622]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev,y
7628745,2016-01-28,1432622,0,0,1,21,2015-08-10,1,5,1,1,0,0,0,157,0,21,1,537793.23,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,2
7628745,2016-01-28,1432622,0,0,1,21,2015-08-10,1,5,1,1,0,0,0,157,0,21,1,537793.23,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2015,13,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,23
8507961,2016-02-28,1432622,0,0,1,21,2015-08-10,0,6,1,1,1,0,0,157,0,21,1,537793.23,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,8,2015,14,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,23


In [84]:
XY_vld[XY_vld['Customer_ID'] == 1432311]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev,y
10864612,2016-05-28,1432311,0,0,1,26,2015-08-07,0,9,1,1,0,0,0,157,0,36,1,62330.97,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2015,17,-99,-99.0,-99.0,-99.0,-99.0,-99,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-99.0,-99.0,2



-max_depth
    -It means the maximum depth of the tree model. The higher the value, the more complex the tree model is created, and it may cause overfitting.
    
    
-eta
    -It is the same concept as the learning rate in deep learning. It has a value between 0 and 1, and if the value is too high, learning may not work well.
      Conversely, if the value is too low, learning can be slow.
    
    
-colsample_bytree
    -When constructing the tree, it is the rate at which the variable is sampled from the training data. All trees learn only a part of the entire variable to compensate for each other's weaknesses.
      will be. Usually a value of 0.6~0.9 is used.
    
    
-colsample_bylevel
    -This is the ratio of sampling the variables of the training data for each level of the tree. Usually a value of 0.6~0.9 is used.

Given the efficiency of investment versus time, it is better to spend more time engineering features rather than tuning parameters. Rather than a single perfect model obtained through a variable level of feature engineering and an enormous level of parameter tuning, a model with a moderate level of parameter tuning and a model with high quality variables obtained by investing a lot of time in feature engineering This universally looks better performance.

In [85]:
features

['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag',
 'Join_date_month',
 'Join_date_year',
 'Employee_Index_prev',
 'Customer_country_prev',
 'Sex_prev',
 'Cust_Reln_type_beg_mth_prev',
 'Residence_flag_prev',
 'Forigner_flag_prev',
 'Channel_when_joined_prev',
 'Deceased_flag_prev',
 'Address_detail_prev',
 'Segment_prev',
 'Age_prev',
 'Relnshp_Mnths_prev',
 'Gross_household_income_prev',
 'New_customer_prev',
 'Relnshp_flag_prev',
 'Cust_type_beg_Mth_prev',
 'Activity_flag_prev',
 'Join_date_month_prev',
 'Join_date_year_prev',
 'Saving_account_prev',
 'Guarantees_prev',
 'Cur_account_prev',
 'Derivative_account_prev',
 'Payroll_account_prev',
 'Junior_account_prev',
 'Particular_acct1_prev',
 'Particular_acct2_prev',
 'Particu

In [86]:
#pip install < xgboost-1.3.0_SNAPSHOT+0e2d5669f660c712a91ca91fe2c40dfba901dee5-py3-none
#-macosx_10_13_x86_64.macosx_10_14_x86_64.macosx_10_15_x86_64.whl>

In [87]:
#Set the XGBoost model parameter.
param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
    'gpu_id':0,
    'tree_method':'gpu_hist',
    }


# Convert training and verification data into XGBoost format.
X_trn = XY_trn[['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag',
 'Join_date_month',
 'Join_date_year',
 'Employee_Index_prev',
 'Customer_country_prev',
 'Sex_prev',
 'Cust_Reln_type_beg_mth_prev',
 'Residence_flag_prev',
 'Forigner_flag_prev',
 'Channel_when_joined_prev',
 'Deceased_flag_prev',
 'Address_detail_prev',
 'Segment_prev',
 'Age_prev',
 'Relnshp_Mnths_prev',
 'Gross_household_income_prev',
 'New_customer_prev',
 'Relnshp_flag_prev',
 'Cust_type_beg_Mth_prev',
 'Activity_flag_prev',
 'Join_date_month_prev',
 'Join_date_year_prev',
 'Saving_account_prev',
 'Guarantees_prev',
 'Cur_account_prev',
 'Derivative_account_prev',
 'Payroll_account_prev',
 'Junior_account_prev',
 'Particular_acct1_prev',
 'Particular_acct2_prev',
 'Particular_acct3_prev',
 'Short_term_deposites_prev',
 'Med_term_deposites_prev',
 'Long_term_deposites_prev',
 'e-account_prev',
 'Funds_prev',
 'Mortgage_prev',
 'Pension_prev',
 'Loans_prev',
 'Taxes_prev',
 'Credit_card_prev',
 'Securities_prev',
 'Home_account_prev',
 'Payroll_prev',
 'Pensions_prev',
 'Direct_debit_prev']].values
Y_trn = XY_trn['y'].values
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)

X_vld = XY_vld[['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag',
 'Join_date_month',
 'Join_date_year',
 'Employee_Index_prev',
 'Customer_country_prev',
 'Sex_prev',
 'Cust_Reln_type_beg_mth_prev',
 'Residence_flag_prev',
 'Forigner_flag_prev',
 'Channel_when_joined_prev',
 'Deceased_flag_prev',
 'Address_detail_prev',
 'Segment_prev',
 'Age_prev',
 'Relnshp_Mnths_prev',
 'Gross_household_income_prev',
 'New_customer_prev',
 'Relnshp_flag_prev',
 'Cust_type_beg_Mth_prev',
 'Activity_flag_prev',
 'Join_date_month_prev',
 'Join_date_year_prev',
 'Saving_account_prev',
 'Guarantees_prev',
 'Cur_account_prev',
 'Derivative_account_prev',
 'Payroll_account_prev',
 'Junior_account_prev',
 'Particular_acct1_prev',
 'Particular_acct2_prev',
 'Particular_acct3_prev',
 'Short_term_deposites_prev',
 'Med_term_deposites_prev',
 'Long_term_deposites_prev',
 'e-account_prev',
 'Funds_prev',
 'Mortgage_prev',
 'Pension_prev',
 'Loans_prev',
 'Taxes_prev',
 'Credit_card_prev',
 'Securities_prev',
 'Home_account_prev',
 'Payroll_prev',
 'Pensions_prev',
 'Direct_debit_prev']].values
Y_vld = XY_vld['y'].values
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)

# Train XGBoost model with training data!
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)

# Save the trained model.
import pickle
model.save_model('New_improved_Santander.bin')
best_ntree_limit = model.best_ntree_limit

[0]	train-mlogloss:2.70498	eval-mlogloss:2.71891
Multiple eval metrics have been passed: 'eval-mlogloss' will be used for early stopping.

Will train until eval-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:2.49065	eval-mlogloss:2.50902
[2]	train-mlogloss:2.32903	eval-mlogloss:2.35137
[3]	train-mlogloss:2.2049	eval-mlogloss:2.22893
[4]	train-mlogloss:2.10007	eval-mlogloss:2.12598
[5]	train-mlogloss:2.01433	eval-mlogloss:2.04123
[6]	train-mlogloss:1.94124	eval-mlogloss:1.96875
[7]	train-mlogloss:1.87813	eval-mlogloss:1.90662
[8]	train-mlogloss:1.81979	eval-mlogloss:1.84888
[9]	train-mlogloss:1.76889	eval-mlogloss:1.79863
[10]	train-mlogloss:1.72302	eval-mlogloss:1.75311
[11]	train-mlogloss:1.68153	eval-mlogloss:1.71224
[12]	train-mlogloss:1.64442	eval-mlogloss:1.67546
[13]	train-mlogloss:1.61111	eval-mlogloss:1.64253
[14]	train-mlogloss:1.58073	eval-mlogloss:1.61236
[15]	train-mlogloss:1.553	eval-mlogloss:1.58487
[16]	train-mlogloss:1.52819	eval-mlogloss:1.56041
[17]	train-m

[161]	train-mlogloss:1.16281	eval-mlogloss:1.20745
[162]	train-mlogloss:1.16238	eval-mlogloss:1.20715
[163]	train-mlogloss:1.16205	eval-mlogloss:1.20692
[164]	train-mlogloss:1.16167	eval-mlogloss:1.20661
[165]	train-mlogloss:1.16117	eval-mlogloss:1.20624
[166]	train-mlogloss:1.1608	eval-mlogloss:1.20598
[167]	train-mlogloss:1.16036	eval-mlogloss:1.20566
[168]	train-mlogloss:1.15977	eval-mlogloss:1.20521
[169]	train-mlogloss:1.15939	eval-mlogloss:1.20493
[170]	train-mlogloss:1.15889	eval-mlogloss:1.20458
[171]	train-mlogloss:1.15838	eval-mlogloss:1.20419
[172]	train-mlogloss:1.158	eval-mlogloss:1.20392
[173]	train-mlogloss:1.15755	eval-mlogloss:1.20358
[174]	train-mlogloss:1.15714	eval-mlogloss:1.20329
[175]	train-mlogloss:1.15678	eval-mlogloss:1.20303
[176]	train-mlogloss:1.15634	eval-mlogloss:1.2027
[177]	train-mlogloss:1.15592	eval-mlogloss:1.20239
[178]	train-mlogloss:1.15556	eval-mlogloss:1.20212
[179]	train-mlogloss:1.15508	eval-mlogloss:1.20178
[180]	train-mlogloss:1.15461	eval-m

[323]	train-mlogloss:1.099	eval-mlogloss:1.16126
[324]	train-mlogloss:1.09862	eval-mlogloss:1.161
[325]	train-mlogloss:1.09826	eval-mlogloss:1.16074
[326]	train-mlogloss:1.09787	eval-mlogloss:1.16046
[327]	train-mlogloss:1.09758	eval-mlogloss:1.16024
[328]	train-mlogloss:1.09723	eval-mlogloss:1.15998
[329]	train-mlogloss:1.09691	eval-mlogloss:1.15976
[330]	train-mlogloss:1.09655	eval-mlogloss:1.15949
[331]	train-mlogloss:1.0961	eval-mlogloss:1.15914
[332]	train-mlogloss:1.09566	eval-mlogloss:1.15882
[333]	train-mlogloss:1.09533	eval-mlogloss:1.15858
[334]	train-mlogloss:1.09497	eval-mlogloss:1.15831
[335]	train-mlogloss:1.09463	eval-mlogloss:1.15808
[336]	train-mlogloss:1.09427	eval-mlogloss:1.15782
[337]	train-mlogloss:1.0939	eval-mlogloss:1.15755
[338]	train-mlogloss:1.09357	eval-mlogloss:1.15732
[339]	train-mlogloss:1.09328	eval-mlogloss:1.1571
[340]	train-mlogloss:1.09294	eval-mlogloss:1.15685
[341]	train-mlogloss:1.09252	eval-mlogloss:1.15657
[342]	train-mlogloss:1.09204	eval-mlog

[485]	train-mlogloss:1.04762	eval-mlogloss:1.12433
[486]	train-mlogloss:1.04731	eval-mlogloss:1.12411
[487]	train-mlogloss:1.04709	eval-mlogloss:1.12396
[488]	train-mlogloss:1.04676	eval-mlogloss:1.12371
[489]	train-mlogloss:1.04643	eval-mlogloss:1.12348
[490]	train-mlogloss:1.04613	eval-mlogloss:1.12325
[491]	train-mlogloss:1.04591	eval-mlogloss:1.12311
[492]	train-mlogloss:1.04562	eval-mlogloss:1.1229
[493]	train-mlogloss:1.04535	eval-mlogloss:1.1227
[494]	train-mlogloss:1.0451	eval-mlogloss:1.12253
[495]	train-mlogloss:1.04489	eval-mlogloss:1.12237
[496]	train-mlogloss:1.04463	eval-mlogloss:1.12218
[497]	train-mlogloss:1.04437	eval-mlogloss:1.12199
[498]	train-mlogloss:1.04419	eval-mlogloss:1.12187
[499]	train-mlogloss:1.04393	eval-mlogloss:1.12168
[500]	train-mlogloss:1.04366	eval-mlogloss:1.12149
[501]	train-mlogloss:1.04342	eval-mlogloss:1.12131
[502]	train-mlogloss:1.0432	eval-mlogloss:1.12118
[503]	train-mlogloss:1.04297	eval-mlogloss:1.12102
[504]	train-mlogloss:1.04269	eval-m

[647]	train-mlogloss:1.00529	eval-mlogloss:1.09419
[648]	train-mlogloss:1.005	eval-mlogloss:1.09397
[649]	train-mlogloss:1.00472	eval-mlogloss:1.09377
[650]	train-mlogloss:1.00442	eval-mlogloss:1.09355
[651]	train-mlogloss:1.00413	eval-mlogloss:1.09336
[652]	train-mlogloss:1.00384	eval-mlogloss:1.09314
[653]	train-mlogloss:1.0036	eval-mlogloss:1.09298
[654]	train-mlogloss:1.00338	eval-mlogloss:1.09284
[655]	train-mlogloss:1.00316	eval-mlogloss:1.09268
[656]	train-mlogloss:1.00287	eval-mlogloss:1.09248
[657]	train-mlogloss:1.00261	eval-mlogloss:1.0923
[658]	train-mlogloss:1.00236	eval-mlogloss:1.09212
[659]	train-mlogloss:1.00205	eval-mlogloss:1.0919
[660]	train-mlogloss:1.00182	eval-mlogloss:1.09174
[661]	train-mlogloss:1.00154	eval-mlogloss:1.09155
[662]	train-mlogloss:1.00129	eval-mlogloss:1.09137
[663]	train-mlogloss:1.00111	eval-mlogloss:1.09124
[664]	train-mlogloss:1.00088	eval-mlogloss:1.09108
[665]	train-mlogloss:1.0006	eval-mlogloss:1.09088
[666]	train-mlogloss:1.00039	eval-mlo

[806]	train-mlogloss:0.969633	eval-mlogloss:1.0692
[807]	train-mlogloss:0.969474	eval-mlogloss:1.06907
[808]	train-mlogloss:0.969235	eval-mlogloss:1.06892
[809]	train-mlogloss:0.96906	eval-mlogloss:1.06879
[810]	train-mlogloss:0.968902	eval-mlogloss:1.06869
[811]	train-mlogloss:0.968712	eval-mlogloss:1.06855
[812]	train-mlogloss:0.968482	eval-mlogloss:1.06839
[813]	train-mlogloss:0.968256	eval-mlogloss:1.06823
[814]	train-mlogloss:0.968072	eval-mlogloss:1.0681
[815]	train-mlogloss:0.967926	eval-mlogloss:1.06799
[816]	train-mlogloss:0.967711	eval-mlogloss:1.06784
[817]	train-mlogloss:0.967527	eval-mlogloss:1.06772
[818]	train-mlogloss:0.967346	eval-mlogloss:1.0676
[819]	train-mlogloss:0.967178	eval-mlogloss:1.06748
[820]	train-mlogloss:0.966972	eval-mlogloss:1.06734
[821]	train-mlogloss:0.966779	eval-mlogloss:1.06721
[822]	train-mlogloss:0.966648	eval-mlogloss:1.06711
[823]	train-mlogloss:0.9665	eval-mlogloss:1.067
[824]	train-mlogloss:0.966257	eval-mlogloss:1.06684
[825]	train-mlogloss

[965]	train-mlogloss:0.94088	eval-mlogloss:1.0493
[966]	train-mlogloss:0.940659	eval-mlogloss:1.04914
[967]	train-mlogloss:0.94042	eval-mlogloss:1.04897
[968]	train-mlogloss:0.940181	eval-mlogloss:1.0488
[969]	train-mlogloss:0.940044	eval-mlogloss:1.0487
[970]	train-mlogloss:0.939869	eval-mlogloss:1.04858
[971]	train-mlogloss:0.939634	eval-mlogloss:1.04842
[972]	train-mlogloss:0.939441	eval-mlogloss:1.04829
[973]	train-mlogloss:0.939248	eval-mlogloss:1.04816
[974]	train-mlogloss:0.939034	eval-mlogloss:1.048
[975]	train-mlogloss:0.938891	eval-mlogloss:1.0479
[976]	train-mlogloss:0.938659	eval-mlogloss:1.04773
[977]	train-mlogloss:0.9385	eval-mlogloss:1.04763
[978]	train-mlogloss:0.93834	eval-mlogloss:1.0475
[979]	train-mlogloss:0.938175	eval-mlogloss:1.04739
[980]	train-mlogloss:0.938	eval-mlogloss:1.04727
[981]	train-mlogloss:0.937797	eval-mlogloss:1.04713
[982]	train-mlogloss:0.937633	eval-mlogloss:1.04702
[983]	train-mlogloss:0.937464	eval-mlogloss:1.0469
[984]	train-mlogloss:0.93729

Stopping. Best iteration:
         train-mlogloss:0.93429	eval-mlogloss:1.04467

In the cross-validation, the performance level is checked using MAP@7 , the evaluation scale of this competition . It is very important to have the rating scale actually used in cross validation. This is because no matter how much time and effort it takes to optimize, it is meaningless in the end if the evaluation scale used is not improved.

On the MAP@7 rating scale, the highest score may fluctuate depending on the data. The highest score that can be obtained from the validation data of the baseline model is 0.042663. In the code below, if mapk(add_vld_list, add_vld)list, 7, 0.0) is calculated based on the actual correct answer value of the verification data, the above value is obtained. The reason the score is lower than 1 is that all customers in the verification data did not make new purchases. For example, assuming that only 10 out of 100 customers made a new purchase, we would get a 10% MAP@7 score in the end even if we correctly predict those 10 . Therefore, we want to evaluate the performance of the learning model by considering the MAP@7 highest score of the verification data .

In [88]:

# This is a preparatory work for the MAP@7 evaluation scale.
# Extract customer identification number.
vld = trn_final[trn_final['Month_status_date'] == vld_date]
ncodpers_vld = vld['Customer_ID'].values

# Get a new purchase from the verification data.
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev] 
for prod in prods:
    add_vld = vld[prod + '_add' ].values
add_vld_list = [list() for i in range(len(ncodpers_vld))]

# Save the new purchase answer value for each customer in add_vld_list, and store the total count in count_vld.
count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[[ncodper, prod]].all() > 0:   ##Here please check.....might be some mistake...!!
            add_vld_list[ncodper].append(prod)
            count_vld += 1
            

# Find the highest point of MAP@7 that can be obtained from the verification data in advance. (0.042663)
print(mapk(add_vld_list, add_vld_list, 7, 0.0))

# Calculate the predicted value for the verification data.
X_vld = vld[['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag',
 'Join_date_month',
 'Join_date_year',
 'Employee_Index_prev',
 'Customer_country_prev',
 'Sex_prev',
 'Cust_Reln_type_beg_mth_prev',
 'Residence_flag_prev',
 'Forigner_flag_prev',
 'Channel_when_joined_prev',
 'Deceased_flag_prev',
 'Address_detail_prev',
 'Segment_prev',
 'Age_prev',
 'Relnshp_Mnths_prev',
 'Gross_household_income_prev',
 'New_customer_prev',
 'Relnshp_flag_prev',
 'Cust_type_beg_Mth_prev',
 'Activity_flag_prev',
 'Join_date_month_prev',
 'Join_date_year_prev',
 'Saving_account_prev',
 'Guarantees_prev',
 'Cur_account_prev',
 'Derivative_account_prev',
 'Payroll_account_prev',
 'Junior_account_prev',
 'Particular_acct1_prev',
 'Particular_acct2_prev',
 'Particular_acct3_prev',
 'Short_term_deposites_prev',
 'Med_term_deposites_prev',
 'Long_term_deposites_prev',
 'e-account_prev',
 'Funds_prev',
 'Mortgage_prev',
 'Pension_prev',
 'Loans_prev',
 'Taxes_prev',
 'Credit_card_prev',
 'Securities_prev',
 'Home_account_prev',
 'Payroll_prev',
 'Pensions_prev',
 'Direct_debit_prev']].values
Y_vld = XY_trn['y'].values
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
preds_vld = model.predict(dvld, ntree_limit=best_ntree_limit)


# Since the product you owned last month cannot be purchased new, subtract 1 from the probability value in advance.
preds_vld = preds_vld - vld[['Saving_account',
 'Guarantees',
 'Cur_account',
 'Derivative_account',
 'Payroll_account',
 'Junior_account',
 'Particular_acct1',
 'Particular_acct2',
 'Particular_acct3',
 'Short_term_deposites',
 'Med_term_deposites',
 'Long_term_deposites',
 'e-account',
 'Funds',
 'Mortgage',
 'Pension',
 'Loans',
 'Taxes',
 'Credit_card',
 'Securities',
 'Home_account',
 'Payroll',
 'Pensions',
 'Direct_debit']].values

    
# Extract the top 7 prediction data predictions.
result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y,p,ip in y_prods])
    
# Calculate the MAP@7 score from the verification data. (0.036466)
print(mapk(add_vld_list, result_vld, 7, 0.0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


0.0592457852324134
0.016071841326493256


-Predict test data and upload Kaggle

In cross-validation, part of the training data was cut out and used as verification data. It may be a bit cumbersome, but in order to achieve even a little good performance for the test data, the XGBoost model is retrained on the entire data combined with the training data and the verification data. As the parameters of the XGBoost model, the optimal parameters found through cross-validation are used, but the number of trees used in the XGBoost model is increased by the increased verification data.

For all training data, the feature importance of the trained model is output. You can check it through get_fscore(), which is supported by the XGBoost model. What is the most discriminating variable?

Create a submission file for Kaggle submission. According to the rules, the customer identification number (ncodpers) and the names of 7 product variables are entered directly into the submission file with spaces apart.

In [89]:
# Retrain the XGBoost model with full training data!
# Don't run this

X_all = XY[['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag',
 'Join_date_month',
 'Join_date_year',
 'Employee_Index_prev',
 'Customer_country_prev',
 'Sex_prev',
 'Cust_Reln_type_beg_mth_prev',
 'Residence_flag_prev',
 'Forigner_flag_prev',
 'Channel_when_joined_prev',
 'Deceased_flag_prev',
 'Address_detail_prev',
 'Segment_prev',
 'Age_prev',
 'Relnshp_Mnths_prev',
 'Gross_household_income_prev',
 'New_customer_prev',
 'Relnshp_flag_prev',
 'Cust_type_beg_Mth_prev',
 'Activity_flag_prev',
 'Join_date_month_prev',
 'Join_date_year_prev',
 'Saving_account_prev',
 'Guarantees_prev',
 'Cur_account_prev',
 'Derivative_account_prev',
 'Payroll_account_prev',
 'Junior_account_prev',
 'Particular_acct1_prev',
 'Particular_acct2_prev',
 'Particular_acct3_prev',
 'Short_term_deposites_prev',
 'Med_term_deposites_prev',
 'Long_term_deposites_prev',
 'e-account_prev',
 'Funds_prev',
 'Mortgage_prev',
 'Pension_prev',
 'Loans_prev',
 'Taxes_prev',
 'Credit_card_prev',
 'Securities_prev',
 'Home_account_prev',
 'Payroll_prev',
 'Pensions_prev',
 'Direct_debit_prev']].values
Y_all = XY['y'].values
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]

# Increases the number of trees proportionally by the amount of data.
best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))

# Retrain the XGBoost model!
model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)

# Print the importance of the variable. Are the variables you were expecting to be on the top?
print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)




[0]	train-mlogloss:2.70825
[1]	train-mlogloss:2.48608
[2]	train-mlogloss:2.32633
[3]	train-mlogloss:2.20363
[4]	train-mlogloss:2.1005
[5]	train-mlogloss:2.01502
[6]	train-mlogloss:1.93957
[7]	train-mlogloss:1.87634
[8]	train-mlogloss:1.81887
[9]	train-mlogloss:1.76895
[10]	train-mlogloss:1.72365
[11]	train-mlogloss:1.68305
[12]	train-mlogloss:1.64685
[13]	train-mlogloss:1.61372
[14]	train-mlogloss:1.58385
[15]	train-mlogloss:1.5568
[16]	train-mlogloss:1.53216
[17]	train-mlogloss:1.50942
[18]	train-mlogloss:1.48866
[19]	train-mlogloss:1.46945
[20]	train-mlogloss:1.45198
[21]	train-mlogloss:1.43575
[22]	train-mlogloss:1.42114
[23]	train-mlogloss:1.40779
[24]	train-mlogloss:1.39545
[25]	train-mlogloss:1.38384
[26]	train-mlogloss:1.37299
[27]	train-mlogloss:1.36288
[28]	train-mlogloss:1.35369
[29]	train-mlogloss:1.3452
[30]	train-mlogloss:1.33686
[31]	train-mlogloss:1.32927
[32]	train-mlogloss:1.32218
[33]	train-mlogloss:1.31576
[34]	train-mlogloss:1.30947
[35]	train-mlogloss:1.30375
[36]	

[287]	train-mlogloss:1.11705
[288]	train-mlogloss:1.11662
[289]	train-mlogloss:1.11631
[290]	train-mlogloss:1.11593
[291]	train-mlogloss:1.11557
[292]	train-mlogloss:1.11518
[293]	train-mlogloss:1.11482
[294]	train-mlogloss:1.11457
[295]	train-mlogloss:1.11432
[296]	train-mlogloss:1.11397
[297]	train-mlogloss:1.11364
[298]	train-mlogloss:1.11324
[299]	train-mlogloss:1.11282
[300]	train-mlogloss:1.11245
[301]	train-mlogloss:1.11208
[302]	train-mlogloss:1.11179
[303]	train-mlogloss:1.11145
[304]	train-mlogloss:1.11107
[305]	train-mlogloss:1.11078
[306]	train-mlogloss:1.11046
[307]	train-mlogloss:1.11009
[308]	train-mlogloss:1.10971
[309]	train-mlogloss:1.10945
[310]	train-mlogloss:1.10923
[311]	train-mlogloss:1.10885
[312]	train-mlogloss:1.10854
[313]	train-mlogloss:1.10807
[314]	train-mlogloss:1.10777
[315]	train-mlogloss:1.10742
[316]	train-mlogloss:1.10698
[317]	train-mlogloss:1.10665
[318]	train-mlogloss:1.10634
[319]	train-mlogloss:1.10608
[320]	train-mlogloss:1.10576
[321]	train-ml

[571]	train-mlogloss:1.03201
[572]	train-mlogloss:1.03175
[573]	train-mlogloss:1.0315
[574]	train-mlogloss:1.03122
[575]	train-mlogloss:1.03095
[576]	train-mlogloss:1.03062
[577]	train-mlogloss:1.03038
[578]	train-mlogloss:1.03013
[579]	train-mlogloss:1.02988
[580]	train-mlogloss:1.02962
[581]	train-mlogloss:1.02935
[582]	train-mlogloss:1.02915
[583]	train-mlogloss:1.02889
[584]	train-mlogloss:1.02868
[585]	train-mlogloss:1.02839
[586]	train-mlogloss:1.02815
[587]	train-mlogloss:1.02788
[588]	train-mlogloss:1.02764
[589]	train-mlogloss:1.02739
[590]	train-mlogloss:1.02716
[591]	train-mlogloss:1.02698
[592]	train-mlogloss:1.02679
[593]	train-mlogloss:1.02652
[594]	train-mlogloss:1.0262
[595]	train-mlogloss:1.02594
[596]	train-mlogloss:1.02563
[597]	train-mlogloss:1.02534
[598]	train-mlogloss:1.02505
[599]	train-mlogloss:1.02476
[600]	train-mlogloss:1.02449
[601]	train-mlogloss:1.02422
[602]	train-mlogloss:1.02395
[603]	train-mlogloss:1.02377
[604]	train-mlogloss:1.02352
[605]	train-mlog

[850]	train-mlogloss:0.969992
[851]	train-mlogloss:0.969786
[852]	train-mlogloss:0.969549
[853]	train-mlogloss:0.969327
[854]	train-mlogloss:0.969181
[855]	train-mlogloss:0.968998
[856]	train-mlogloss:0.968828
[857]	train-mlogloss:0.968638
[858]	train-mlogloss:0.968428
[859]	train-mlogloss:0.96822
[860]	train-mlogloss:0.968016
[861]	train-mlogloss:0.967813
[862]	train-mlogloss:0.967638
[863]	train-mlogloss:0.96743
[864]	train-mlogloss:0.967255
[865]	train-mlogloss:0.967049
[866]	train-mlogloss:0.966777
[867]	train-mlogloss:0.966551
[868]	train-mlogloss:0.96635
[869]	train-mlogloss:0.966168
[870]	train-mlogloss:0.965954
[871]	train-mlogloss:0.965753
[872]	train-mlogloss:0.965562
[873]	train-mlogloss:0.96537
[874]	train-mlogloss:0.965223
[875]	train-mlogloss:0.96503
[876]	train-mlogloss:0.964788
[877]	train-mlogloss:0.964604
[878]	train-mlogloss:0.964404
[879]	train-mlogloss:0.964264
[880]	train-mlogloss:0.964065
[881]	train-mlogloss:0.963828
[882]	train-mlogloss:0.96368
[883]	train-mlog

[1121]	train-mlogloss:0.924556
[1122]	train-mlogloss:0.924433
[1123]	train-mlogloss:0.924284
[1124]	train-mlogloss:0.924124
[1125]	train-mlogloss:0.924006
[1126]	train-mlogloss:0.923865
[1127]	train-mlogloss:0.923673
[1128]	train-mlogloss:0.923558
[1129]	train-mlogloss:0.923413
[1130]	train-mlogloss:0.923232
[1131]	train-mlogloss:0.923091
[1132]	train-mlogloss:0.922916
[1133]	train-mlogloss:0.922797
[1134]	train-mlogloss:0.922631
[1135]	train-mlogloss:0.922466
[1136]	train-mlogloss:0.922303
[1137]	train-mlogloss:0.922132
[1138]	train-mlogloss:0.921977
[1139]	train-mlogloss:0.921823
[1140]	train-mlogloss:0.921698
[1141]	train-mlogloss:0.921569
[1142]	train-mlogloss:0.921441
[1143]	train-mlogloss:0.921287
[1144]	train-mlogloss:0.921161
[1145]	train-mlogloss:0.921036
[1146]	train-mlogloss:0.920902
[1147]	train-mlogloss:0.920776
[1148]	train-mlogloss:0.92063
[1149]	train-mlogloss:0.920491
[1150]	train-mlogloss:0.920392
[1151]	train-mlogloss:0.920257
[1152]	train-mlogloss:0.920076
[1153]	tr

In [90]:
len(prods)

24

In [91]:
tst_final.head()

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_detail,Activity_flag,Gross_household_income,Segment,Saving_account,Guarantees,Cur_account,Derivative_account,Payroll_account,Junior_account,Particular_acct1,Particular_acct2,Particular_acct3,Short_term_deposites,Med_term_deposites,Long_term_deposites,e-account,Funds,Mortgage,Pension,Loans,Taxes,Credit_card,Securities,Home_account,Payroll,Pensions,Direct_debit,Join_date_month,Join_date_year,int_date,Month_status_date_prev,Employee_Index_prev,Customer_country_prev,Sex_prev,Age_prev,Join_date_prev,New_customer_prev,Relnshp_Mnths_prev,Relnshp_flag_prev,Cust_type_beg_Mth_prev,Cust_Reln_type_beg_mth_prev,Residence_flag_prev,Forigner_flag_prev,Channel_when_joined_prev,Deceased_flag_prev,Address_detail_prev,Activity_flag_prev,Gross_household_income_prev,Segment_prev,Saving_account_prev,Guarantees_prev,Cur_account_prev,Derivative_account_prev,Payroll_account_prev,Junior_account_prev,Particular_acct1_prev,Particular_acct2_prev,Particular_acct3_prev,Short_term_deposites_prev,Med_term_deposites_prev,Long_term_deposites_prev,e-account_prev,Funds_prev,Mortgage_prev,Pension_prev,Loans_prev,Taxes_prev,Credit_card_prev,Securities_prev,Home_account_prev,Payroll_prev,Pensions_prev,Direct_debit_prev,Join_date_month_prev,Join_date_year_prev
11091070,2016-06-28,15889,3,0,1,56,1995-01-16,0,256,1,1,0,0,0,5,0,18,1,326124.9,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1995,18,2015-06-28,3.0,0.0,1.0,56.0,1995-01-16,0.0,245.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,18.0,1.0,326124.9,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1995.0
11091071,2016-06-28,1170544,0,0,0,36,2013-08-28,0,34,1,1,1,0,0,5,0,19,0,67337.88,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2013,18,2015-06-28,0.0,0.0,0.0,35.0,2013-08-28,0.0,23.0,1.0,1.0,0.0,0.0,0.0,5.0,0.0,19.0,0.0,67526.28,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11091072,2016-06-28,1170545,0,0,1,22,2013-08-28,0,34,1,1,0,0,0,1,0,28,1,97392.27,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2013,18,2015-06-28,0.0,0.0,1.0,21.0,2013-08-28,0.0,23.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,28.0,0.0,97689.3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11091073,2016-06-28,1170547,0,0,0,22,2013-08-28,0,34,1,1,1,0,0,1,0,9,0,148402.98,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2013,18,2015-06-28,0.0,0.0,0.0,21.0,2013-08-28,0.0,23.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,9.0,0.0,148402.98,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0
11091074,2016-06-28,1170548,0,0,0,22,2013-08-28,0,34,1,1,1,0,0,1,0,23,0,106885.8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,2013,18,2015-06-28,0.0,0.0,0.0,21.0,2013-08-28,0.0,23.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,23.0,0.0,106885.8,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2013.0


In [92]:
# Calculate predicted values for test data for Kaggle submission.
X_tst = tst_final[['Employee_Index',
 'Customer_country',
 'Sex',
 'Cust_Reln_type_beg_mth',
 'Residence_flag',
 'Forigner_flag',
 'Channel_when_joined',
 'Deceased_flag',
 'Address_detail',
 'Segment',
 'Age',
 'Relnshp_Mnths',
 'Gross_household_income',
 'New_customer',
 'Relnshp_flag',
 'Cust_type_beg_Mth',
 'Activity_flag',
 'Join_date_month',
 'Join_date_year',
 'Employee_Index_prev',
 'Customer_country_prev',
 'Sex_prev',
 'Cust_Reln_type_beg_mth_prev',
 'Residence_flag_prev',
 'Forigner_flag_prev',
 'Channel_when_joined_prev',
 'Deceased_flag_prev',
 'Address_detail_prev',
 'Segment_prev',
 'Age_prev',
 'Relnshp_Mnths_prev',
 'Gross_household_income_prev',
 'New_customer_prev',
 'Relnshp_flag_prev',
 'Cust_type_beg_Mth_prev',
 'Activity_flag_prev',
 'Join_date_month_prev',
 'Join_date_year_prev',
 'Saving_account_prev',
 'Guarantees_prev',
 'Cur_account_prev',
 'Derivative_account_prev',
 'Payroll_account_prev',
 'Junior_account_prev',
 'Particular_acct1_prev',
 'Particular_acct2_prev',
 'Particular_acct3_prev',
 'Short_term_deposites_prev',
 'Med_term_deposites_prev',
 'Long_term_deposites_prev',
 'e-account_prev',
 'Funds_prev',
 'Mortgage_prev',
 'Pension_prev',
 'Loans_prev',
 'Taxes_prev',
 'Credit_card_prev',
 'Securities_prev',
 'Home_account_prev',
 'Payroll_prev',
 'Pensions_prev',
 'Direct_debit_prev']].values
dtst = xgb.DMatrix(X_tst, feature_names=features)

ncodpers_tst = tst_final['Customer_ID'].values




In [93]:
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)

In [94]:
preds_tst

array([[6.2683839e-06, 3.5053850e-05, 3.2538825e-05, ..., 6.9934293e-03,
        9.3960967e-03, 7.8598911e-01],
       [5.6212293e-06, 2.4585427e-06, 1.4939081e-05, ..., 6.4773019e-03,
        5.3348811e-03, 3.3381757e-01],
       [4.9621290e-06, 4.5291740e-06, 1.4509635e-05, ..., 6.9556691e-02,
        8.2631074e-02, 6.4632791e-01],
       ...,
       [7.0372471e-06, 2.4279798e-06, 3.1417203e-05, ..., 3.2085784e-02,
        2.5130510e-02, 3.1063491e-01],
       [7.1773688e-06, 1.3571422e-05, 2.1059739e-05, ..., 4.5976648e-03,
        4.8623462e-03, 3.3530325e-01],
       [1.2284044e-05, 4.2635425e-06, 4.1497216e-05, ..., 1.6737445e-03,
        1.1427888e-02, 1.0965789e-01]], dtype=float32)

In [95]:
trn.columns = ['Month_status_date', 'Customer_ID', 'Employee_Index', 'Customer_country', 'Sex', 'Age', 'Join_date',
                'New_customer', 'Relnshp_Mnths', 'Relnshp_flag', 'Cust_type_beg_Mth', 'Cust_Reln_type_beg_mth',
                'Residence_flag', 'Forigner_flag', 'Channel_when_joined', 'Deceased_flag', 
                'Address_type', 'Customer_address', 'Address_detail', 'Activity_flag', 'Gross_household_income',
                'Segment', 'ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1',
               'ind_ctju_fin_ult1','ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_deco_fin_ult1',
               'ind_deme_fin_ult1','ind_dela_fin_ult1','ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1',
               'ind_plan_fin_ult1','ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1',
               'ind_viv_fin_ult1','ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1']

In [96]:
trn

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_type,Customer_address,Address_detail,Activity_flag,Gross_household_income,Segment,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,2015-01-28,1375586,N,ES,H,35,2015-01-12,0,6,1,UNKNOWN,1,A,S,N,UNKNOWN,KHL,N,MALAGA,1,87218.10,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2015-01-28,1050611,N,ES,V,23,2012-08-10,0,35,1,UNKNOWN,1,I,S,S,UNKNOWN,KHE,N,CIUDAD REAL,0,35548.74,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2015-01-28,1050612,N,ES,V,23,2012-08-10,0,35,1,UNKNOWN,1,I,S,N,UNKNOWN,KHE,N,CIUDAD REAL,0,122179.11,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2015-01-28,1050613,N,ES,H,22,2012-08-10,0,35,1,UNKNOWN,1,I,S,N,UNKNOWN,KHD,N,ZARAGOZA,0,119775.54,03 - UNIVERSITARIO,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2015-01-28,1050614,N,ES,V,23,2012-08-10,0,35,1,UNKNOWN,1,A,S,N,UNKNOWN,KHE,N,ZARAGOZA,1,99950.28,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13647304,2016-05-28,1166765,N,ES,V,22,2013-08-14,0,33,1,UNKNOWN,1,I,S,N,UNKNOWN,KHE,N,ZARAGOZA,0,43912.17,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13647305,2016-05-28,1166764,N,ES,V,23,2013-08-14,0,33,1,UNKNOWN,1,I,S,N,UNKNOWN,KHE,N,"RIOJA, LA",0,23334.99,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13647306,2016-05-28,1166763,N,ES,H,47,2013-08-14,0,33,1,UNKNOWN,1,A,S,N,UNKNOWN,KHE,N,ZARAGOZA,1,99950.28,02 - PARTICULARES,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13647307,2016-05-28,1166789,N,ES,H,22,2013-08-14,0,33,1,UNKNOWN,1,I,S,N,UNKNOWN,KHE,N,ZARAGOZA,0,199592.82,03 - UNIVERSITARIO,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [97]:
prods = trn.columns[22:].tolist()

In [98]:
prods

['ind_ahor_fin_ult1',
 'ind_aval_fin_ult1',
 'ind_cco_fin_ult1',
 'ind_cder_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_ctju_fin_ult1',
 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1',
 'ind_ctpp_fin_ult1',
 'ind_deco_fin_ult1',
 'ind_deme_fin_ult1',
 'ind_dela_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1',
 'ind_plan_fin_ult1',
 'ind_pres_fin_ult1',
 'ind_reca_fin_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_viv_fin_ult1',
 'ind_nomina_ult1',
 'ind_nom_pens_ult1',
 'ind_recibo_ult1']

In [99]:
tst_final.columns

Index(['Month_status_date', 'Customer_ID', 'Employee_Index',
       'Customer_country', 'Sex', 'Age', 'Join_date', 'New_customer',
       'Relnshp_Mnths', 'Relnshp_flag', 'Cust_type_beg_Mth',
       'Cust_Reln_type_beg_mth', 'Residence_flag', 'Forigner_flag',
       'Channel_when_joined', 'Deceased_flag', 'Address_detail',
       'Activity_flag', 'Gross_household_income', 'Segment', 'Saving_account',
       'Guarantees', 'Cur_account', 'Derivative_account', 'Payroll_account',
       'Junior_account', 'Particular_acct1', 'Particular_acct2',
       'Particular_acct3', 'Short_term_deposites', 'Med_term_deposites',
       'Long_term_deposites', 'e-account', 'Funds', 'Mortgage', 'Pension',
       'Loans', 'Taxes', 'Credit_card', 'Securities', 'Home_account',
       'Payroll', 'Pensions', 'Direct_debit', 'Join_date_month',
       'Join_date_year', 'int_date', 'Month_status_date_prev',
       'Employee_Index_prev', 'Customer_country_prev', 'Sex_prev', 'Age_prev',
       'Join_date_prev', 'New

In [100]:
for prod in prods:
    preds_tst = preds_tst - tst_final[['Saving_account_prev',
       'Guarantees_prev', 'Cur_account_prev', 'Derivative_account_prev',
       'Payroll_account_prev', 'Junior_account_prev', 'Particular_acct1_prev',
       'Particular_acct2_prev', 'Particular_acct3_prev',
       'Short_term_deposites_prev', 'Med_term_deposites_prev',
       'Long_term_deposites_prev', 'e-account_prev', 'Funds_prev',
       'Mortgage_prev', 'Pension_prev', 'Loans_prev', 'Taxes_prev',
       'Credit_card_prev', 'Securities_prev', 'Home_account_prev',
       'Payroll_prev', 'Pensions_prev', 'Direct_debit_prev']].values


# Create a submission file.
submit_file = open('Final_Lag_12.csv', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

In [101]:
for prod in prods:
    preds_tst = preds_tst - tst_final[['Saving_account_prev',
       'Guarantees_prev', 'Cur_account_prev', 'Derivative_account_prev',
       'Payroll_account_prev', 'Junior_account_prev', 'Particular_acct1_prev',
       'Particular_acct2_prev', 'Particular_acct3_prev',
       'Short_term_deposites_prev', 'Med_term_deposites_prev',
       'Long_term_deposites_prev', 'e-account_prev', 'Funds_prev',
       'Mortgage_prev', 'Pension_prev', 'Loans_prev', 'Taxes_prev',
       'Credit_card_prev', 'Securities_prev', 'Home_account_prev',
       'Payroll_prev', 'Pensions_prev', 'Direct_debit_prev']].values


# Create a submission file.
submit_file = open('Final_Lag_12.csv', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))

In [102]:
model.save_model('Santander_Final.json')

In [103]:
# dump model
model.dump_model('Santander_Final.raw.txt')
# dump model with feature map
#xgbModel.dump_model('Santander.raw.txt', 'Santander_feature.txt')

In [104]:
import pickle 
  
# Save the trained model as a pickle string. 
saved_model = pickle.dumps(model) 
  
# Load the pickled model 
knn_from_pickle = pickle.loads(saved_model) 
  
# Use the loaded pickled model to make predictions 
#knn_from_pickle.predict(X_test) 

[23:51:28] C:\Users\Administrator\Desktop\xgboost\src\learner.cc:362: Parameter 'gpu_id' has been recovered from the saved model. It will be set to 0 for prediction. To override the predictor behavior, explicitly set 'gpu_id' parameter as follows:
  * Python package: bst.set_param('gpu_id', [new value])
  * R package:      xgb.parameters(bst) <- list(gpu_id = [new value])
  * JVM packages:   bst.setParam("gpu_id", [new value])
[23:51:28] C:\Users\Administrator\Desktop\xgboost\src\learner.cc:362: Parameter 'predictor' has been recovered from the saved model. It will be set to 'gpu_predictor' for prediction. To override the predictor behavior, explicitly set 'predictor' parameter as follows:
  * Python package: bst.set_param('predictor', [new value])
  * R package:      xgb.parameters(bst) <- list(predictor = [new value])
  * JVM packages:   bst.setParam("predictor", [new value])


In [105]:
import joblib 
  
# Save the model as a pickle in a file 
joblib.dump(model, 'Santander_Pickle_lag12.pkl') 
  
# Load the model from the file 
#knn_from_joblib = joblib.load('filename.pkl') 

['Santander_Pickle_lag12.pkl']

In [106]:
# Create a submission file.
#submit_file = open('Improve_Submission', 'w')
#submit_file.write('ncodpers,added_products\n')


In [107]:
#pred_pppp=np.fliplr(preds_ttt)[:,:7]

In [108]:
#pred_pppp

In [109]:
#test_id = np.array(pd.read_csv(data_path + "test_ver2.csv", usecols=['ncodpers'])['ncodpers'])
#final_preds = [" ".join(list(target_cols[pred])) for pred in preds]
#out_df = pd.DataFrame({'ncodpers':test_id, 'added_products':final_preds})

In [110]:
#test_preds = [] #Saving test preds
#for row in sample.values:
#    id = row[0]
#    p = train_preds[id]
#    test_preds.append(' '.join(p)) ##Join

In [111]:
#XY_trn.head()

In [112]:
trn[trn['New_customer'] == 1]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_type,Customer_address,Address_detail,Activity_flag,Gross_household_income,Segment,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
261,2015-01-28,1050741,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1029,2015-01-28,1051017,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1063,2015-01-28,1051064,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1154,2015-01-28,1051387,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1779,2015-01-28,1048660,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13641337,2016-05-28,1168909,N,ES,V,43,2013-08-23,1,0,1,UNKNOWN,3,P,S,N,UNKNOWN,UNKNOWN,N,PONTEVEDRA,1,97829.10,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13642462,2016-05-28,1173729,N,ES,H,33,2013-09-09,1,1,1,UNKNOWN,1,A,S,S,UNKNOWN,UNKNOWN,N,CORDOBA,1,69106.89,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13642971,2016-05-28,1172024,N,ES,H,42,2016-01-12,1,4,1,UNKNOWN,1,A,S,N,UNKNOWN,KHM,N,BARCELONA,1,131214.48,02 - PARTICULARES,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1
13646506,2016-05-28,1166395,N,ES,V,35,2013-08-14,1,3,1,UNKNOWN,1,A,S,N,UNKNOWN,KHN,N,BARCELONA,1,204208.02,01 - TOP,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [113]:
trn[trn['Customer_ID'] == 1051064]

Unnamed: 0,Month_status_date,Customer_ID,Employee_Index,Customer_country,Sex,Age,Join_date,New_customer,Relnshp_Mnths,Relnshp_flag,Cust_type_beg_Mth,Cust_Reln_type_beg_mth,Residence_flag,Forigner_flag,Channel_when_joined,Deceased_flag,Address_type,Customer_address,Address_detail,Activity_flag,Gross_household_income,Segment,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,ind_deme_fin_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_fond_fin_ult1,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1063,2015-01-28,1051064,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1243712,2015-02-28,1051064,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1848166,2015-03-28,1051064,N,ES,UNKNOWN,40,2011-09-06,1,0,1,UNKNOWN,P,A,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,N,UNKNOWN,0,137860.11,UNKNOWN,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [114]:
rr = pd.read_csv('Final_Lag_12.csv')

In [115]:
rr.shape

(929615, 2)