In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt,gc,os
from tqdm.auto import tqdm
import itertools
tqdm.pandas()
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#print('RAPIDS version',cudf.__version__)

In [2]:
# Version tag used when naming saved model files
VER = 1

# Random seed used for training
SEED = 42

# Value substituted for NaNs (small enough to fit in int8)
NAN_VALUE = 0

# Number of folds per model
FOLDS = 5

# Backend switch; `cudf` is aliased to pandas so cudf-style calls run on pandas
GPU = False
cudf = pd

# How many parts/sections the data is processed in.
# Only the train part count depends on the backend; the test settings are shared.
TRAIN_NUM_PARTS = 5 if GPU else 10
TEST_SECTIONS = 2
TEST_NUM_PARTS = 2

In [3]:
def read_file(path="",usecols=None):
    """Load a parquet file and replace ``customer_ID`` with an int64 key.

    Parameters
    ----------
    path : str
        Location of the parquet file to read.
    usecols : list of str, optional
        Columns to load. ``None`` loads every column.

    Returns
    -------
    DataFrame
        The loaded frame, with ``customer_ID`` rebuilt from the last 16
        characters of the original string parsed as a base-16 integer.
    """
    # columns=None is read_parquet's default, so one call covers both cases.
    df = pd.read_parquet(path, columns=usecols)

    # Only the trailing 16 hex characters of the ID are kept and parsed.
    id_tail = df['customer_ID'].str[-16:]
    if GPU:
        # NOTE(review): hex_to_int looks like a cudf-only string method - confirm
        # that `df` is a cudf frame whenever GPU is True.
        id_numeric = id_tail.str.hex_to_int()
    else:
        id_numeric = id_tail.apply(int, base=16)
    df['customer_ID'] = id_numeric.astype('int64')

    print('shape of data:', df.shape)

    return df

In [4]:
## CALCULATE SIZE OF EACH SEPARATE PART
def get_rows(customers,df,NUM_PARTS=4,verbose=''):
    """Count how many rows of ``df`` belong to each consecutive chunk of customers.

    ``customers`` is cut into ``NUM_PARTS`` consecutive slices of
    ``len(customers)//NUM_PARTS`` entries each; the last slice also takes any
    remainder.

    Parameters
    ----------
    customers : sequence
        Customer ids, sliced positionally.
    df : DataFrame
        Frame with a ``customer_ID`` column.
    NUM_PARTS : int
        Number of slices to build.
    verbose : str
        Label used in the progress messages; an empty string prints nothing.

    Returns
    -------
    (list of int, int)
        Row count per slice, and the slice size in customers.
    """
    chunk = len(customers)//NUM_PARTS
    show = verbose != ''
    if show:
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    # (start, stop) positions per part; stop=None leaves the final slice open-ended.
    final_part = NUM_PARTS - 1
    bounds = [
        (part*chunk, None if part == final_part else (part+1)*chunk)
        for part in range(NUM_PARTS)
    ]
    rows = [
        df.loc[df.customer_ID.isin(customers[start:stop])].shape[0]
        for start, stop in bounds
    ]

    if show:
        print( rows )
    return rows,chunk


def getAndProcessDataInChunks(filename,is_train=False,NUM_PARTS=4,NUM_SECTIONS=1,split_k=0,verbose=''):
    """Feature-engineer a parquet file part by part and return the combined frame.

    The customers found in ``filename`` are split (via ``get_rows``) into
    ``NUM_PARTS*NUM_SECTIONS`` consecutive chunks. When ``NUM_SECTIONS > 1``
    only the ``NUM_PARTS`` chunks belonging to section ``split_k`` are
    processed by this call. Each chunk is passed through
    ``process_and_feature_engineer`` and the results are concatenated row-wise.

    Parameters
    ----------
    filename : str
        Path of the parquet file to process.
    is_train : bool
        When True, labels from ``train_labels.csv`` are attached to every part,
        and the final frame is sorted by index and its index is reset.
    NUM_PARTS : int
        Number of parts processed by this call.
    NUM_SECTIONS : int
        Number of sections the file is divided into.
    split_k : int
        Zero-based section to process; ignored when ``NUM_SECTIONS <= 1``.
    verbose : str
        Label used in progress messages.

    Returns
    -------
    DataFrame
        Concatenation of all processed parts.

    Raises
    ------
    ValueError
        If ``NUM_SECTIONS > 1`` and ``split_k`` is not in ``[0, NUM_SECTIONS)``.
    """
    # Fail fast: an out-of-range section used to fall through to empty slices.
    if NUM_SECTIONS > 1 and not 0 <= split_k < NUM_SECTIONS:
        raise ValueError(f'split_k must be in [0, {NUM_SECTIONS}), got {split_k}')

    gc.collect()

    print(f'Reading customer_IDs from {verbose} data..')
    df = read_file(path = filename,usecols =['customer_ID','S_2'])
    customers = df[['customer_ID']].drop_duplicates().sort_index().values.flatten()
    total_parts = NUM_PARTS*NUM_SECTIONS
    rows,num_cust = get_rows(customers,df[['customer_ID']],NUM_PARTS=total_parts,verbose=verbose)

    # The id-only frame is no longer needed; free it before loading the full file.
    del df
    gc.collect()

    print(f'\nReading {verbose} data...')
    df_file = read_file(path = filename)

    if is_train:
        targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
        # Same id conversion as read_file so the label index matches the features.
        if GPU:
            targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
        else:
            targets['customer_ID'] = targets['customer_ID'].str[-16:].apply(int, base=16).astype('int64')

        targets = targets.set_index('customer_ID')
        targets.target = targets.target.astype('int8')

    # Position, among all NUM_PARTS*NUM_SECTIONS parts, of the first part handled here.
    first_part = split_k*NUM_PARTS if NUM_SECTIONS > 1 else 0

    if NUM_SECTIONS > 1:
        # Keep only the rows (and per-part row counts) of the requested section.
        startRow = sum(rows[:first_part])
        rows = rows[first_part:first_part+NUM_PARTS]
        df_file = df_file.iloc[startRow:startRow+sum(rows)].reset_index(drop=True)
        gc.collect()

    # INFER DATA IN PARTS
    skip_rows = 0
    # Labels are sliced by position, so start at this section's first customer
    # rather than at 0; otherwise sections after the first get the wrong labels.
    # NOTE(review): positional slicing assumes train_labels.csv lists customers
    # in the same order they first appear in the data file - confirm.
    skip_cust = first_part*num_cust
    allData = []

    for k in range(NUM_PARTS):
        # READ PART OF DATA
        df = df_file.iloc[skip_rows:skip_rows+rows[k]]
        skip_rows+= rows[k]
        print(f'=> {verbose} part {k+1} has shape', df.shape )

        # PROCESS AND FEATURE ENGINEER PART OF DATA
        df = process_and_feature_engineer(df)

        if is_train:

            # Only the globally last part absorbs the remainder customers; the last
            # part of an earlier section must not swallow later sections' labels.
            if first_part+k==total_parts-1: targetSlice = targets.iloc[skip_cust:]
            else: targetSlice = targets.iloc[skip_cust:skip_cust+num_cust]
            skip_cust += num_cust

            print("|...")
            # axis=1 concat aligns on the index (customer_ID) of both frames.
            df = cudf.concat([df,targetSlice],axis=1)
            print(" ...|")

            print('Targetslice Merged')

        if GPU:
            df = df.to_pandas()

        allData.append(df)
        print('Appended')
        gc.collect()


    print(".", end='')
    del df_file
    gc.collect()
    allData = pd.concat(allData, axis=0)
    del df
    gc.collect()
    if is_train:
        print(".", end='')
        allData = allData.sort_index()
        gc.collect()
        print(".", end='')
        allData = allData.reset_index()
    print("|")
    return allData

In [5]:
def process_and_feature_engineer(train):
    
    
    year = pd.to_numeric(train['S_2'].str[:4])
    month = pd.to_numeric(train['S_2'].str[5:7])
    
    train.S_2 = pd.to_datetime( train.S_2 )

    train["SDist"]=train.groupby("customer_ID")["S_2"].diff() / np.timedelta64(1, 'D')
    # Impute with average distance 30.53 days
    train['SDist'].fillna(30.53, inplace=True)
    
    train['S_2'] = year.mul(12).add(month).sub(24207).astype('int8')
    
    ###############################################################################
    
    train.drop(["B_29","S_9"], axis=1, inplace = True)
    
    ## For each customer, count all NaN in any row, and count all NaN in the last row. Later we will add it as two columns
    df_nan = (train.mul(0) + 1).fillna(0)
    df_nan['customer_ID'] = train['customer_ID']
    nan_sum = df_nan.groupby("customer_ID").sum().sum(axis=1)
    nan_last = df_nan.groupby("customer_ID").last().sum(axis=1)
    del df_nan
    gc.collect()
    
    ########################################################################
    
    #https://www.kaggle.com/competitions/amex-default-prediction/discussion/328514
    train.loc[(train.R_13==0) & (train.R_17==0) & (train.R_20==0) & (train.R_8==0), 'R_6'] = 0
    train.loc[train.B_39==-1, 'B_36'] = 0
    
    all_cols = [c for c in list(train.columns) if c not in  ['customer_ID','S_2']]
    cat_features = ['B_30','B_38','D_114','D_117','D_120', 'D_66','D_64','D_63', 'D_126']
    num_features = [col for col in all_cols if col not in cat_features]
    
    ######################################################################
    
    delta_cols = ['S_27', 'S_7', 'S_5', 'R_5', 'S_11', 'D_142', 'D_43', 'D_136', 'S_22', 'D_110', 'R_28', 'B_6', 
                  'D_102', 'D_130', 'D_46', 'D_137', 'B_1', 'D_96', 'R_14', 'D_72', 'S_12', 'S_13', 'D_88', 'B_24',
                  'S_25', 'D_44', 'B_9', 'B_17', 'B_42', 'D_53', 'D_58', 'R_11', 'D_138', 'B_4', 'B_3', 'R_16', 
                  'SDist', 'D_48', 'D_84', 'R_7', 'B_2', 'D_39', 'D_41', 'D_82', 'D_49', 'B_16']
    
    ### Add delta
    test_num_agg2 = train.groupby("customer_ID")[delta_cols].nth(-1) - train.groupby("customer_ID")[delta_cols].nth(-2)
    test_num_agg2 = test_num_agg2.fillna(0)
    test_num_agg2.columns = [x + '_delta' for x in test_num_agg2.columns]
    
    print('Delta features Done')
    
    #####################################################################
    
    test_s2_agg = train.groupby("customer_ID")[["S_2"]].agg(['min', 'max'])
    test_s2_agg.columns = ['_'.join(x) for x in test_s2_agg.columns]
    test_s2_agg['S_2_min'] = test_s2_agg['S_2_min'] + 12 - test_s2_agg['S_2_max']
    test_s2_agg.drop(['S_2_max'],inplace=True,axis=1)
    
    #########################################################################
    
    ####### NUM AGG FEATURES ######
    
    std_cols = ['S_16', 'D_108', 'D_39', 'D_141', 'S_22', 'R_8', 'B_40', 'P_2', 'D_55', 'R_10', 'D_65', 'D_106', 
                'D_81', 'B_3', 'D_138', 'S_8', 'D_75', 'D_94', 'B_15', 'B_10', 'B_27', 'D_82', 'D_78', 'B_5', 'B_6',
                'B_9', 'D_110', 'D_88', 'D_83', 'B_21', 'D_130', 'B_42', 'R_28', 'D_124', 'R_23', 'D_96', 'D_144', 
                'B_37', 'R_15', 'B_7', 'D_49', 'D_69', 'D_115', 'R_12', 'D_112', 'D_92', 'D_72', 'S_24', 'S_12', 
                'D_125', 'S_5', 'D_51', 'D_123', 'B_4', 'R_14', 'B_19', 'S_13', 'B_2', 'B_17', 'D_41', 'B_14', 'B_41',
                'B_39', 'D_47', 'S_20', 'R_3', 'D_87', 'D_84', 'S_15', 'B_12', 'D_137', 'B_23', 'SDist', 'D_45', 
                'S_18', 'R_20', 'P_3', 'R_7', 'D_61', 'B_16', 'S_23']

    test_num_agg_std = train.groupby(by=['customer_ID'])[std_cols].std()
    test_num_agg_std.columns = [x+'_std' for x in test_num_agg_std.columns]
    
    max_cols = ['D_58', 'R_12', 'B_28', 'B_40', 'S_23', 'B_2', 'D_47', 'D_134', 'D_94', 'R_5', 'D_62', 'S_26', 
                'D_42', 'D_79', 'S_11', 'R_14', 'B_41', 'D_50', 'D_60', 'S_22', 'B_11', 'D_75', 'D_51', 'R_28', 
                'D_61', 'D_132', 'P_3', 'SDist', 'D_76', 'S_20', 'D_92', 'D_121', 'D_43', 'D_127', 'R_21', 'B_21', 
                'B_5', 'D_48', 'S_24', 'D_84', 'D_55', 'S_15', 'D_110', 'D_56', 'D_133', 'S_12', 'D_65', 'D_54', 
                'R_9', 'B_42', 'S_18', 'D_93', 'D_111', 'D_53', 'D_82', 'D_59', 'B_4', 'R_23', 'D_108', 'D_70', 
                'D_96', 'D_141', 'B_15', 'D_77', 'B_1', 'D_125', 'D_102', 'D_41', 'D_49', 'D_80', 'B_13', 'D_105', 
                'D_86', 'D_142', 'D_52', 'D_39', 'B_22', 'D_45', 'S_8', 'B_3', 'D_122', 'D_69', 'S_6', 'D_123', 
                'B_12', 'B_14', 'D_124','P_2']
    
    test_num_agg_max = train.groupby(by=['customer_ID'])[max_cols].max()
    test_num_agg_max.columns = [x+'_max' for x in test_num_agg_max.columns]
    
    min_cols = ['R_27', 'D_46', 'S_7', 'D_118', 'R_12', 'B_11', 'S_5', 'B_5', 'D_76', 'D_47', 'D_119', 'R_2', 
                'D_70', 'D_48', 'B_16', 'R_28', 'D_53', 'D_111', 'D_51', 'B_13', 'P_4', 'D_56', 'S_19', 'B_6',
                'D_135', 'R_4', 'D_52', 'S_23', 'D_141', 'R_9', 'D_61', 'S_15', 'D_109', 'S_12', 'B_39', 'D_62',
                'D_132', 'D_112', 'B_20', 'R_6', 'D_110', 'D_104', 'B_32', 'D_91', 'D_140', 'B_40', 'B_9', 'R_8',
                'D_92', 'S_3', 'B_18', 'D_42', 'D_93', 'B_31', 'P_2', 'D_125', 'SDist', 'B_42', 'S_8', 'R_5', 
                'R_11', 'B_33', 'D_83', 'R_23', 'D_127', 'S_25', 'D_121', 'D_94', 'D_65', 'S_2', 'B_8', 'S_11', 
                'D_134', 'B_17', 'D_39', 'S_6', 'D_45', 'D_122', 'D_59', 'D_84']

    test_num_agg_min = train.groupby(by=['customer_ID'])[min_cols].min()
    test_num_agg_min.columns = [x+'_min' for x in test_num_agg_min.columns]
    
    quantile_cols = ['D_127', 'B_22', 'D_76', 'R_14', 'D_121', 'S_12', 'B_2', 'D_56', 'D_50', 'D_47', 'SDist', 
                     'R_5', 'R_4', 'D_87', 'D_128', 'D_139', 'D_45', 'D_48', 'D_59', 'S_18', 'D_65', 'S_15', 
                     'D_141', 'S_13', 'D_143', 'B_13', 'D_119', 'S_26', 'D_43', 'D_134', 'D_51', 'R_7', 'B_17', 
                     'D_94', 'B_3', 'D_122', 'S_3', 'D_91', 'D_72', 'P_4', 'B_40', 'R_8', 'S_24', 'B_10', 'D_60', 
                     'D_71', 'B_42', 'D_111', 'D_110', 'B_9', 'D_61', 'P_2', 'R_26', 'D_96', 'D_70', 'B_14', 'D_75',
                     'D_92', 'D_74', 'R_25', 'D_107', 'D_42', 'R_9', 'D_62', 'R_27', 'D_93', 'D_125', 'B_32', 'B_5',
                     'D_105', 'R_24', 'D_44']
    
    test_num_agg_quantile = train.groupby(by=['customer_ID'])[quantile_cols].quantile()
    test_num_agg_quantile.columns = [x+'_quantile' for x in test_num_agg_quantile.columns]
    
    last_cols = ['D_83', 'B_9', 'R_5', 'R_11', 'R_16', 'D_50', 'S_25', 'D_111', 'B_4', 'D_76', 'R_2', 
                 'S_24', 'S_3', 'P_2', 'D_119', 'B_15', 'R_9', 'B_24', 'D_140', 'D_52', 'SDist', 'D_79', 
                 'B_23', 'D_127', 'D_49', 'B_1', 'D_59', 'S_26', 'R_20', 'B_18', 'S_23', 'D_91', 'D_139', 'S_13',
                 'R_24', 'D_138', 'D_45', 'B_17', 'D_94', 'B_41', 'D_56', 'D_47', 'D_78', 'D_135', 'R_8', 'D_43', 
                 'R_3', 'D_86', 'B_40', 'S_8', 'B_3', 'D_41', 'S_27', 'S_12', 'D_124', 'D_54', 'R_18', 'B_20', 
                 'D_110', 'D_121', 'D_62', 'D_82', 'B_25', 'D_92', 'B_5', 'R_12', 'B_39', 'D_108',
                 'S_7', 'B_11', 'D_75', 'B_42', 'B_22', 'B_14', 'R_23', 'D_42', 'R_27', 'D_112', 'B_2', 
                 'D_93', 'D_122', 'D_133', 'D_106', 'R_25', 'B_33', 'D_141', 'B_6', 'D_51', 'D_96',
                 'S_6', 'D_46', 'B_31', 'S_20', 'R_7', 'B_7', 'B_10', 'R_1', 'D_60', 'D_61', 'B_13', 'R_10', 'D_129',
                 'D_53', 'R_4', 'D_39','R_14', 'S_16', 'B_28', 'D_58', 'D_48', 'D_55', 'D_88', 'D_69', 'S_5','D_44',
                 'P_3','D_131','B_37','D_144','D_102','B_8','D_73','S_17','D_130','S_22']
    
    test_num_agg_last = train.groupby(by=['customer_ID'])[last_cols].last()
    test_num_agg_last.columns = [x+'_last' for x in test_num_agg_last.columns]
    
    mean_cols = ['B_20', 'R_23', 'D_59', 'D_140', 'D_75', 'D_106', 'R_8', 'D_130', 'R_26', 'D_74', 'R_15', 'D_52', 
                 'D_54', 'R_9', 'D_121', 'D_113', 'S_26', 'B_14', 'D_50', 'S_12', 'S_15', 'D_93', 'B_32', 'D_73',
                 'D_111', 'D_108', 'D_77', 'R_21', 'SDist', 'B_9', 'B_17', 'D_141', 'S_5', 'D_96', 'B_21', 'R_1', 
                 'D_48', 'D_61', 'R_5', 'S_16', 'D_92', 'D_91', 'D_45', 'B_40', 'D_110', 'B_18', 'D_82', 'R_27', 
                 'S_3', 'D_71', 'B_42', 'R_16', 'P_2', 'D_44', 'D_145', 'D_122', 'B_4', 'D_55', 'B_15', 'D_104', 
                 'D_62', 'S_6', 'B_5', 'D_94', 'D_72', 'S_11', 'D_127', 'D_142', 'D_143', 'D_76', 'B_41', 'S_18',
                 'D_60', 'R_3', 'R_11', 'B_7', 'D_118', 'D_51', 'R_25', 'B_2', 'D_42', 'B_22', 'D_47', 'D_43', 
                 'B_12','D_41', 'D_144', 'R_14', 'S_17', 'B_23', 'D_56', 'S_22', 'B_10', 'B_3', 'B_1', 'B_28', 
                 'B_6', 'D_131', 'S_24', 'D_46', 'D_58', 'D_53', 'S_7']
    
    test_num_agg_mean = train.groupby(by=['customer_ID'])[mean_cols].mean()
    test_num_agg_mean.columns = [x+'_mean' for x in test_num_agg_mean.columns]
    
    first_cols = ['D_124', 'D_54', 'R_3', 'D_135', 'D_60', 'B_6', 'D_74', 'S_25', 'D_102', 'D_145', 'R_28', 'B_42',
                  'R_25', 'S_22', 'S_13', 'D_93', 'D_65', 'D_141', 'D_55', 'D_119', 'P_2', 'B_20', 'R_14', 'D_75',
                  'B_9', 'D_94', 'S_26', 'D_127', 'D_42', 'D_123', 'D_52', 'B_39', 'R_8', 'D_88', 'D_45', 'P_3', 
                  'B_19', 'D_76', 'D_105', 'D_92', 'D_78', 'B_17', 'R_9', 'D_62', 'D_134', 'D_56', 'B_16', 'D_118',
                  'D_82', 'D_110', 'D_142', 'D_121', 'D_47', 'D_91', 'D_61', 'D_53', 'D_46', 'D_51', 'D_132', 'S_8',
                  'D_96','B_5', 'D_50', 'SDist', 'D_41', 'D_144', 'D_48', 'R_12', 'B_18', 'B_8', 'B_3', 'S_5', 
                  'D_112', 'D_58', 'D_69', 'B_15']
    
    test_num_agg_first = train.groupby(by=['customer_ID'])[first_cols].first()
    test_num_agg_first.columns = [x+'_first' for x in test_num_agg_first.columns]
    
    test_num_agg = pd.concat([test_num_agg_quantile,test_num_agg_mean,test_num_agg_last,
                            test_num_agg_first,test_num_agg_max,test_num_agg_std,test_num_agg_min],axis=1)
    
    del test_num_agg_quantile,test_num_agg_mean,test_num_agg_last,test_num_agg_first,test_num_agg_max,test_num_agg_std,test_num_agg_min
    
    cols = list(test_num_agg.dtypes[test_num_agg.dtypes == 'float64'].index)
    test_num_agg.loc[:,cols] = test_num_agg.loc[:,cols].progress_apply(lambda x: x.astype(np.float32))
    
    test_cat_agg = train.groupby("customer_ID")[cat_features].agg(['last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    
    # Transform int64 columns to int32
    cols = list(test_cat_agg.dtypes[test_cat_agg.dtypes == 'int64'].index)
    test_cat_agg.loc[:,cols] = test_cat_agg.loc[:,cols].progress_apply(lambda x: x.astype(np.int32))
    
    #add last statement date, statements count and "new customer" category (LT=0.5)
    test_date_agg = train.groupby("customer_ID")[["S_2","B_3","D_104"]].agg(['last','count'])
    test_date_agg.columns = ['_'.join(x) for x in test_date_agg.columns]
    test_date_agg.rename(columns = {'S_2_count':'LT'}, inplace = True)
    test_date_agg.loc[(test_date_agg.B_3_last.isnull()) & (test_date_agg.LT==1),'LT'] = 0.5
    test_date_agg.loc[(test_date_agg.D_104_last.isnull()) & (test_date_agg.LT==1),'LT'] = 0.5
    test_date_agg.drop(["B_3_last","D_104_last","B_3_count","D_104_count",'S_2_last'], axis=1, inplace = True)
    
    train= pd.concat([test_cat_agg,test_s2_agg,test_num_agg2,test_date_agg,test_num_agg],axis=1)
    del test_cat_agg,test_s2_agg,test_num_agg2,test_date_agg,test_num_agg
    
    print('shape after engineering1', train.shape )
    
    print('Aggregation Done')
                  
    ####################################################################################              
    
    cat_features = [f"{cf}_last" for cf in cat_features]
    
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col]) 
        
    print('Encoding done')
                  
    #######################################################################################              
        
    last_round_cols = ['D_42_last', 'B_17_last', 'B_3_last', 'D_62_last', 'B_18_last', 'R_12_last', 
                       'D_133_last', 'D_46_last', 'B_2_last', 'S_12_last', 'B_11_last', 'D_58_last',
                       'D_43_last', 'B_15_last', 'SDist_last', 'B_10_last', 'B_1_last', 'S_25_last', 
                       'D_112_last', 'S_3_last', 'R_1_last', 'D_47_last', 'D_50_last', 'D_55_last', 
                       'D_61_last', 'B_24_last', 'R_14_last', 'D_76_last', 'B_5_last', 'S_23_last', 
                       'D_48_last', 'D_45_last', 'B_42_last', 'B_6_last', 'D_110_last', 'S_5_last', 
                       'S_16_last', 'R_7_last', 'R_27_last', 'B_23_last', 'D_60_last', 'B_9_last', 
                       'B_28_last', 'D_88_last', 'D_69_last', 'D_121_last', 'S_24_last', 'D_56_last',
                       'D_119_last', 'D_41_last', 'B_39_last', 'D_52_last']
    
    for col in last_round_cols:
        train[col + '_round2'] = train[col].round(2)
        
    print('Rounding done') 
                  
    #####################################################################################              
    
    train['P_2_max-P_2_min'] = train['P_2_max']-train['P_2_min']
    train['P_2_max-P_2_min'] = train['P_2_max-P_2_min'].fillna(0) 
    
    train['D_44_last-B_37_last'] = train['D_44_last']-train['B_37_last']
    train['D_44_last-B_37_last'] = train['D_44_last-B_37_last'].fillna(0)
    
    train['D_44_last-P_2_last'] = train['D_44_last']-train['P_2_last']
    train['D_44_last-P_2_last'] = train['D_44_last-P_2_last'].fillna(0)
    
    train['D_44_last-P_3_last'] = train['D_44_last']-train['P_3_last']
    train['D_44_last-P_3_last'] = train['D_44_last-P_3_last'].fillna(0)
    
    train['B_23_last-P_2_last'] = train['B_23_last']-train['P_2_last']
    train['B_23_last-P_2_last'] = train['B_23_last-P_2_last'].fillna(0)
    
    train['B_23_last-P_3_last'] = train['B_23_last']-train['P_3_last']
    train['B_23_last-P_3_last'] = train['B_23_last-P_3_last'].fillna(0)
    
    train['P_3_last-D_131_last'] = train['P_3_last']-train['D_131_last']
    train['P_3_last-D_131_last'] = train['P_3_last-D_131_last'].fillna(0)
    
    train['P_2_last-D_131_last'] = train['P_2_last']-train['D_131_last']
    train['P_2_last-D_131_last'] = train['P_2_last-D_131_last'].fillna(0)
    
    train['B_17_last-P_2_last'] = train['B_17_last']-train['P_2_last']
    train['B_17_last-P_2_last'] = train['B_17_last-P_2_last'].fillna(0)
    
    train['B_17_last-P_3_last'] = train['B_17_last']-train['P_3_last']
    train['B_17_last-P_3_last'] = train['B_17_last-P_3_last'].fillna(0)
    
    train['B_14_last-P_2_last'] = train['B_14_last']-train['P_2_last']
    train['B_14_last-P_2_last'] = train['B_14_last-P_2_last'].fillna(0)
    
    train['B_11_last-P_2_last'] = train['B_11_last']-train['P_2_last']
    train['B_11_last-P_2_last'] = train['B_11_last-P_2_last'].fillna(0)
    
    train['B_14_last-P_3_last'] = train['B_14_last']-train['P_3_last']
    train['B_14_last-P_3_last'] = train['B_14_last-P_3_last'].fillna(0)
    
    train['B_11_last-P_3_last'] = train['B_11_last']-train['P_3_last']
    train['B_11_last-P_3_last'] = train['B_11_last-P_3_last'].fillna(0)
    
    train['B_2_last-P_2_last'] = train['B_2_last']-train['P_2_last']
    train['B_2_last-P_2_last'] = train['B_2_last-P_2_last'].fillna(0)
    
    train['B_2_last-P_3_last'] = train['B_2_last']-train['P_3_last']
    train['B_2_last-P_3_last'] = train['B_2_last-P_3_last'].fillna(0)
    
    train['D_48_last-B_2_last'] = train['D_48_last']-train['B_2_last']
    train['D_48_last-B_2_last'] = train['D_48_last-B_2_last'].fillna(0)
    
    train['D_48_last-B_1_last'] = train['D_48_last']-train['B_1_last']
    train['D_48_last-B_1_last'] = train['D_48_last-B_1_last'].fillna(0)
    
    train['D_42_last-P_2_last'] = train['D_42_last']-train['P_2_last']
    train['D_42_last-P_2_last'] = train['D_42_last-P_2_last'].fillna(0)
    
    train['D_42_last-P_3_last'] = train['D_42_last']-train['P_3_last']
    train['D_42_last-P_3_last'] = train['D_42_last-P_3_last'].fillna(0)
    
    train['D_39_last-P_2_last'] = train['D_39_last']-train['P_2_last']
    train['D_39_last-P_2_last'] = train['D_39_last-P_2_last'].fillna(0)
    
    train['D_39_last-P_3_last'] = train['D_39_last']-train['P_3_last']
    train['D_39_last-P_3_last'] = train['D_39_last-P_3_last'].fillna(0)
    
    train['B_9_last-P_2_last'] = train['B_9_last']-train['P_2_last']
    train['B_9_last-P_2_last'] = train['B_9_last-P_2_last'].fillna(0)
    
    train['B_9_last-P_3_last'] = train['B_9_last']-train['P_3_last']
    train['B_9_last-P_3_last'] = train['B_9_last-P_3_last'].fillna(0)
    
    train['B_4_last-D_62_last'] = train['B_4_last']-train['D_62_last']
    train['B_4_last-D_62_last'] = train['B_4_last-D_62_last'].fillna(0)
    
    train['P_3_last-S_23_last'] = train['P_3_last']-train['S_23_last']
    train['P_3_last-S_23_last'] = train['P_3_last-S_23_last'].fillna(0)
    
    train['P_2_last-S_23_last'] = train['P_2_last']-train['S_23_last']
    train['P_2_last-S_23_last'] = train['P_2_last-S_23_last'].fillna(0)
    
    train['P_3_last-S_16_last'] = train['P_3_last']-train['S_16_last']
    train['P_3_last-S_16_last'] = train['P_3_last-S_16_last'].fillna(0)
    
    train['P_2_last-S_16_last'] = train['P_2_last']-train['S_16_last']
    train['P_2_last-S_16_last'] = train['P_2_last-S_16_last'].fillna(0)
    
    print('Miscellaneous features Done')
                  
    ########################################################################              
    
    ####### Last-First ########
    
    train['S_5_last_first_diff'] = train['S_5_last'] - train['S_5_first']
    train['S_5_last_first_diff'] = train['S_5_last_first_diff'].fillna(0)
    
    train['R_14_last_first_diff'] = train['R_14_last'] - train['R_14_first']
    train['R_14_last_first_diff'] = train['R_14_last_first_diff'].fillna(0)
        
    train['SDist_last_first_diff'] = train['SDist_last'] - train['SDist_first']
    train['SDist_last_first_diff'] = train['SDist_last_first_diff'].fillna(0)
        
    train['D_112_last_first_diff'] = train['D_112_last'] - train['D_112_first']
    train['D_112_last_first_diff'] = train['D_112_last_first_diff'].fillna(0)
        
    train['B_3_last_first_diff'] = train['B_3_last'] - train['B_3_first']
    train['B_3_last_first_diff'] = train['B_3_last_first_diff'].fillna(0)
        
    train['B_9_last_first_diff'] = train['B_9_last'] - train['B_9_first']
    train['B_9_last_first_diff'] = train['B_9_last_first_diff'].fillna(0)  
                  
    train['D_144_last_first_diff'] = train['D_144_last'] - train['D_144_first']
    train['D_144_last_first_diff'] = train['D_144_last_first_diff'].fillna(0)  
                  
    train['B_15_last_first_diff'] = train['B_15_last'] - train['B_15_first']
    train['B_15_last_first_diff'] = train['B_15_last_first_diff'].fillna(0)       
      
    train['B_8_last_first_diff'] = train['B_8_last'] - train['B_8_first']
    train['B_8_last_first_diff'] = train['B_8_last_first_diff'].fillna(0)
                  
    train['D_102_last_first_diff'] = train['D_102_last'] - train['D_102_first']
    train['D_102_last_first_diff'] = train['D_102_last_first_diff'].fillna(0)  
                  
    train['D_41_last_first_diff'] = train['D_41_last'] - train['D_41_first']
    train['D_41_last_first_diff'] = train['D_41_last_first_diff'].fillna(0)  
                  
    train['D_48_last_first_diff'] = train['D_48_last'] - train['D_48_first']
    train['D_48_last_first_diff'] = train['D_48_last_first_diff'].fillna(0)  
                  
    train['D_69_last_first_diff'] = train['D_69_last'] - train['D_69_first']
    train['D_69_last_first_diff'] = train['D_69_last_first_diff'].fillna(0)  
                  
    train['R_12_last_first_diff'] = train['R_12_last'] - train['R_12_first']
    train['R_12_last_first_diff'] = train['R_12_last_first_diff'].fillna(0)  
                  
    train['D_50_last_first_diff'] = train['D_50_last'] - train['D_50_first']
    train['D_50_last_first_diff'] = train['D_50_last_first_diff'].fillna(0)  
                  
    train['B_17_last_first_diff'] = train['B_17_last'] - train['B_17_first']
    train['B_17_last_first_diff'] = train['B_17_last_first_diff'].fillna(0)  
                  
    train['D_141_last_first_diff'] = train['D_141_last'] - train['D_141_first']       
    train['D_141_last_first_diff'] = train['D_141_last_first_diff'].fillna(0)  
                  
    train['D_88_last_first_diff'] = train['D_88_last'] - train['D_88_first']
    train['D_88_last_first_diff'] = train['D_88_last_first_diff'].fillna(0)  
                  
    train['P_3_last_first_diff'] = train['P_3_last'] - train['P_3_first']
    train['P_3_last_first_diff'] = train['P_3_last_first_diff'].fillna(0)  
                  
    train['B_18_last_first_diff'] = train['B_18_last'] - train['B_18_first']
    train['B_18_last_first_diff'] = train['B_18_last_first_diff'].fillna(0)  
                  
    train['D_55_last_first_diff'] = train['D_55_last'] - train['D_55_first']
    train['D_55_last_first_diff'] = train['D_55_last_first_diff'].fillna(0)  
                  
    train['D_58_last_first_diff'] = train['D_58_last'] - train['D_58_first']
    train['D_58_last_first_diff'] = train['D_58_last_first_diff'].fillna(0)  
                  
    train['B_5_last_first_diff'] = train['B_5_last'] - train['B_5_first']
    train['B_5_last_first_diff'] = train['B_5_last_first_diff'].fillna(0)  
    
    print('Last-First Features Done')
                  
    ##################################################################################              
                  
    ####### last-mean #######        
                  
    train['D_58_last_mean_diff'] = train['D_58_last'] - train['D_58_mean']
    train['D_58_last_mean_diff'] = train['D_58_last_mean_diff'].fillna(0)
                  
    train['B_7_last_mean_diff'] = train['B_7_last'] - train['B_7_mean']
    train['B_7_last_mean_diff'] = train['B_7_last_mean_diff'].fillna(0)
                  
    train['B_14_last_mean_diff'] = train['B_14_last'] - train['B_14_mean']
    train['B_14_last_mean_diff'] = train['B_14_last_mean_diff'].fillna(0)
                  
    train['D_41_last_mean_diff'] = train['D_41_last'] - train['D_41_mean']
    train['D_41_last_mean_diff'] = train['D_41_last_mean_diff'].fillna(0)
                  
    train['D_73_last_mean_diff'] = train['D_73_last'] - train['D_73_mean']
    train['D_73_last_mean_diff'] = train['D_73_last_mean_diff'].fillna(0)
                  
    train['B_1_last_mean_diff'] = train['B_1_last'] - train['B_1_mean']
    train['B_1_last_mean_diff'] = train['B_1_last_mean_diff'].fillna(0)
                  
    train['D_55_last_mean_diff'] = train['D_55_last'] - train['D_55_mean']
    train['D_55_last_mean_diff'] = train['D_55_last_mean_diff'].fillna(0)
      
    train['D_48_last_mean_diff'] = train['D_48_last'] - train['D_48_mean']
    train['D_48_last_mean_diff'] = train['D_48_last_mean_diff'].fillna(0)
                  
    train['S_3_last_mean_diff'] = train['S_3_last'] - train['S_3_mean']
    train['S_3_last_mean_diff'] = train['S_3_last_mean_diff'].fillna(0)
                  
    train['R_14_last_mean_diff'] = train['R_14_last'] - train['R_14_mean']
    train['R_14_last_mean_diff'] = train['R_14_last_mean_diff'].fillna(0)
                  
    train['B_15_last_mean_diff'] = train['B_15_last'] - train['B_15_mean']
    train['B_15_last_mean_diff'] = train['B_15_last_mean_diff'].fillna(0)
                  
    train['B_23_last_mean_diff'] = train['B_23_last'] - train['B_23_mean']
    train['B_23_last_mean_diff'] = train['B_23_last_mean_diff'].fillna(0)
                  
    train['D_46_last_mean_diff'] = train['D_46_last'] - train['D_46_mean']
    train['D_46_last_mean_diff'] = train['D_46_last_mean_diff'].fillna(0)
                  
    train['B_9_last_mean_diff'] = train['B_9_last'] - train['B_9_mean']
    train['B_9_last_mean_diff'] = train['B_9_last_mean_diff'].fillna(0)
                  
    train['B_5_last_mean_diff'] = train['B_5_last'] - train['B_5_mean']
    train['B_5_last_mean_diff'] = train['B_5_last_mean_diff'].fillna(0)
                  
    train['D_131_last_mean_diff'] = train['D_131_last'] - train['D_131_mean']
    train['D_131_last_mean_diff'] = train['D_131_last_mean_diff'].fillna(0)
                  
    train['SDist_last_mean_diff'] = train['SDist_last'] - train['SDist_mean']
    train['SDist_last_mean_diff'] = train['SDist_last_mean_diff'].fillna(0)
                  
    train['S_12_last_mean_diff'] = train['S_12_last'] - train['S_12_mean']
    train['S_12_last_mean_diff'] = train['S_12_last_mean_diff'].fillna(0)
                  
    train['S_17_last_mean_diff'] = train['S_17_last'] - train['S_17_mean']
    train['S_17_last_mean_diff'] = train['S_17_last_mean_diff'].fillna(0)
                  
    train['D_121_last_mean_diff'] = train['D_121_last'] - train['D_121_mean']
    train['D_121_last_mean_diff'] = train['D_121_last_mean_diff'].fillna(0)
                  
    train['D_110_last_mean_diff'] = train['D_110_last'] - train['D_110_mean']
    train['D_110_last_mean_diff'] = train['D_110_last_mean_diff'].fillna(0)
                  
    train['D_141_last_mean_diff'] = train['D_141_last'] - train['D_141_mean']
    train['D_141_last_mean_diff'] = train['D_141_last_mean_diff'].fillna(0)
                  
    train['D_144_last_mean_diff'] = train['D_144_last'] - train['D_144_mean']
    train['D_144_last_mean_diff'] = train['D_144_last_mean_diff'].fillna(0)
                  
    train['S_24_last_mean_diff'] = train['S_24_last'] - train['S_24_mean']
    train['S_24_last_mean_diff'] = train['S_24_last_mean_diff'].fillna(0)
                  
    train['D_50_last_mean_diff'] = train['D_50_last'] - train['D_50_mean']
    train['D_50_last_mean_diff'] = train['D_50_last_mean_diff'].fillna(0)
                  
    train['B_6_last_mean_diff'] = train['B_6_last'] - train['B_6_mean']
    train['B_6_last_mean_diff'] = train['B_6_last_mean_diff'].fillna(0)
                  
    train['B_10_last_mean_diff'] = train['B_10_last'] - train['B_10_mean']
    train['B_10_last_mean_diff'] = train['B_10_last_mean_diff'].fillna(0)
                  
    train['D_53_last_mean_diff'] = train['D_53_last'] - train['D_53_mean']
    train['D_53_last_mean_diff'] = train['D_53_last_mean_diff'].fillna(0)
                  
    train['B_28_last_mean_diff'] = train['B_28_last'] - train['B_28_mean']
    train['B_28_last_mean_diff'] = train['B_28_last_mean_diff'].fillna(0)
                  
    train['S_22_last_mean_diff'] = train['S_22_last'] - train['S_22_mean']
    train['S_22_last_mean_diff'] = train['S_22_last_mean_diff'].fillna(0)
                  
    train['B_3_last_mean_diff'] = train['B_3_last'] - train['B_3_mean']
    train['B_3_last_mean_diff'] = train['B_3_last_mean_diff'].fillna(0)
                  
    train['D_56_last_mean_diff'] = train['D_56_last'] - train['D_56_mean']
    train['D_56_last_mean_diff'] = train['D_56_last_mean_diff'].fillna(0)
                  
    train['D_130_last_mean_diff'] = train['D_130_last'] - train['D_130_mean']
    train['D_130_last_mean_diff'] = train['D_130_last_mean_diff'].fillna(0)
                  
    train['S_7_last_mean_diff'] = train['S_7_last'] - train['S_7_mean']
    train['S_7_last_mean_diff'] = train['S_7_last_mean_diff'].fillna(0)    
    
    print('Last-Mean Features Done')
                       
    ###################################################################################
                  
    Final_drop = ['R_14_last', 'S_16_last', 'B_28_last', 'D_58_last', 'D_48_last', 'D_55_last', 'D_88_last', 
             'D_69_last', 'S_5_last','D_44_last','P_3_last','D_131_last','B_37_last','D_144_last',
             'D_102_last','B_8_last','S_17_last','S_22_last','D_130_last','D_73_last','B_5_first', 'D_50_first', 
             'SDist_first', 'D_41_first', 'D_144_first','P_2_min','P_2_max',
             'D_48_first', 'R_12_first','B_18_first', 'B_8_first', 'B_3_first', 'S_5_first', 'D_112_first', 
             'D_58_first', 'D_69_first', 'B_15_first','D_41_mean', 'D_144_mean', 'R_14_mean', 'S_17_mean', 
            'B_23_mean', 'D_56_mean', 'S_22_mean', 'B_10_mean', 'B_3_mean', 'B_1_mean', 'B_28_mean', 'B_6_mean',
            'D_131_mean', 'S_24_mean', 'D_46_mean', 'D_58_mean', 'D_53_mean', 'S_7_mean',
           "B_30_last","D_114_nunique","D_117_nunique","D_126_last","D_64_nunique"]    
    
    train.drop(Final_drop,axis=1,inplace=True) 
    
    print("Dropping Features Done")
                  
    ###################################################################################  
    
    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    train.loc[:,num_cols] = train.loc[:,num_cols].progress_apply(lambda x: x.astype(np.float16))
    
    nan_col = ['D_50_mean', 'D_50_last', 'S_23_std', 'S_23_last']
    train[nan_col] = train[nan_col].fillna(-32783)
    train = train.fillna(0)
    
    ## Finally add NaN counts from earlier
    train["total_data_count"] = nan_sum
    train["total_data_last"] = nan_last

    train = train.loc[:,~train.columns.duplicated()]
    
    print('featuring2 done')
    
    print(f'Final shape after engineering{train.shape}')
    
    return train 

In [6]:
%%time
# Build the engineered training frame from the integer-dtype parquet dump.
# The customers are split into TRAIN_NUM_PARTS parts that are processed one at
# a time; `verbose='train'` only labels the progress messages.
# NOTE(review): is_train=True presumably enables the target merge reported as
# "Targetslice Merged" in the log — confirm in getAndProcessDataInChunks.
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = getAndProcessDataInChunks(TRAIN_PATH, is_train=True, NUM_PARTS=TRAIN_NUM_PARTS, verbose='train')

Reading customer_IDs from train data..
shape of data: (5531451, 2)
We will process train data as 10 separate parts.
There will be 45891 customers in each part (except the last part).
Below are number of rows in each part:
[553403, 552855, 554025, 554330, 552004, 552378, 552822, 553151, 553493, 552990]

Reading train data...
shape of data: (5531451, 190)
=> train part 1 has shape (553403, 190)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pyd

Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 2 has shape (552855, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 3 has shape (554025, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 4 has shape (554330, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 5 has shape (552004, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 6 has shape (552378, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 7 has shape (552822, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 8 has shape (553151, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 9 has shape (553493, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45891, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45891, 770)
|...
 ...|
Targetslice Merged
Appended
=> train part 10 has shape (552990, 190)
Delta features Done


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

shape after engineering1 (45894, 691)
Aggregation Done
Encoding done
Rounding done
Miscellaneous features Done
Last-First Features Done
Last-Mean Features Done
Dropping Features Done


  0%|          | 0/606 [00:00<?, ?it/s]

featuring2 done
Final shape after engineering(45894, 770)
|...
 ...|
Targetslice Merged
Appended
...|
CPU times: user 7min 23s, sys: 2min 41s, total: 10min 5s
Wall time: 9min 59s


In [7]:
# Persist the engineered training frame to Feather in the working directory so
# it can be reloaded without repeating the feature-engineering cell.
train.to_feather('770_FE_train.feather')

In [8]:
# Sanity check: (rows, columns) of the assembled training frame.
train.shape

(458913, 772)

In [9]:
# Preview 10 random rows of the engineered frame. Seeding with the notebook-wide
# SEED (defined in the config cell) makes the preview identical on every
# Restart & Run All instead of changing from run to run.
train.sample(10, random_state=SEED)

Unnamed: 0,customer_ID,B_30_nunique,B_38_last,B_38_nunique,D_114_last,D_117_last,D_120_last,D_120_nunique,D_66_last,D_66_nunique,...,D_53_last_mean_diff,B_28_last_mean_diff,S_22_last_mean_diff,B_3_last_mean_diff,D_56_last_mean_diff,D_130_last_mean_diff,S_7_last_mean_diff,total_data_count,total_data_last,target
61804,-6757709528909724663,1,1,2,1,5,1,1,0,1,...,0.0,-0.010345,-0.012169,0.01107,0.0,0.0,-0.034821,2244.0,173.0,0
76284,-6174501664152935909,1,3,2,2,3,1,1,0,1,...,-0.00441,-0.03064,-0.023834,0.047394,0.0,-0.004875,-0.102966,2302.0,174.0,0
30495,-8009497906596940905,1,2,2,2,1,1,1,0,1,...,0.0,0.017242,0.002497,-0.000922,0.0233,0.0,-0.029007,2266.0,174.0,0
149592,-3215322069827706300,1,2,2,1,5,1,2,0,1,...,0.0,-0.011002,-0.04718,0.000329,0.0,0.0,0.042725,2205.0,174.0,0
388557,6391275172642931414,1,1,2,2,5,1,1,0,1,...,0.0,-0.060181,0.000299,0.003267,0.0,0.0,0.0,2210.0,170.0,0
450148,8867876255755754957,1,2,1,2,1,1,1,0,1,...,0.0,-0.011436,-0.001001,0.002035,0.001773,0.0,-0.008743,2241.0,171.0,0
82841,-5904627147170729622,2,5,4,1,1,1,2,0,1,...,0.0,0.002699,0.003777,0.00766,0.0,-0.000429,0.026031,1008.0,176.0,0
315194,3448257728563060967,2,4,2,1,3,2,2,0,1,...,0.0,0.007362,0.038879,0.125854,0.0,0.0,0.104126,1012.0,175.0,0
346476,4703376064554275968,1,1,1,2,1,1,1,0,1,...,0.0,-0.096008,0.002462,-0.003256,0.0,0.0,0.0,2210.0,170.0,0
434834,8248329599985662910,1,1,1,1,4,1,1,0,1,...,0.0,-0.002655,0.0006,-0.001655,0.0,0.0,0.0,2126.0,163.0,0


In [10]:
# Summarize index, column count, per-dtype column counts and memory usage of
# the final training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 772 entries, customer_ID to target
dtypes: float16(606), float64(2), int16(20), int32(6), int64(8), int8(130)
memory usage: 650.4 MB
