In [1]:
import numpy   as np # linear algebra
import pandas  as pd # data processing
import os.path as op # file system access
import os      as os
import gc      as gc
import time    as ti

# Data Loading and Cleanup

In [2]:
def compress(df, verbose = True) :

    smu = df.memory_usage().sum() / 1024**2 / 8
    con = {'f' : {                                   np.finfo(np.float16).max : np.float16, np.finfo(np.float32).max : np.float32, np.finfo(np.float64).max : np.float64},
           'u' : {np.iinfo(np.uint8).max : np.uint8, np.iinfo(np.uint16).max  : np.uint16,  np.iinfo(np.uint32).max  : np.uint32,  np.iinfo(np.uint64).max  : np.uint64},
           'i' : {np.iinfo(np.int8).max  : np.int8,  np.iinfo(np.int16).max   : np.int16,   np.iinfo(np.int32).max   : np.int32,   np.iinfo(np.int64).max   : np.int64}}

    for c in df.columns :
        if  con.get(df[c].dtype.kind) :
            df[c] = df[c].astype(con[df[c].dtype.kind].get(min((n for n in con[df[c].dtype.kind].keys() if n > max(df[c].max(), abs(df[c].min()))))))

    emu = df.memory_usage().sum() / 1024**2 / 8

    if  verbose :
        print(f'Memory Use Decreased to {emu:5.2f} MB [{100 * (smu - emu) / emu:5.1f}% Reduction]')

    return df, 100 * (smu - emu) / emu

def read(csv_path, dates = [], brize = [], dummy = [], delna = False, index = None, repkl = False) :
    
    pkl_path = csv_path.replace('.csv', '.pkl').replace('../input', '../pickle')
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not repkl :

        df       = pd.read_pickle(pkl_path)
        csv_path = pkl_path
        rp       = 0.0

    else                    :

        df = pd.read_csv(csv_path, parse_dates = dates, memory_map = True)
        
        if  index :
            df = df.set_index(index)
            
        if  delna :
            df = df.dropna()

        for c in brize :
            df[c] = df[c].eq('Y').mul(1)
    
        if  dummy :
            df = pd.get_dummies(df, columns = dummy)
        
        try    : df, rp = compress(df, verbose = False)
        except :     rp = -1

        os.makedirs('../pickle', exist_ok = True)
        df.to_pickle(pkl_path)

    print(f'Loaded : {op.basename(csv_path):>29} in {ti.time()-srt_time:5.1f} Seconds, Shape is {str(df.shape):>14}, Memory Usage is {df.memory_usage().sum() / 1024**2 / 8:6.2f} MB [Reduction of {rp:5.1f} %].')

    return df

def load() :

    data           = {}

    data['train' ] = read('../input/train.csv', dates = ['first_active_month'],
                                                delna = True,
                                                index = 'card_id',
                                                repkl = True)
    data['test'  ] = read('../input/test.csv',  dates = ['first_active_month'],
                                                delna = True,
                                                index = 'card_id',
                                                repkl = True)

    data['mercs' ] = read('../input/merchants.csv', index = 'merchant_id',
                                                    repkl = True)

    data['txnew' ] = read('../input/new_merchant_transactions.csv', dates = ['purchase_date'],
                                                                    brize = ['authorized_flag', 'category_1'],
                                                                    dummy = ['category_2', 'category_3'],
                                                                    index = None)
    data['txold' ] = read('../input/historical_transactions.csv',   dates = ['purchase_date'],
                                                                    brize = ['authorized_flag', 'category_1'],
                                                                    dummy = ['category_2', 'category_3'],
                                                                    index = None)

    data['target'] = data['train'].pop('target')

    return data


In [3]:
data = load()

Loaded :                     train.csv in   0.7 Seconds, Shape is    (201917, 5), Memory Usage is   0.51 MB [Reduction of 128.6 %].
Loaded :                      test.csv in   0.3 Seconds, Shape is    (123622, 4), Memory Usage is   0.28 MB [Reduction of 110.5 %].
Loaded :                 merchants.csv in   4.0 Seconds, Shape is   (334696, 21), Memory Usage is   5.71 MB [Reduction of  -1.0 %].
Loaded : new_merchant_transactions.pkl in   0.5 Seconds, Shape is  (1963031, 20), Memory Usage is  10.53 MB [Reduction of   0.0 %].
Loaded :   historical_transactions.pkl in   5.6 Seconds, Shape is (29112361, 20), Memory Usage is 163.11 MB [Reduction of   0.0 %].


# Feature Engineering

In [4]:
def draft(data) :

    np.random.seed(0)
    
    plan                           = {}
    plan['first_active_month_max'] = max(data['train'].first_active_month.max(),
                                         data['test' ].first_active_month.max())
    plan['purchase_date_max'     ] = max(data['txold'].purchase_date.max(),
                                         data['txnew'].purchase_date.max())
    plan['purchase_date_ref'     ] =     data['txold'].purchase_date.max()
    plan['train_size'            ] = len(data['train'])
    plan['train_pcnt'            ] = 0.8
    plan['train_mask'            ] = np.random.rand(plan['train_size']) < plan['train_pcnt']
    plan['devel_mask'            ] =               ~plan['train_mask']

    data['y_train'               ] = data['target'][plan['train_mask']]
    data['y_devel'               ] = data['target'][plan['devel_mask']]
    
    return data, plan

In [5]:
data,\
plan = draft(data)

In [6]:
def engineer_transactions(tf, prefix, repkl = False) :

    pkl_path = f'../pickle/engineered_{prefix}_transactions.pkl'
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not repkl :

        df = pd.read_pickle(pkl_path)
        
    else :

      # recover purchase history by denormolizing
        tf['month_diff'    ] = (plan['purchase_date_ref'] - tf['purchase_date']) // np.timedelta64(1, 'M') \
                             + tf['month_lag']

      # extract purchase_month from date
        tf['purchase_month'] = tf['purchase_date'].dt.month

      # convert datetime to numerical
        tf['purchase_ndate'] = tf['purchase_date'].astype(np.int64) * 1e-9

        aggregations = \
        {
            'category_1'           : ['mean', 'sum'],

            'category_2_1.0'       : ['mean'],
            'category_2_2.0'       : ['mean'],
            'category_2_3.0'       : ['mean'],
            'category_2_4.0'       : ['mean'],
            'category_2_5.0'       : ['mean'],
            'category_3_A'         : ['mean'],
            'category_3_B'         : ['mean'],
            'category_3_C'         : ['mean'],

            'merchant_id'          : ['nunique'],
            'merchant_category_id' : ['nunique'],
            'state_id'             : ['nunique'],
            'city_id'              : ['nunique'],
            'subsector_id'         : ['nunique'],

            'purchase_amount'      : ['mean', 'sum', 'max', 'min', 'std'],
            'installments'         : ['mean', 'sum', 'max', 'min', 'std'],
            'purchase_month'       : ['mean',        'max', 'min', 'std'],
            'purchase_ndate'       : [np.ptp,        'max', 'min'       ],
            'month_lag'            : ['mean',        'max', 'min', 'std'],
            'month_diff'           : ['mean'                            ]
        }

        aggregations_kev = \
        {
            'purchase_amount' : ['sum', 'mean', 'median', 'min', 'max', 'std'],
            'subsector_id'    : ['nunique']
        }

      # add aggregations
        df = tf.groupby(['card_id']).agg(aggregations)
        df.columns = ['_'.join((prefix,) + c) for c in df.columns.values]
        df.reset_index(inplace = True)

      # add transaction count
        tc = tf.groupby('card_id').size().reset_index(name = f'{prefix}_transaction_count')
        df = pd.merge(tc, df, on = 'card_id', how = 'left')
        
        os.makedirs('../pickle', exist_ok = True)
        df.to_pickle(pkl_path)
    
    print(f'Engineered : {prefix:>5} Transactions in {ti.time()-srt_time:5.1f} Seconds.')
    
    return df

In [7]:
def engineer_cards(df, aggs, plan, prefix) :

    pkl_path = f'../pickle/engineered_{prefix}_cards.pkl'
    srt_time = ti.time()
    
    if  op.exists(pkl_path) and not repkl :

        df = pd.read_pickle(pkl_path)

    else :
    
      # convert categorical variables to dummy/indicator, preserve original categorical variable
        df = pd.concat([df, pd.get_dummies(df[['feature_1', 'feature_2']],
                                    columns = ['feature_1', 'feature_2'])], axis = 1)

      # normalized active days of card
        df['active_days'] = (plan['first_active_month_max'] - df['first_active_month']).dt.days

      # add purchase amounts of card
      # purchase_amount_old = data['txold'].groupby(['card_id'])['purchase_amount'].sum().to_frame('old_purchase_amount')
      # purchase_amount_new = data['txold'].groupby(['card_id'])['purchase_amount'].sum().to_frame('new_purchase_amount')

      # df = df.join(purchase_amount_old[purchase_amount_old.index.isin(df.index)]).fillna(0)
      # df = df.join(purchase_amount_new[purchase_amount_new.index.isin(df.index)]).fillna(0)

        if  'txnew' in aggs :
            df = pd.merge( df, aggs['txnew'], on = 'card_id', how = 'left')

        if  'txold' in aggs : 
            df = pd.merge( df, aggs['txold'], on = 'card_id', how = 'left')

        os.makedirs('../pickle', exist_ok = True)
        df.to_pickle(pkl_path)
            
    print(f'Engineered : {prefix:>5} Cards        in {ti.time()-srt_time:5.1f} Seconds.')
        
    return df

In [8]:
def engineer(data, plan) :

    aggs          = {}
    aggs['txnew'] = engineer_transactions(data['txnew'], prefix = 'new')
    aggs['txold'] = engineer_transactions(data['txold'], prefix = 'old')

    feat          = {}
    feat['train'] = engineer_cards(data['train'], aggs, plan, prefix = 'train')
    feat['test' ] = engineer_cards(data['test' ], aggs, plan, prefix = 'test' )
  
    del aggs
    d = gc.collect()

    return feat

In [9]:
feat = engineer(data, plan)

Engineered :   new Transactions in   0.1 Seconds.
Engineered :   old Transactions in 133.3 Seconds.
Engineered : train Cards        in   1.8 Seconds.
Engineered :  test Cards        in   1.2 Seconds.


# Baseline

## Linear Regression Model

In [10]:
def regression(df, features, plan, baseline = False) :
    
    from sklearn.linear_model  import LinearRegression
    from sklearn.metrics       import mean_squared_error
    
    if  features == None :
        features  = df._get_numeric_data().columns

    x_train = df[features][plan['train_mask']].fillna(0)
    x_devel = df[features][plan['devel_mask']].fillna(0)

    model   = LinearRegression() \
                .fit(x_train, data['y_train'])

    y_pred  = model.predict(x_devel)
    
    if  baseline :
        plan['baseline_mse'] = mean_squared_error(data['y_devel'], y_pred)

    mse     = mean_squared_error(data['y_devel'], y_pred)
    gain    = (plan['baseline_mse'] - mse) / plan['baseline_mse'] * 100
    
    print(f"Linear Regression : Mean Squared Error is {mse:6.3f} [{gain:+6.3f}]")
    
    return plan

In [11]:
def regression_run(feat, plan) :

    regression(feat['train'], features = ['feature_1', 'feature_2', 'feature_3'], plan = plan, baseline = True ) # base features in train          - baseline
    regression(feat['train'], features = None,                                    plan = plan, baseline = False) # all numeric engineered features - kitchen sink

    return plan

plan = regression_run(feat, plan)

Linear Regression : Mean Squared Error is 14.269 [+0.000]
Linear Regression : Mean Squared Error is 14.811 [-3.803]


## Linear Lasso Model

In [24]:
def lasso(df, features, plan, alphas) :

    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import KFold
    from sklearn.linear_model    import Lasso
    from sklearn.metrics         import mean_squared_error

    if  features == None :
        features  = df._get_numeric_data().columns

    x_train = df[features][plan['train_mask']].fillna(0)
    x_devel = df[features][plan['devel_mask']].fillna(0)

    params  = [{'alpha' : alphas}]
    folds   = 5

    model   = Lasso(random_state = 0)
    grid    = GridSearchCV(model, params, cv = folds, scoring = 'neg_mean_squared_error') \
                .fit(x_train, data['y_train'])
    y_pred  = grid.best_estimator_.predict(x_devel)
    
    mse     = mean_squared_error(data['y_devel'], y_pred)
    gain    = (plan['baseline_mse'] - mse) / plan['baseline_mse'] * 100
    
    print(f"Lasso             : Mean Squared Error is {mse:6.3f} [{gain:+6.3f}] (Alpha = {grid.best_estimator_.alpha:8.5f})")

In [None]:
def lasso_run(feat, plan) :
    
    lasso(feat['train'], features = ['feature_1', 'feature_2', 'feature_3'], plan = plan, alphas = [0.00001, 0.001, 0.5, 10])
    lasso(feat['train'], features = ['feature_1', 'feature_2', 'feature_3'], plan = plan, alphas = np.logspace(-4, -0.5, 30))
    lasso(feat['train'], features = None,                                    plan = plan, alphas = [0.00001, 0.001, 0.5, 10])

    return plan

plan = lasso_run(feat, plan)

Lasso             : Mean Squared Error is 14.269 [-0.000] (Alpha =  0.00100)
Lasso             : Mean Squared Error is 14.269 [-0.001] (Alpha =  0.00281)


