<a href="https://colab.research.google.com/github/human-ai2025/Elo-Merchant-Recommendation/blob/master/Different_feature_joining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [None]:
#Import Libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import gc
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import datetime
import time
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import mode

### Mount the drive 

In [None]:
#Mounting drive 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Setup the current path 

In [None]:
!ls

drive  sample_data


In [None]:
#Setting up workspace directory
%cd /content/drive/MyDrive/data 

/content/drive/MyDrive/data


In [None]:
!ls

 augmented_test.csv			  new_merchant_df.pkl
 augmented_train.csv			  new_merchant_transactions.csv
'Data Dictionary.xlsx'			  sample_submission.csv
 Data_Dictionary.xlsx			  test.csv
 historical_transactions.csv		  train.csv
 historical_transactions_df.pkl		  transactions_raw_merged.csv
 merchants.csv				  transactions_refined_1_merged_WON.csv
 merged_transactions_with_merchants.csv


## Timer 

In [None]:
#refer:-https://www.youtube.com/watch?v=vOMtQ4ocMGI
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Args:
        title: label placed at the start of the printed message.

    The elapsed wall-clock time (whole seconds) is printed once the wrapped
    block finishes. Nothing is printed if the block raises, because the
    statement after ``yield`` is not reached in that case.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.000f}s".format(title, elapsed))

## Memory Reduction 

In [None]:
#https://www.kaggle.com/fabiendaniel/elo-world
#Function to load data into pandas and reduce memory usage

def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place and the
            same object is returned.
        verbose: when True, print the final memory usage and the percentage
            saved.
        use_float16: when True (the historical behaviour) float columns whose
            range fits are stored as float16. float16 keeps only ~3 significant
            decimal digits (0.1 is stored as 0.099976), so pass False to stop
            at float32 when precision matters.

    Returns:
        The same DataFrame with downcast numeric columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        # Only plain numpy numeric columns are touched; objects, datetimes,
        # categoricals etc. are left alone.
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the dtype limit still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range, or min/max is NaN/inf: keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## One hot encoder 

In [None]:
#Refer:-https://stackoverflow.com/questions/36631163/what-are-the-pros-and-cons-between-get-dummies-pandas-and-onehotencoder-sciki
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df``.

    Args:
        df: input DataFrame (not modified).
        nan_as_category: forwarded to ``pd.get_dummies`` as ``dummy_na``; when
            True an extra indicator column is produced for missing values.

    Returns:
        A tuple ``(encoded_df, new_columns)`` where ``new_columns`` lists the
        column labels present in the encoded frame but not in the input.
    """
    columns_before = list(df.columns)

    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)
    print("The columns on which one hot encoding is performed is ", object_columns)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)

    added_columns = []
    for name in encoded.columns:
        if name not in columns_before:
            added_columns.append(name)
    return encoded, added_columns

## FE on Train Test

In [None]:
def train_test(num_rows=None, reference_date=None, data_dir='/content/drive/MyDrive/data'):
    """Load train/test, stack them and build card-level features.

    Args:
        num_rows: optional row limit passed to ``pd.read_csv`` for both files.
        reference_date: date the "days since first active month" features are
            measured from. Defaults to today (the historical behaviour); pass a
            fixed date to make the features reproducible between runs.
        data_dir: directory that contains ``train.csv`` and ``test.csv``.

    Returns:
        A single DataFrame indexed by ``card_id`` holding the train rows
        followed by the test rows. Test rows have ``target`` set to NaN and no
        value for ``outliers``.
    """
    # load csv
    train_df = pd.read_csv('{}/train.csv'.format(data_dir), index_col=['card_id'], nrows=num_rows)
    test_df = pd.read_csv('{}/test.csv'.format(data_dir), index_col=['card_id'], nrows=num_rows)

    print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))

    # Outliers are the rows with an extremely low target. The cut-off has to
    # be negative: comparing against +30 would flag nearly every row and make
    # the outlier-rate encoding below constant.
    OUTLIER_THRESHOLD = -30
    train_df['outliers'] = (train_df['target'] < OUTLIER_THRESHOLD).astype(int)

    # The test set has no target; mark it as missing so both frames can be stacked.
    test_df['target'] = np.nan

    # DataFrame.append was removed in pandas 2.0; concat gives the same result.
    df = pd.concat([train_df, test_df])

    del train_df, test_df
    gc.collect()

    # to datetime
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])

    # datetime features (some are stored under two names; both are kept so
    # existing consumers of either name keep working)
    if reference_date is None:
        reference = datetime.datetime.today()
    else:
        reference = pd.to_datetime(reference_date)
    days_since_first_active = (reference - df['first_active_month']).dt.days

    df['quarter'] = df['first_active_month'].dt.quarter
    df['elapsed_time'] = days_since_first_active
    df['quarter_first_active_month'] = df['quarter']
    df['first_active_month_diff_from_today'] = days_since_first_active

    # one hot encoding of any remaining object columns
    df, _ = one_hot_encoder(df, nan_as_category=False)

    # Replace each feature_i level by the outlier rate observed for that level.
    # Only train rows contribute, because 'outliers' is NaN for test rows.
    for col in ['feature_1', 'feature_2', 'feature_3']:
        order_label = df.groupby(col)['outliers'].mean()
        df[col] = df[col].map(order_label)

    # Some basic statistics transformations over the feature_i columns
    feature_cols = ['feature_1', 'feature_2', 'feature_3']
    df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
    df['feature_mean'] = df['feature_sum'] / 3
    df['feature_max'] = df[feature_cols].max(axis=1)
    df['feature_min'] = df[feature_cols].min(axis=1)
    df['feature_std'] = df[feature_cols].std(axis=1)

    return df

## Imputations on transactions data 

In [None]:
def transactions_imputations(df):
    """Clean a transactions frame (historical or new-merchant).

    Steps:
    1. Fill missing values (``category_2`` -> 6, ``category_3`` -> 3,
       ``merchant_id`` -> 'M_ID_00a6ca8a8a', ``installments`` -> column mode
       after treating -1 and 999 as missing).
    2. Map the categorical columns to integers.
    3. Cap ``purchase_amount`` at 0.8 and derive ``price``.

    Args:
        df: raw transactions DataFrame; it is modified in place.

    Returns:
        The same DataFrame, cleaned.
    """
    # Columns are reassigned instead of using ``df[col].fillna(..., inplace=True)``:
    # the chained in-place form operates on a temporary object and does not
    # reliably (and under Copy-on-Write never) update ``df``.
    df['merchant_id'] = df['merchant_id'].fillna('M_ID_00a6ca8a8a')

    # -1 and 999 are treated as "missing" installments, then filled with the mode.
    installments = df['installments'].replace([-1, 999], np.nan)
    installments_mode = installments.mode()
    if len(installments_mode) > 0:
        installments = installments.fillna(installments_mode[0])
    df['installments'] = installments

    # mapping categorical to numerical
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0}).astype(int)

    # Missing category_3 gets its own code (3). Mapping first and filling
    # afterwards avoids writing an int into a string column; unknown labels
    # still surface as an error in astype(int).
    raw_category_3 = df['category_3']
    df['category_3'] = (
        raw_category_3.map({'A': 0, 'B': 1, 'C': 2, 3: 3})
        .where(raw_category_3.notna(), 3)
        .astype(int)
    )
    df['category_2'] = (
        df['category_2'].fillna(6)
        .map({1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 5.0: 5, 6: 6})
        .astype(int)
    )

    # cap purchase amount outliers (vectorised equivalent of min(x, 0.8))
    df['purchase_amount'] = df['purchase_amount'].clip(upper=0.8)
    # small epsilon avoids division by zero for 0 installments
    df['price'] = df['purchase_amount'] / (df['installments'] + 0.001)

    # reduce memory usage
    #df = reduce_mem_usage(df)

    return df


## Imputations on Merchant Data

In [None]:
def merchant_imputations(df):
    """Clean the merchants frame and suffix its columns with ``_merchants_t``.

    Steps:
    1. Keep one row per ``merchant_id`` (the last occurrence).
    2. Encode the categorical columns as integers, with a dedicated code for
       missing values.
    3. Fill missing id columns with -1111.
    4. Replace +inf in the ``avg_purchases_lag*`` columns by the largest
       finite-or-smaller value of that column.
    5. Rename every column except ``merchant_id`` to ``<name>_merchants_t``.

    Args:
        df: raw merchants DataFrame. It is not modified; a cleaned copy is
            returned.

    Returns:
        The cleaned, de-duplicated DataFrame.
    """

    def _encode(series, mapping, missing_code):
        # Map labels to ints; only genuinely missing entries get missing_code,
        # so an unexpected label still fails loudly in astype(int).
        return series.map(mapping).where(series.notna(), missing_code).astype(int)

    # drop duplicate merchant ids -- the result must be kept, otherwise the
    # duplicates survive and multiply rows when transactions are left-merged
    # on merchant_id.
    df = df.drop_duplicates(subset=['merchant_id'], keep='last').reset_index(drop=True)

    df['category_1'] = _encode(df['category_1'], {'Y': 0, 'N': 1, 2: 2}, 2)

    # most_recent_sales_range / most_recent_purchases_range hold A..E or null
    range_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 5: 5}
    df['most_recent_sales_range'] = _encode(df['most_recent_sales_range'], range_codes, 5)
    df['most_recent_purchases_range'] = _encode(df['most_recent_purchases_range'], range_codes, 5)

    # category_4 holds Y, N or null
    df['category_4'] = _encode(df['category_4'], {'Y': 0, 'N': 1, 2: 2}, 2)

    df['category_2'] = _encode(
        df['category_2'], {1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 5.0: 5, 6: 6}, 6)

    # missing ids are filled with the sentinel -1111
    for col in ['merchant_group_id', 'state_id',
                'merchant_category_id', 'subsector_id', 'city_id']:
        df[col] = df[col].fillna(-1111)

    # impute +inf with the column maximum of the remaining rows; Series.max
    # skips NaN, unlike the builtin max whose result depends on NaN position.
    features_inf = ["avg_purchases_lag3", "avg_purchases_lag6", "avg_purchases_lag12"]
    for col in features_inf:
        is_inf = df[col] == np.inf
        if is_inf.any():
            df.loc[is_inf, col] = df.loc[~is_inf, col].max()

    df.columns = [col + "_merchants_t" if col != "merchant_id" else col for col in df.columns]

    # reduce memory usage
    #df = reduce_mem_usage(df)

    return df

## FE and Grouping on card id 

In [None]:
def group_on_card_id(df):
    """
    FUNCTION:
    Derive calendar features from purchase_date and aggregate the
    transactions into one row per card_id.

    NOTE:
    This is for data without the merged merchant columns.

    ARGS:
    df is the transactions data frame to aggregate. The calendar columns
    (month, day, hour, weekofyear, weekday, weekend, month_diff) are added to
    it in place before grouping.

    RETURNS:
    returns the aggregated data frame (two-level column header:
    source column / aggregation), passed through reduce_mem_usage.

    """

    # Some feature engineering on date and time
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_date = df['purchase_date']
    df['month'] = purchase_date.dt.month
    df['day'] = purchase_date.dt.day
    df['hour'] = purchase_date.dt.hour
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
    # replacement. It returns a nullable UInt32, so cast back to a plain int
    # (this assumes purchase_date has no missing values).
    df['weekofyear'] = purchase_date.dt.isocalendar().week.astype('int64')
    df['weekday'] = purchase_date.dt.weekday
    df['weekend'] = (purchase_date.dt.weekday >= 5).astype(int)
    # NOTE: month_diff depends on the run date and is not part of the
    # aggregation below; it only ends up on the input frame.
    df['month_diff'] = ((datetime.datetime.today() - purchase_date).dt.days)//30
    df['month_diff'] += df['month_lag']

    # most frequent value of a group
    mode = lambda x: x.value_counts().index[0]

    flag_stats = ['sum', 'mean', mode]
    id_stats = ['nunique', mode]
    numeric_stats = ['sum', 'mean', 'min', 'max', 'std', 'var']
    calendar_stats = ['nunique', 'mean', 'min', 'max']

    agg_fun = {
        "authorized_flag": flag_stats,
        'category_1': flag_stats,
        'category_2': flag_stats,
        'category_3': flag_stats,

        'city_id': id_stats,
        'state_id': id_stats,
        'subsector_id': id_stats,
        'merchant_category_id': id_stats,
        'merchant_id': id_stats,

        'month_lag': numeric_stats,
        'installments': numeric_stats,
        'purchase_amount': numeric_stats,

        'weekend': calendar_stats,
        'weekday': calendar_stats,
        'hour': calendar_stats,
        'weekofyear': calendar_stats,
        'day': calendar_stats,
        # np.ptp = max - min, i.e. the time span covered by the card's purchases
        # Refer:-https://numpy.org/doc/stable/reference/generated/numpy.ptp.html
        'purchase_date': [np.ptp, 'min', 'max'],
        'month': ['sum', 'mean', 'nunique']
    }

    df = df.groupby("card_id", as_index=False).agg(agg_fun)
    print(df.card_id)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

In [None]:
def group_on_card_id_withMer1(df):
    """
    FUNCTION:
    Derive calendar features from purchase_date and aggregate the
    transactions (already merged with the ``*_merchants_t`` merchant columns)
    into one row per card_id.

    ARGS:
    df is the merged transactions data frame to aggregate. The calendar
    columns (month, day, hour, weekofyear, weekday, weekend, month_diff) are
    added to it in place before grouping.

    RETURNS:
    returns the aggregated data frame (two-level column header:
    source column / aggregation), passed through reduce_mem_usage.

    """

    # Some feature engineering on date and time
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_date = df['purchase_date']
    df['month'] = purchase_date.dt.month
    df['day'] = purchase_date.dt.day
    df['hour'] = purchase_date.dt.hour
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
    # replacement. It returns a nullable UInt32, so cast back to a plain int
    # (this assumes purchase_date has no missing values).
    df['weekofyear'] = purchase_date.dt.isocalendar().week.astype('int64')
    df['weekday'] = purchase_date.dt.weekday
    df['weekend'] = (purchase_date.dt.weekday >= 5).astype(int)
    # NOTE: month_diff depends on the run date and is not part of the
    # aggregation below; it only ends up on the input frame.
    df['month_diff'] = ((datetime.datetime.today() - purchase_date).dt.days)//30
    df['month_diff'] += df['month_lag']

    def mode(series):
        """Most common element in a series (NaN when the series has no mode)."""
        tmode = series.mode()
        if len(tmode) == 0:
            return np.nan
        else:
            return tmode[0]

    flag_stats = ['sum', 'mean', mode]
    id_stats = ['nunique', mode]
    numeric_stats = ['sum', 'mean', 'min', 'max', 'std', 'var']
    merchant_numeric_stats = ['sum', 'mean', 'min', 'max', 'var', 'skew']
    calendar_stats = ['nunique', 'mean', 'min', 'max']

    agg_fun = {
        "authorized_flag": flag_stats,
        'category_1': flag_stats,
        'category_2': flag_stats,
        'category_3': flag_stats,
        'category_1_merchants_t': flag_stats,
        'most_recent_sales_range_merchants_t': flag_stats,
        'category_4_merchants_t': flag_stats,
        'most_recent_purchases_range_merchants_t': flag_stats,
        'category_2_merchants_t': flag_stats,

        'city_id': id_stats,
        'state_id': id_stats,
        'subsector_id': id_stats,
        'merchant_category_id': id_stats,
        'merchant_id': id_stats,
        'merchant_group_id_merchants_t': id_stats,
        'state_id_merchants_t': id_stats,
        'merchant_category_id_merchants_t': id_stats,
        'subsector_id_merchants_t': id_stats,
        'city_id_merchants_t': id_stats,

        'month_lag': numeric_stats,
        'installments': numeric_stats,
        'purchase_amount': numeric_stats,
        'numerical_2_merchants_t': merchant_numeric_stats,
        'avg_sales_lag3_merchants_t': merchant_numeric_stats,
        'avg_purchases_lag3_merchants_t': merchant_numeric_stats,
        'active_months_lag3_merchants_t': merchant_numeric_stats,
        'avg_sales_lag6_merchants_t': merchant_numeric_stats,
        'avg_purchases_lag6_merchants_t': merchant_numeric_stats,
        'avg_sales_lag12_merchants_t': merchant_numeric_stats,
        'avg_purchases_lag12_merchants_t': merchant_numeric_stats,
        'numerical_1_merchants_t': merchant_numeric_stats,

        'active_months_lag6_merchants_t': merchant_numeric_stats,
        # listed once: a repeated dict key would silently overwrite itself
        'active_months_lag12_merchants_t': merchant_numeric_stats,

        'weekend': calendar_stats,
        'weekday': calendar_stats,
        'hour': calendar_stats,
        'weekofyear': calendar_stats,
        'day': calendar_stats,
        # np.ptp = max - min, i.e. the time span covered by the card's purchases
        # Refer:-https://numpy.org/doc/stable/reference/generated/numpy.ptp.html
        'purchase_date': [np.ptp, 'min', 'max'],
        'month': ['sum', 'mean', 'nunique']
    }

    df = df.groupby("card_id", as_index=False).agg(agg_fun)
    print(df.card_id)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

## Generating the augmented dataset

In [None]:
def generate_augmented_train_test(debug=False, data_dir='/content/drive/MyDrive/data'):
    """
    Build the per-card aggregated transaction features.

    STEPS:-
    1. Load historical transactions and new-merchant transactions
    2. Impute (clean) both transaction frames
    3. Load merchant data
    4. Impute (clean) the merchant data
    5. Left-merge the historical transactions with merchants
    6. Left-merge the new-merchant transactions with merchants
    7. Group both on card id (group_on_card_id_withMer1)

    ARGS:
    debug: when True only the first 1000 rows of every CSV are read.
    data_dir: directory that contains the three input CSV files.

    RETURNS:
    (historical_transactions_df, new_merchant_df), both aggregated to one
    row per card_id.

    NOTES ON THE JOIN TYPE
    Unique card ids (historical, new) measured with NUM_ROWS = 1000:

    JOIN:- INNER
    BEFORE 76,5
    AFTER 0,0

    JOIN:- LEFT
    BEFORE 76,5
    AFTER 76,5

    JOIN:- OUTER
    BEFORE 76,5
    AFTER 76,5
    LEFT was chosen because it introduces fewer NaN values.
    """
    num_rows = 1000 if debug else None

    def _csv_path(file_name):
        # Location of one input file inside data_dir.
        return '{}/{}'.format(data_dir, file_name)

    with timer("1ST HALF"):

        # load csv
        print("[INFO] LOADING NEW MERCHANT TRANSACTIONS AND HISTORICAL TRANSACTIONS ....")
        new_merchant_df = pd.read_csv(_csv_path('new_merchant_transactions.csv'), nrows=num_rows)
        historical_transactions_df = pd.read_csv(_csv_path('historical_transactions.csv'), nrows=num_rows)

        #Do the imputation
        print("[INFO] DOING THE TRANSACTIONS IMPUTATIONS ...")
        new_merchant_df = transactions_imputations(new_merchant_df)
        historical_transactions_df = transactions_imputations(historical_transactions_df)
        print("[INFO] DATA CLEANING DONE ...")

        #unique values of column of card id
        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        print("FOR NEW MERCHANT ",new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ",historical_transactions_df.card_id.nunique())

        #load csv
        print("[INFO] LOADING MERCHANTS ...")
        merchant_df = pd.read_csv(_csv_path('merchants.csv'), nrows=num_rows)

        #do imputations
        print("[INFO] DOING MERCHANT DATA IMPUTATIONS ...")
        merchant_df = merchant_imputations(merchant_df)

        #Merge the data frames (left join keeps every transaction row)
        print("[INFO] MERGING THE DATA FRAMES ...")
        historical_transactions_df = pd.merge(historical_transactions_df, merchant_df, on='merchant_id', how = 'left')
        new_merchant_df = pd.merge(new_merchant_df, merchant_df, on='merchant_id',how = 'left')

        # The merchants frame is no longer needed; release it before grouping.
        del merchant_df
        gc.collect()

        #printing the data frames
        print("[INFO] THE MODIFIED HISTORICAL TRANSACTIONS ...")
        print(historical_transactions_df.columns)
        print("[INFO] THE MOFIFIED NEW MERCHANTS TRANSACTIONS ...")
        print(new_merchant_df.columns)

        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        #unique values of column of card id
        print("FOR NEW MERCHANT ",new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ",historical_transactions_df.card_id.nunique())

        #Group on card id
        print("[INFO] GROUPING ON CARD ID ...")
        new_merchant_df = group_on_card_id_withMer1(new_merchant_df)
        historical_transactions_df = group_on_card_id_withMer1(historical_transactions_df)

        #printing the data frames
        print("[INFO] THE MODIFIED HISTORICAL TRANSACTIONS ...")
        print(historical_transactions_df.columns)
        print("[INFO] THE MOFIFIED NEW MERCHANTS TRANSACTIONS ...")
        print(new_merchant_df.columns)

        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        #unique values of column of card id
        print("FOR NEW MERCHANT ",new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ",historical_transactions_df.card_id.nunique())

        return historical_transactions_df,new_merchant_df



## Main Method

In [None]:
# Run the full pipeline (debug=False, so every CSV is read in full) and keep
# both aggregated frames as notebook-level variables for the cells below.
if __name__ == "__main__":
  historical_transactions_df,new_merchant_df = generate_augmented_train_test()

[INFO] LOADING NEW MERCHANT TRANSACTIONS AND HISTORICAL TRANSACTIONS ....
[INFO] MERGING THE HIST TRANSACTIONS DATA WITH MERCHANTS  ....
[INFO] DOING THE TRANSACTIONS IMPUTATIONS ...
[INFO] DATA CLEANING DONE ...
[INFO] THE UNIQUE VALUES OF CARD ID ...
FOR NEW MERCHANT  290001
FOR HISTORICAL TRANSACTIONS  325540
[INFO] LOADING MERCHANTS ...
[INFO] DOING MERCHANT DATA IMPUTATIONS ...
[INFO] MERGING THE DATA FRAMES ...
[INFO] THE MODIFIED HISTORICAL TRANSACTIONS ...
Index(['authorized_flag', 'card_id', 'city_id', 'category_1', 'installments',
       'category_3', 'merchant_category_id', 'merchant_id', 'month_lag',
       'purchase_amount', 'purchase_date', 'category_2', 'state_id',
       'subsector_id', 'price', 'merchant_group_id_merchants_t',
       'merchant_category_id_merchants_t', 'subsector_id_merchants_t',
       'numerical_1_merchants_t', 'numerical_2_merchants_t',
       'category_1_merchants_t', 'most_recent_sales_range_merchants_t',
       'most_recent_purchases_range_mercha

In [None]:
3941//60

65

In [None]:
!ls

 augmented_test.csv			  new_merchant_transactions.csv
 augmented_train.csv			  sample_submission.csv
'Data Dictionary.xlsx'			  test.csv
 Data_Dictionary.xlsx			  train.csv
 historical_transactions.csv		  transactions_raw_merged.csv
 merchants.csv				  transactions_refined_1_merged_WON.csv
 merged_transactions_with_merchants.csv


In [None]:
# Persist both aggregated frames in the current working directory so they can
# be reloaded later without re-running generate_augmented_train_test().
historical_transactions_df.to_pickle("historical_transactions_df.pkl")
new_merchant_df.to_pickle("new_merchant_df.pkl")

In [None]:
!ls

 augmented_test.csv			  new_merchant_df.pkl
 augmented_train.csv			  new_merchant_transactions.csv
'Data Dictionary.xlsx'			  sample_submission.csv
 Data_Dictionary.xlsx			  test.csv
 historical_transactions.csv		  train.csv
 historical_transactions_df.pkl		  transactions_raw_merged.csv
 merchants.csv				  transactions_refined_1_merged_WON.csv
 merged_transactions_with_merchants.csv


In [None]:
# Reload the aggregated frames from the pickle files written by the earlier
# to_pickle cell (paths are relative to the current working directory).
new_merchant_df = pd.read_pickle("new_merchant_df.pkl")
historical_transactions_df = pd.read_pickle("historical_transactions_df.pkl")

In [None]:
new_merchant_df.sample(10)

Unnamed: 0_level_0,card_id,authorized_flag,authorized_flag,authorized_flag,category_1,category_1,category_1,category_2,category_2,category_2,category_3,category_3,category_3,category_1_merchants_t,category_1_merchants_t,category_1_merchants_t,most_recent_sales_range_merchants_t,most_recent_sales_range_merchants_t,most_recent_sales_range_merchants_t,category_4_merchants_t,category_4_merchants_t,category_4_merchants_t,most_recent_purchases_range_merchants_t,most_recent_purchases_range_merchants_t,most_recent_purchases_range_merchants_t,category_2_merchants_t,category_2_merchants_t,category_2_merchants_t,city_id,city_id,state_id,state_id,subsector_id,subsector_id,merchant_category_id,merchant_category_id,merchant_id,merchant_id,merchant_group_id_merchants_t,merchant_group_id_merchants_t,...,numerical_1_merchants_t,numerical_1_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,weekend,weekend,weekend,weekend,weekday,weekday,weekday,weekday,hour,hour,hour,hour,weekofyear,weekofyear,weekofyear,weekofyear,day,day,day,day,purchase_date,purchase_date,purchase_date,month,month,month
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,nunique,mode,nunique,mode,nunique,mode,nunique,mode,nunique,mode,nunique,mode,...,var,skew,sum,mean,min,max,var,skew,sum,mean,min,max,var,skew,nunique,mean,min,max,nunique,mean,min,max,nunique,mean,min,max,nunique,mean,min,max,nunique,mean,min,max,ptp,min,max,sum,mean,nunique
236720,C_ID_d116f1237e,10,1,1,1,0.099976,0,15,1.5,1,10,1.0,1,7,0.700195,1,16,1.599609,1,7,0.700195,1,17,1.700195,2,25,2.5,1,4,250,2,9,6,27,6,80,9,M_ID_00a6ca8a8a,8,35,...,0.103638,3.160156,60,6.0,6,6,0.0,0.0,115,11.5,7,12,2.5,-3.162109,2,0.399902,0,1,6,3.300781,0,6,8,16.40625,13,23,3,15.898438,15,17,7,18.59375,13,28,15 days 10:34:47,2018-04-13 13:17:29,2018-04-28 23:52:16,40,4.0,1
230154,C_ID_cb4d767ff0,16,1,1,0,0.0,0,41,2.5625,1,0,0.0,0,16,1.0,1,36,2.25,2,7,0.4375,0,34,2.125,3,36,2.25,1,7,69,3,9,9,37,12,80,16,M_ID_2265fc4bf5,15,35,...,0.208008,3.734375,96,6.0,6,6,0.0,0.0,188,11.75,8,12,1.0,-4.0,2,0.25,0,1,6,3.5625,1,6,12,12.5,5,21,3,10.4375,9,13,8,7.75,1,29,31 days 07:43:45,2018-03-01 10:17:47,2018-04-01 18:01:32,50,3.125,2
101750,C_ID_59dbe74984,10,1,1,0,0.0,0,10,1.0,1,11,1.099609,1,9,0.899902,1,19,1.900391,2,7,0.700195,1,18,1.799805,2,15,1.5,1,4,153,2,15,7,33,7,705,10,M_ID_2c0e14b445,9,35,...,0.04599,2.136719,60,6.0,6,6,0.0,0.0,120,12.0,12,12,0.0,0.0,2,0.600098,0,1,6,3.699219,0,6,6,12.703125,8,19,5,7.800781,6,13,8,10.101562,3,29,52 days 09:43:44,2018-02-05 09:57:27,2018-03-29 19:41:11,24,2.400391,2
20097,C_ID_11e39e5b20,5,1,1,1,0.199951,0,10,2.0,1,5,1.0,1,4,0.799805,1,12,2.400391,2,0,0.0,0,11,2.199219,2,10,2.0,1,2,69,2,9,5,2,5,80,5,M_ID_229817b3ed,5,35,...,0.481689,2.236328,30,6.0,6,6,0.0,0.0,60,12.0,12,12,0.0,0.0,2,0.600098,0,1,3,4.199219,0,6,4,12.203125,8,20,3,6.800781,3,8,4,21.0,15,25,34 days 20:20:04,2018-01-21 12:54:36,2018-02-25 09:14:40,9,1.799805,2
255236,C_ID_e179132dcf,2,1,1,0,0.0,0,2,1.0,1,0,0.0,0,2,1.0,1,2,1.0,0,1,0.5,0,2,1.0,0,2,1.0,1,2,69,1,9,2,17,2,606,2,M_ID_5eee1a8d36,1,35,...,0.830566,,12,6.0,6,6,0.0,,22,11.0,10,12,2.0,,1,0.0,0,0,1,4.0,4,4,2,6.0,0,12,1,46.0,46,46,1,17.0,17,17,0 days 12:51:03,2017-11-17 00:00:00,2017-11-17 12:51:03,22,11.0,1
130100,C_ID_72dc560ad2,7,1,1,1,0.142822,0,12,1.713867,1,13,1.857422,1,6,0.856934,1,11,1.571289,0,2,0.285645,0,13,1.857422,2,12,1.713867,1,3,69,2,9,6,21,7,309,7,M_ID_089f57b2c6,7,35,...,47.3125,2.642578,42,6.0,6,6,0.0,0.0,84,12.0,12,12,0.0,0.0,2,0.571289,0,1,5,4.0,0,6,7,11.570312,0,18,4,15.0,12,17,5,26.28125,22,31,36 days 21:27:41,2018-03-23 16:00:23,2018-04-29 13:28:04,25,3.572266,2
112260,C_ID_6318fdf637,3,1,1,0,0.0,0,9,3.0,3,0,0.0,0,3,1.0,1,7,2.333984,2,2,0.666504,1,8,2.666016,3,9,3.0,3,3,170,2,8,1,19,1,307,3,M_ID_2ca8ed0040,3,36174,...,3.3e-05,1.732422,18,6.0,6,6,0.0,0.0,36,12.0,12,12,0.0,0.0,2,0.666504,0,1,2,4.332031,3,5,2,12.335938,9,19,2,9.335938,9,10,3,4.667969,1,10,9 days 10:04:58,2018-03-01 09:12:15,2018-03-10 19:17:13,9,3.0,1
241339,C_ID_d53028b34e,2,1,1,0,0.0,0,10,5.0,5,2,1.0,1,2,1.0,1,5,2.5,1,2,1.0,1,4,2.0,1,10,5.0,5,2,26,1,21,1,16,1,367,2,M_ID_7e58002561,2,3017,...,0.009636,,12,6.0,6,6,0.0,,24,12.0,12,12,0.0,,1,0.0,0,0,1,3.0,3,3,2,8.5,8,9,1,13.0,13,13,1,29.0,29,29,0 days 01:25:34,2018-03-29 08:23:37,2018-03-29 09:49:11,6,3.0,1
18695,C_ID_109ed6e984,3,1,1,0,0.0,0,3,1.0,1,0,0.0,0,3,1.0,1,3,1.0,0,2,0.666504,1,2,0.666504,0,3,1.0,1,2,213,1,9,3,17,3,307,3,M_ID_88efd72bce,2,35,...,1.579102,1.731445,18,6.0,6,6,0.0,0.0,34,11.335938,10,12,1.333008,-1.732422,1,0.0,0,0,3,1.666992,0,3,3,14.335938,1,22,3,2.0,1,3,3,9.664062,3,15,12 days 01:33:54,2018-01-03 20:47:39,2018-01-15 22:21:33,3,1.0,1
129984,C_ID_72c501220e,3,1,1,0,0.0,0,3,1.0,1,0,0.0,0,2,0.666504,1,6,2.0,1,2,0.666504,1,7,2.333984,1,8,2.666016,1,2,331,1,16,2,33,2,705,3,M_ID_14ca097a4b,3,20648,...,0.000688,1.458008,18,6.0,6,6,0.0,0.0,36,12.0,12,12,0.0,0.0,2,0.333252,0,1,2,3.0,2,5,3,15.0,9,19,2,8.0,7,10,2,12.664062,10,14,23 days 15:59:21,2018-02-14 17:11:07,2018-03-10 09:10:28,7,2.333984,2


In [None]:
historical_transactions_df.sample(10)

Unnamed: 0_level_0,card_id,authorized_flag,authorized_flag,authorized_flag,category_1,category_1,category_1,category_2,category_2,category_2,category_3,category_3,category_3,category_1_merchants_t,category_1_merchants_t,category_1_merchants_t,most_recent_sales_range_merchants_t,most_recent_sales_range_merchants_t,most_recent_sales_range_merchants_t,category_4_merchants_t,category_4_merchants_t,category_4_merchants_t,most_recent_purchases_range_merchants_t,most_recent_purchases_range_merchants_t,most_recent_purchases_range_merchants_t,category_2_merchants_t,category_2_merchants_t,category_2_merchants_t,city_id,city_id,state_id,state_id,subsector_id,subsector_id,merchant_category_id,merchant_category_id,merchant_id,merchant_id,merchant_group_id_merchants_t,merchant_group_id_merchants_t,...,numerical_1_merchants_t,numerical_1_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag6_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,active_months_lag12_merchants_t,weekend,weekend,weekend,weekend,weekday,weekday,weekday,weekday,hour,hour,hour,hour,weekofyear,weekofyear,weekofyear,weekofyear,day,day,day,day,purchase_date,purchase_date,purchase_date,month,month,month
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,sum,mean,mode,nunique,mode,nunique,mode,nunique,mode,nunique,mode,nunique,mode,nunique,mode,...,var,skew,sum,mean,min,max,var,skew,sum,mean,min,max,var,skew,nunique,mean,min,max,nunique,mean,min,max,nunique,mean,min,max,nunique,mean,min,max,nunique,mean,min,max,ptp,min,max,sum,mean,nunique
171362,C_ID_86c2d5b2e7,440,0.977539,1,152,0.337891,0,1775,3.945312,3,488,1.083984,1,266,0.591309,1,526,1.168945,0,21,0.046661,0,596,1.324219,0,1882,4.183594,3,16,25,8,7,28,29,52,839,149,M_ID_e5374dabc0,113,35,...,10.773438,11.359375,2700,6.0,6,6,0.0,0.0,5354,11.898438,7,12,0.479492,-6.773438,2,0.297852,0,1,7,3.189453,0,6,23,13.265625,0,23,41,29.265625,1,52,31,16.59375,1,31,280 days 16:42:45,2017-05-22 17:15:24,2018-02-27 09:58:09,3218,7.152344,10
4010,C_ID_0330e95553,58,0.92041,1,1,0.015869,0,144,2.285156,1,90,1.428711,1,52,0.825195,1,83,1.317383,0,8,0.126953,0,91,1.444336,0,190,3.015625,1,6,69,4,9,18,37,24,367,40,M_ID_820c7b73c8,28,35,...,710.0,5.03125,378,6.0,6,6,0.0,0.0,746,11.84375,7,12,0.780762,-5.472656,2,0.412598,0,1,7,3.380859,0,6,18,14.1875,0,22,20,35.3125,1,52,25,19.5625,1,31,180 days 16:32:16,2017-08-26 17:33:50,2018-02-23 10:06:06,531,8.429688,7
73253,C_ID_397678055c,152,0.956055,1,0,0.0,0,323,2.03125,2,2,0.012581,0,159,1.0,1,184,1.157227,0,159,1.0,1,185,1.163086,0,323,2.03125,2,3,282,2,18,14,33,24,705,41,M_ID_4894c5aa13,32,72792,...,0.00901,6.257812,954,6.0,6,6,0.0,0.0,1908,12.0,12,12,0.0,0.0,2,0.22644,0,1,7,2.9375,0,6,20,15.726562,0,23,45,24.0625,1,52,31,15.960938,1,31,416 days 00:26:53,2017-01-07 14:45:02,2018-02-27 15:11:55,947,5.957031,12
162347,C_ID_7f9d916038,27,0.84375,1,0,0.0,0,32,1.0,1,0,0.0,0,27,0.84375,1,64,2.0,2,27,0.84375,1,70,2.1875,2,57,1.78125,1,3,236,2,15,9,33,11,705,15,M_ID_ae26da131a,9,35,...,0.013626,3.890625,192,6.0,6,6,0.0,0.0,379,11.84375,7,12,0.78125,-5.65625,2,0.15625,0,1,5,2.96875,1,5,13,13.8125,6,21,14,37.25,2,52,11,22.1875,1,31,181 days 04:17:48,2017-08-31 06:30:34,2018-02-28 10:48:22,281,8.78125,7
156860,C_ID_7b469e9a1a,21,0.95459,1,0,0.0,0,82,3.726562,4,28,1.272461,1,20,0.90918,1,53,2.408203,3,4,0.181763,0,58,2.636719,2,92,4.183594,4,3,212,2,4,5,27,10,278,16,M_ID_00a6ca8a8a,14,35,...,0.45752,4.625,132,6.0,6,6,0.0,0.0,259,11.773438,7,12,1.136719,-4.691406,2,0.772949,0,1,5,4.773438,1,6,12,15.632812,0,22,12,27.546875,1,51,12,17.546875,3,31,237 days 18:45:24,2017-06-03 21:07:44,2018-01-27 15:53:08,149,6.773438,7
13939,C_ID_0af98d13b2,43,1.0,1,0,0.0,0,215,5.0,5,44,1.023438,1,41,0.953613,1,48,1.116211,0,0,0.0,0,36,0.837402,0,217,5.046875,5,1,53,1,20,7,33,8,705,11,M_ID_5b82a97cbe,8,926,...,0.518066,1.289062,258,6.0,6,6,0.0,0.0,516,12.0,12,12,0.0,0.0,2,0.023254,0,1,6,2.09375,0,5,16,14.554688,0,23,27,34.875,13,51,26,16.25,2,31,269 days 18:39:50,2017-03-28 19:30:53,2017-12-23 14:10:43,362,8.421875,10
19227,C_ID_0f2922f7e9,386,0.967285,1,0,0.0,0,415,1.040039,1,2,0.005013,0,316,0.791992,1,556,1.393555,0,316,0.791992,1,530,1.328125,0,814,2.041016,1,6,242,2,9,16,33,28,705,50,M_ID_f29b19ac65,29,35,...,85.0,19.75,2394,6.0,6,6,0.0,0.0,4593,11.507812,7,12,2.210938,-2.71875,2,0.300781,0,1,7,3.068359,0,6,23,13.46875,0,23,47,26.984375,1,52,31,16.09375,1,31,415 days 05:49:51,2017-01-09 17:34:43,2018-02-28 23:24:34,2650,6.640625,12
298844,C_ID_eb19a749ba,26,0.866699,1,0,0.0,0,30,1.0,1,41,1.366211,1,26,0.866699,1,46,1.533203,1,26,0.866699,1,42,1.400391,1,50,1.666992,1,2,308,2,16,7,33,8,705,11,M_ID_f066b7ecd2,8,35,...,0.96582,0.703125,180,6.0,6,6,0.0,0.0,360,12.0,12,12,0.0,0.0,2,0.199951,0,1,7,1.799805,0,6,11,14.929688,10,20,18,20.703125,1,48,16,11.664062,1,31,396 days 23:22:02,2017-01-05 17:07:05,2018-02-06 16:29:07,159,5.300781,11
213049,C_ID_a77cc9598b,21,0.65625,1,0,0.0,0,40,1.25,1,32,1.0,1,32,1.0,1,41,1.28125,1,28,0.875,1,41,1.28125,1,32,1.0,1,3,19,2,9,5,33,6,705,10,M_ID_df01eee48d,10,37459,...,0.018967,2.228516,192,6.0,6,6,0.0,0.0,384,12.0,12,12,0.0,0.0,2,0.03125,0,1,6,2.59375,0,5,8,11.40625,7,16,11,26.125,1,52,13,16.59375,1,29,89 days 00:02:24,2017-12-01 11:42:20,2018-02-28 11:44:44,204,6.375,3
15438,C_ID_0c27ecfec0,8,0.571289,1,0,0.0,0,70,5.0,5,19,1.357422,1,14,1.0,1,22,1.571289,1,14,1.0,1,21,1.5,1,70,5.0,5,2,57,2,5,5,27,7,307,8,M_ID_186e74f264,5,35,...,159.875,1.777344,84,6.0,6,6,0.0,0.0,168,12.0,12,12,0.0,0.0,2,0.285645,0,1,6,3.357422,1,6,7,11.789062,8,18,7,24.0,3,51,8,16.0,1,26,73 days 04:43:46,2017-12-10 10:05:21,2018-02-21 14:49:07,83,5.929688,3


In [None]:
# Flatten the multi-level column labels into single strings by joining the
# level values with "_" (e.g. ('authorized_flag', 'sum') -> 'authorized_flag_sum').
# A label whose last level is empty keeps a trailing underscore ('card_id_');
# that is cleaned up by the rename a few cells below.
# Only tuple labels are joined, so re-running this cell on the already
# flattened frame is a no-op instead of splitting every name into characters.
historical_transactions_df.columns = [
    "_".join(col) if isinstance(col, tuple) else col
    for col in historical_transactions_df.columns
]

In [None]:
# Inspect the column labels after flattening to confirm they are now plain strings.
print(historical_transactions_df.columns)

Index(['card_id_', 'authorized_flag_sum', 'authorized_flag_mean',
       'authorized_flag_mode', 'category_1_sum', 'category_1_mean',
       'category_1_mode', 'category_2_sum', 'category_2_mean',
       'category_2_mode',
       ...
       'day_nunique', 'day_mean', 'day_min', 'day_max', 'purchase_date_ptp',
       'purchase_date_min', 'purchase_date_max', 'month_sum', 'month_mean',
       'month_nunique'],
      dtype='object', length=158)


In [None]:
# Peek at the key column; after flattening it is still named 'card_id_'
# (trailing underscore) until the rename in the next cell.
historical_transactions_df.card_id_

0         C_ID_00007093c1
1         C_ID_0001238066
2         C_ID_0001506ef0
3         C_ID_0001793786
4         C_ID_000183fdda
               ...       
325535    C_ID_ffff1d9928
325536    C_ID_ffff579d3a
325537    C_ID_ffff756266
325538    C_ID_ffff828181
325539    C_ID_fffffd5772
Name: card_id_, Length: 325540, dtype: object

In [None]:
# Restore the plain 'card_id' key name (flattening left it as 'card_id_') so
# the frame can later be merged on 'card_id'. Renaming in place; the call is a
# no-op if the column has already been renamed.
historical_transactions_df.rename(
    columns={'card_id_': 'card_id'},
    inplace=True,
)

In [None]:
# Flatten the multi-level column labels into single strings by joining the
# level values with "_" (e.g. ('authorized_flag', 'sum') -> 'authorized_flag_sum').
# A label whose last level is empty keeps a trailing underscore ('card_id_');
# that is cleaned up by the rename a few cells below.
# Only tuple labels are joined, so re-running this cell on the already
# flattened frame is a no-op instead of splitting every name into characters.
new_merchant_df.columns = [
    "_".join(col) if isinstance(col, tuple) else col
    for col in new_merchant_df.columns
]

In [None]:
# Inspect the column labels after flattening to confirm they are now plain strings.
print(new_merchant_df.columns)

Index(['card_id_', 'authorized_flag_sum', 'authorized_flag_mean',
       'authorized_flag_mode', 'category_1_sum', 'category_1_mean',
       'category_1_mode', 'category_2_sum', 'category_2_mean',
       'category_2_mode',
       ...
       'day_nunique', 'day_mean', 'day_min', 'day_max', 'purchase_date_ptp',
       'purchase_date_min', 'purchase_date_max', 'month_sum', 'month_mean',
       'month_nunique'],
      dtype='object', length=158)


In [None]:
# Peek at the key column. At this point in the notebook it is still named
# 'card_id_' (the rename to 'card_id' happens in the next cell), so use that
# name; `new_merchant_df.card_id` only worked when cells were run out of order.
new_merchant_df.card_id_

0         C_ID_00007093c1
1         C_ID_0001238066
2         C_ID_0001506ef0
3         C_ID_0001793786
4         C_ID_000183fdda
               ...       
289996    C_ID_ffff1d9928
289997    C_ID_ffff579d3a
289998    C_ID_ffff756266
289999    C_ID_ffff828181
290000    C_ID_fffffd5772
Name: card_id, Length: 290001, dtype: object

In [None]:
# Restore the plain 'card_id' key name (flattening left it as 'card_id_') so
# the frame can later be merged on 'card_id'. Renaming in place; the call is a
# no-op if the column has already been renamed.
new_merchant_df.rename(
    columns={'card_id_': 'card_id'},
    inplace=True,
)

In [None]:
# Display the new-merchant aggregate frame to check the renamed 'card_id'
# column and the flattened feature names.
new_merchant_df

Unnamed: 0,card_id,authorized_flag_sum,authorized_flag_mean,authorized_flag_mode,category_1_sum,category_1_mean,category_1_mode,category_2_sum,category_2_mean,category_2_mode,category_3_sum,category_3_mean,category_3_mode,category_1_merchants_t_sum,category_1_merchants_t_mean,category_1_merchants_t_mode,most_recent_sales_range_merchants_t_sum,most_recent_sales_range_merchants_t_mean,most_recent_sales_range_merchants_t_mode,category_4_merchants_t_sum,category_4_merchants_t_mean,category_4_merchants_t_mode,most_recent_purchases_range_merchants_t_sum,most_recent_purchases_range_merchants_t_mean,most_recent_purchases_range_merchants_t_mode,category_2_merchants_t_sum,category_2_merchants_t_mean,category_2_merchants_t_mode,city_id_nunique,city_id_mode,state_id_nunique,state_id_mode,subsector_id_nunique,subsector_id_mode,merchant_category_id_nunique,merchant_category_id_mode,merchant_id_nunique,merchant_id_mode,merchant_group_id_merchants_t_nunique,merchant_group_id_merchants_t_mode,...,numerical_1_merchants_t_var,numerical_1_merchants_t_skew,active_months_lag6_merchants_t_sum,active_months_lag6_merchants_t_mean,active_months_lag6_merchants_t_min,active_months_lag6_merchants_t_max,active_months_lag6_merchants_t_var,active_months_lag6_merchants_t_skew,active_months_lag12_merchants_t_sum,active_months_lag12_merchants_t_mean,active_months_lag12_merchants_t_min,active_months_lag12_merchants_t_max,active_months_lag12_merchants_t_var,active_months_lag12_merchants_t_skew,weekend_nunique,weekend_mean,weekend_min,weekend_max,weekday_nunique,weekday_mean,weekday_min,weekday_max,hour_nunique,hour_mean,hour_min,hour_max,weekofyear_nunique,weekofyear_mean,weekofyear_min,weekofyear_max,day_nunique,day_mean,day_min,day_max,purchase_date_ptp,purchase_date_min,purchase_date_max,month_sum,month_mean,month_nunique
0,C_ID_00007093c1,3,1,1,0,0.000000,0,5,1.666992,1,3,1.000000,1,1,0.333252,0,7,2.333984,0,0,0.000000,0,8,2.666016,4,15,5.000000,6,2,69,2,9,2,29,2,879,2,M_ID_00a6ca8a8a,2,35,...,0.000033,1.732422,18,6.0,6,6,0.0,0.0,31,10.335938,7,12,8.335938,-1.732422,1,0.000000,0,0,2,0.333252,0,1,2,14.335938,11,16,2,14.664062,14,15,2,7.000000,3,9,6 days 05:10:24,2018-04-03 11:13:35,2018-04-09 16:23:59,12,4.000000,1
1,C_ID_0001238066,28,1,1,2,0.071411,0,60,2.142578,1,34,1.213867,1,20,0.714355,1,48,1.713867,0,16,0.571289,1,50,1.786133,0,80,2.857422,1,8,314,4,9,9,37,15,278,25,M_ID_00a6ca8a8a,19,35,...,723.500000,4.128906,168,6.0,6,6,0.0,0.0,323,11.539062,7,12,1.813477,-3.070312,2,0.464355,0,1,6,4.070312,0,6,16,14.500000,0,23,9,12.820312,9,18,14,18.859375,1,30,60 days 03:09:03,2018-03-01 16:48:27,2018-04-30 19:57:30,93,3.322266,2
2,C_ID_0001506ef0,3,1,1,0,0.000000,0,9,3.000000,3,0,0.000000,0,1,0.333252,0,6,2.000000,0,0,0.000000,0,6,2.000000,0,15,5.000000,6,1,137,1,19,2,34,2,360,2,M_ID_00a6ca8a8a,2,35,...,0.000033,-1.732422,18,6.0,6,6,0.0,0.0,31,10.335938,7,12,8.335938,-1.732422,1,0.000000,0,0,2,3.666016,3,4,2,17.671875,9,22,2,11.335938,11,12,2,18.000000,16,22,5 days 10:52:32,2018-03-16 22:21:58,2018-03-22 09:14:30,9,3.000000,1
3,C_ID_0001793786,31,1,1,0,0.000000,0,63,2.033203,1,0,0.000000,0,27,0.871094,1,60,1.935547,3,9,0.290283,0,65,2.097656,3,71,2.291016,1,7,69,5,9,14,37,21,278,31,M_ID_0360f86430,21,35,...,516.500000,4.437500,186,6.0,6,6,0.0,0.0,366,11.804688,9,12,0.427979,-3.603516,2,0.451660,0,1,6,3.580078,0,6,10,11.421875,0,21,6,48.375000,46,52,13,23.609375,10,31,46 days 01:51:36,2017-11-15 15:44:20,2017-12-31 17:35:56,351,11.320312,2
4,C_ID_000183fdda,11,1,1,0,0.000000,0,33,3.000000,3,17,1.545898,1,11,1.000000,1,28,2.544922,2,0,0.000000,0,25,2.273438,3,33,3.000000,3,2,161,2,3,6,16,9,367,11,M_ID_113378fe3b,9,35,...,0.217529,1.808594,66,6.0,6,6,0.0,0.0,132,12.000000,12,12,0.000000,0.000000,2,0.181763,0,1,6,2.455078,0,5,8,15.453125,11,22,7,11.820312,9,18,9,11.726562,2,30,59 days 02:33:27,2018-03-02 12:26:26,2018-04-30 14:59:53,36,3.273438,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289996,C_ID_ffff1d9928,4,1,1,1,0.250000,0,18,4.500000,4,5,1.250000,1,3,0.750000,1,6,1.500000,2,3,0.750000,1,7,1.750000,2,18,4.500000,4,2,114,2,4,4,1,4,307,4,M_ID_2ad5c10ff1,4,35,...,0.088501,1.871094,24,6.0,6,6,0.0,0.0,48,12.000000,12,12,0.000000,0.000000,1,0.000000,0,0,2,2.000000,0,4,4,14.000000,7,21,2,15.000000,12,16,3,18.750000,16,20,31 days 13:27:55,2018-03-19 21:00:19,2018-04-20 10:28:14,15,3.750000,2
289997,C_ID_ffff579d3a,1,1,1,0,0.000000,0,1,1.000000,1,0,0.000000,0,1,1.000000,1,0,0.000000,0,0,0.000000,0,0,0.000000,0,1,1.000000,1,1,69,1,9,1,1,1,419,1,M_ID_69f80607d1,1,35,...,,,6,6.0,6,6,,,12,12.000000,12,12,,,1,0.000000,0,0,1,2.000000,2,2,1,8.000000,8,8,1,12.000000,12,12,1,21.000000,21,21,0 days 00:00:00,2018-03-21 08:55:34,2018-03-21 08:55:34,3,3.000000,1
289998,C_ID_ffff756266,1,1,1,0,0.000000,0,1,1.000000,1,3,3.000000,3,1,1.000000,1,3,3.000000,3,1,1.000000,1,3,3.000000,3,1,1.000000,1,1,233,1,9,1,21,1,411,1,M_ID_b6b4f36675,1,7426,...,,,6,6.0,6,6,,,12,12.000000,12,12,,,1,0.000000,0,0,1,1.000000,1,1,1,7.000000,7,7,1,15.000000,15,15,1,10.000000,10,10,0 days 00:00:00,2018-04-10 07:43:43,2018-04-10 07:43:43,4,4.000000,1
289999,C_ID_ffff828181,8,1,1,0,0.000000,0,27,3.375000,2,11,1.375000,1,6,0.750000,1,13,1.625000,1,5,0.625000,1,16,2.000000,2,30,3.750000,2,6,96,4,24,6,19,6,307,8,M_ID_0e9430c0b5,8,35,...,0.000448,1.817383,48,6.0,6,6,0.0,0.0,92,11.500000,8,12,2.000000,-2.828125,2,0.375000,0,1,5,4.000000,2,6,7,14.500000,7,19,5,14.375000,11,17,7,20.250000,5,29,45 days 23:10:47,2018-03-14 19:48:42,2018-04-29 18:59:29,29,3.625000,2


In [None]:
# Display the historical-transaction aggregate frame to check the renamed
# 'card_id' column and the flattened feature names.
historical_transactions_df

Unnamed: 0,card_id,authorized_flag_sum,authorized_flag_mean,authorized_flag_mode,category_1_sum,category_1_mean,category_1_mode,category_2_sum,category_2_mean,category_2_mode,category_3_sum,category_3_mean,category_3_mode,category_1_merchants_t_sum,category_1_merchants_t_mean,category_1_merchants_t_mode,most_recent_sales_range_merchants_t_sum,most_recent_sales_range_merchants_t_mean,most_recent_sales_range_merchants_t_mode,category_4_merchants_t_sum,category_4_merchants_t_mean,category_4_merchants_t_mode,most_recent_purchases_range_merchants_t_sum,most_recent_purchases_range_merchants_t_mean,most_recent_purchases_range_merchants_t_mode,category_2_merchants_t_sum,category_2_merchants_t_mean,category_2_merchants_t_mode,city_id_nunique,city_id_mode,state_id_nunique,state_id_mode,subsector_id_nunique,subsector_id_mode,merchant_category_id_nunique,merchant_category_id_mode,merchant_id_nunique,merchant_id_mode,merchant_group_id_merchants_t_nunique,merchant_group_id_merchants_t_mode,...,numerical_1_merchants_t_var,numerical_1_merchants_t_skew,active_months_lag6_merchants_t_sum,active_months_lag6_merchants_t_mean,active_months_lag6_merchants_t_min,active_months_lag6_merchants_t_max,active_months_lag6_merchants_t_var,active_months_lag6_merchants_t_skew,active_months_lag12_merchants_t_sum,active_months_lag12_merchants_t_mean,active_months_lag12_merchants_t_min,active_months_lag12_merchants_t_max,active_months_lag12_merchants_t_var,active_months_lag12_merchants_t_skew,weekend_nunique,weekend_mean,weekend_min,weekend_max,weekday_nunique,weekday_mean,weekday_min,weekday_max,hour_nunique,hour_mean,hour_min,hour_max,weekofyear_nunique,weekofyear_mean,weekofyear_min,weekofyear_max,day_nunique,day_mean,day_min,day_max,purchase_date_ptp,purchase_date_min,purchase_date_max,month_sum,month_mean,month_nunique
0,C_ID_00007093c1,114,0.765137,1,28,0.187866,0,533,3.578125,3,173,1.161133,1,121,0.812012,1,268,1.798828,3,100,0.670898,1,281,1.885742,3,531,3.564453,3,4,244,3,2,13,19,18,307,29,M_ID_9400cf2342,14,60307,...,19.468750,2.517578,894,6.0,6,6,0.0,0.0,1776,11.921875,9,12,0.236694,-5.914062,2,0.167725,0,1,7,2.242188,0,6,18,14.414062,0,22,39,25.546875,1,52,28,13.250000,1,31,377 days 15:14:14,2017-02-14 14:00:43,2018-02-27 05:14:57,950,6.375000,12
1,C_ID_0001238066,120,0.975586,1,2,0.016266,0,243,1.975586,1,161,1.308594,1,118,0.959473,1,225,1.829102,2,103,0.837402,1,203,1.650391,2,220,1.789062,1,18,314,6,9,17,19,29,307,65,M_ID_d17aabd756,50,35,...,947.500000,3.273438,738,6.0,6,6,0.0,0.0,1476,12.000000,12,12,0.000000,0.000000,2,0.422852,0,1,7,3.503906,0,6,20,14.742188,0,23,23,29.968750,1,52,30,16.140625,1,30,151 days 17:53:45,2017-09-28 22:25:14,2018-02-27 16:18:59,899,7.308594,6
2,C_ID_0001506ef0,64,0.941406,1,0,0.000000,0,196,2.882812,3,1,0.014709,0,64,0.941406,1,90,1.323242,0,1,0.014709,0,89,1.308594,0,216,3.175781,3,3,137,2,19,12,33,19,705,28,M_ID_b1fc88154d,18,35,...,6.691406,3.732422,408,6.0,6,6,0.0,0.0,806,11.851562,7,12,0.724121,-5.695312,2,0.500000,0,1,7,3.617188,0,6,15,12.585938,0,21,24,27.734375,1,51,25,11.914062,1,31,398 days 20:17:55,2017-01-14 16:16:01,2018-02-17 12:33:56,473,6.957031,11
3,C_ID_0001793786,195,0.878418,1,2,0.009010,0,898,4.046875,6,5,0.022522,0,190,0.855957,1,443,1.995117,2,65,0.292725,0,455,2.048828,2,576,2.593750,2,10,179,4,-1,24,37,48,278,119,M_ID_923d57de8d,85,35,...,1.714844,3.111328,1332,6.0,6,6,0.0,0.0,2631,11.851562,7,12,0.679199,-5.640625,2,0.166626,0,1,7,2.662109,0,6,21,15.671875,0,23,33,27.171875,3,44,31,16.187500,1,31,283 days 10:04:57,2017-01-21 10:15:21,2017-10-31 20:20:18,1482,6.675781,10
4,C_ID_000183fdda,142,0.953125,1,4,0.026840,0,436,2.925781,3,194,1.301758,1,129,0.865723,1,284,1.906250,1,7,0.046967,0,308,2.066406,2,507,3.402344,3,9,161,7,3,21,16,36,367,73,M_ID_f9cfe0a43b,57,35,...,0.562500,3.970703,894,6.0,6,6,0.0,0.0,1760,11.812500,7,12,0.869629,-4.882812,2,0.234863,0,1,7,2.925781,0,6,19,16.531250,0,23,27,27.859375,1,52,30,13.937500,1,31,202 days 11:07:54,2017-08-07 09:49:14,2018-02-25 20:57:08,1025,6.878906,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325535,C_ID_ffff1d9928,10,0.714355,1,3,0.214233,0,56,4.000000,4,18,1.286133,1,9,0.643066,1,34,2.427734,3,9,0.643066,1,30,2.142578,3,66,4.714844,4,3,114,3,4,5,33,5,705,8,M_ID_00a6ca8a8a,5,35,...,0.091736,3.654297,84,6.0,6,6,0.0,0.0,158,11.289062,7,12,3.296875,-2.294922,2,0.214233,0,1,6,2.285156,0,6,7,12.359375,0,17,8,28.281250,1,52,8,17.000000,5,31,89 days 14:32:08,2017-10-26 00:00:00,2018-01-23 14:32:08,96,6.855469,4
325536,C_ID_ffff579d3a,99,0.868652,1,0,0.000000,0,118,1.035156,1,3,0.026321,0,102,0.894531,1,239,2.095703,2,79,0.692871,1,214,1.876953,3,174,1.526367,1,7,70,2,9,15,37,27,80,62,M_ID_f0ada5d0c0,37,35,...,527.500000,4.984375,684,6.0,6,6,0.0,0.0,1340,11.757812,8,12,0.841797,-3.617188,2,0.192993,0,1,7,2.394531,0,6,19,15.132812,0,23,25,32.906250,2,52,30,16.531250,1,30,199 days 04:56:11,2017-08-08 12:33:43,2018-02-23 17:29:54,906,7.949219,7
325537,C_ID_ffff756266,14,0.583496,1,3,0.125000,0,39,1.625000,1,37,1.541992,2,19,0.791504,1,37,1.541992,2,19,0.791504,1,41,1.708008,2,49,2.041016,1,2,233,2,9,10,20,13,529,13,M_ID_0262bf62a0,7,35,...,0.328857,0.714844,144,6.0,6,6,0.0,0.0,288,12.000000,12,12,0.000000,0.000000,2,0.083313,0,1,5,2.750000,1,6,9,15.000000,7,23,9,22.203125,4,41,10,19.625000,2,30,260 days 20:41:13,2017-01-24 20:00:38,2017-10-12 16:41:51,130,5.417969,6
325538,C_ID_ffff828181,167,0.874512,1,11,0.057587,0,424,2.220703,2,215,1.125977,1,173,0.905762,1,388,2.031250,2,169,0.884766,1,397,2.078125,2,458,2.398438,2,11,96,7,24,24,19,44,307,89,M_ID_181cf8c379,60,35,...,2.947266,4.113281,1146,6.0,6,6,0.0,0.0,2284,11.960938,7,12,0.177124,-10.539062,2,0.230347,0,1,7,2.910156,0,6,20,13.914062,0,23,38,26.578125,1,52,31,17.765625,1,31,410 days 17:26:26,2017-01-05 15:27:33,2018-02-20 08:53:59,1238,6.480469,12


In [None]:
# Difference in row counts between the historical and the new-merchant
# aggregate frames (325540 vs 290001 in the outputs above). Computed from the
# frames themselves rather than hand-copied numbers so it stays correct if the
# data changes.
# NOTE(review): presumably each frame has one row per card_id, making this the
# number of cards without new-merchant transactions — confirm.
len(historical_transactions_df) - len(new_merchant_df)

In [None]:
# List the current working directory to see which data files are available.
!ls

 augmented_test.csv			  new_merchant_df.pkl
 augmented_train.csv			  new_merchant_transactions.csv
'Data Dictionary.xlsx'			  sample_submission.csv
 Data_Dictionary.xlsx			  test.csv
 historical_transactions.csv		  train.csv
 historical_transactions_df.pkl		  transactions_raw_merged.csv
 merchants.csv				  transactions_refined_1_merged_WON.csv
 merged_transactions_with_merchants.csv


In [None]:
# Load the raw training set. The path is relative, so this relies on the
# working directory set earlier in the notebook.
# NOTE: `train_df` is reassigned further down to the augmented training frame.
train_df = pd.read_csv("train.csv")

In [None]:
# Show the card_id column of the raw training data; 'card_id' is the key the
# merges below join on.
train_df.card_id

0         C_ID_92a2005557
1         C_ID_3d0044924f
2         C_ID_d639edf6cd
3         C_ID_186d6a6901
4         C_ID_cdbd2c0db2
               ...       
201912    C_ID_963962de2c
201913    C_ID_1314773c0b
201914    C_ID_7666735b3d
201915    C_ID_73f5a0efd0
201916    C_ID_92c9984c58
Name: card_id, Length: 201917, dtype: object

In [None]:
# Build and save the augmented train/test feature tables:
#   1. train_test(None) (defined earlier in the notebook) supplies the combined
#      card-level frame; rows with a null 'target' are treated as test rows by
#      the split below.
#   2. Join the historical-transaction aggregates, then the new-merchant
#      aggregates, onto it by 'card_id'.
#   3. Split back into train / test on whether 'target' is present and write
#      both to CSV in the current working directory.
with timer("Train test"):
  df = train_test(None).reset_index()
  print(df)
with timer("merge hist and t t "):
  # Left join: keep exactly the train/test rows, in their existing order. An
  # outer join could add rows for cards that are not in train/test (they would
  # have a null target and end up in test_df) and, in newer pandas versions,
  # re-sorts the rows by card_id.
  df = pd.merge(df, historical_transactions_df, on='card_id', how='left')
  print(df)
with timer("merge new mer and df "):
  # Both aggregate frames share feature names, so pandas' default suffixes
  # apply here: '_x' marks the historical features already in df, '_y' the
  # new-merchant ones. Cards without new-merchant rows get NaN in the '_y'
  # columns.
  df = pd.merge(df, new_merchant_df, on='card_id', how='left')
  print(df)
with timer("split train & test"):
  has_target = df['target'].notnull()
  train_df = df[has_target]
  # drop() returns a new frame instead of deleting a column from a filtered
  # slice of df.
  test_df = df[~has_target].drop(columns=['target'])
  del df
  gc.collect()
with timer("Save train and test files"):
  train_df.to_csv('augmented_train.csv', index=False)
  test_df.to_csv('augmented_test.csv', index=False)

Train samples: 201917, test samples: 123623
The columns on which one hot encoding is performed is  []
                card_id first_active_month  ...  feature_min  feature_std
0       C_ID_92a2005557         2017-06-01  ...          1.0          0.0
1       C_ID_3d0044924f         2017-01-01  ...          1.0          0.0
2       C_ID_d639edf6cd         2016-08-01  ...          1.0          0.0
3       C_ID_186d6a6901         2017-09-01  ...          1.0          0.0
4       C_ID_cdbd2c0db2         2017-11-01  ...          1.0          0.0
...                 ...                ...  ...          ...          ...
325535  C_ID_7a239d2eda         2017-10-01  ...          1.0          0.0
325536  C_ID_75ace375ae         2017-09-01  ...          1.0          0.0
325537  C_ID_21d56d950c         2016-09-01  ...          1.0          0.0
325538  C_ID_6c46fc5a9d         2017-06-01  ...          1.0          0.0
325539  C_ID_87e7979a5f         2016-10-01  ...          1.0          0.0

[325540 r

## Load the dataset 

In [None]:
# Load the augmented training set from Google Drive.
# NOTE(review): presumably the file written by the merge cell above as
# 'augmented_train.csv' — confirm the working directory matches this
# absolute Colab path.
train_df = pd.read_csv('/content/drive/MyDrive/data/augmented_train.csv')

In [None]:
# Display the augmented training frame ('_x' / '_y' suffixed feature columns
# come from the two merged aggregate tables).
train_df

Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,target,outliers,quarter,elapsed_time,quarter_first_active_month,first_active_month_diff_from_today,feature_sum,feature_mean,feature_max,feature_min,feature_std,authorized_flag_sum_x,authorized_flag_mean_x,authorized_flag_mode_x,category_1_sum_x,category_1_mean_x,category_1_mode_x,category_2_sum_x,category_2_mean_x,category_2_mode_x,category_3_sum_x,category_3_mean_x,category_3_mode_x,category_1_merchants_t_sum_x,category_1_merchants_t_mean_x,category_1_merchants_t_mode_x,most_recent_sales_range_merchants_t_sum_x,most_recent_sales_range_merchants_t_mean_x,most_recent_sales_range_merchants_t_mode_x,category_4_merchants_t_sum_x,category_4_merchants_t_mean_x,category_4_merchants_t_mode_x,most_recent_purchases_range_merchants_t_sum_x,most_recent_purchases_range_merchants_t_mean_x,most_recent_purchases_range_merchants_t_mode_x,...,numerical_1_merchants_t_var_y,numerical_1_merchants_t_skew_y,active_months_lag6_merchants_t_sum_y,active_months_lag6_merchants_t_mean_y,active_months_lag6_merchants_t_min_y,active_months_lag6_merchants_t_max_y,active_months_lag6_merchants_t_var_y,active_months_lag6_merchants_t_skew_y,active_months_lag12_merchants_t_sum_y,active_months_lag12_merchants_t_mean_y,active_months_lag12_merchants_t_min_y,active_months_lag12_merchants_t_max_y,active_months_lag12_merchants_t_var_y,active_months_lag12_merchants_t_skew_y,weekend_nunique_y,weekend_mean_y,weekend_min_y,weekend_max_y,weekday_nunique_y,weekday_mean_y,weekday_min_y,weekday_max_y,hour_nunique_y,hour_mean_y,hour_min_y,hour_max_y,weekofyear_nunique_y,weekofyear_mean_y,weekofyear_min_y,weekofyear_max_y,day_nunique_y,day_mean_y,day_min_y,day_max_y,purchase_date_ptp_y,purchase_date_min_y,purchase_date_max_y,month_sum_y,month_mean_y,month_nunique_y
0,C_ID_92a2005557,2017-06-01 00:00:00,1.0,1.0,1.0,-0.820283,1.0,2.0,1303.0,2.0,1303.0,3.0,1.0,1.0,1.0,0.0,257,0.9480,1,0,0.0000,0,283,1.0440,1,4,0.01476,0,235,0.8670,1,451,1.6640,0,12,0.04428,0,390,1.4390,0,...,3156.000000,1.922,137.0,5.957,5.0,6.0,0.04350,-4.797,267.0,11.610,5.0,12.0,2.248,-4.305,2.0,0.2610,0.0,1.0,7.0,3.130,0.0,6.0,8.0,12.870,8.0,16.0,7.0,13.305,10.0,17.0,17.0,16.440,5.0,31.0,54 days 21:18:29,2018-03-05 14:04:36,2018-04-29 11:23:05,80.0,3.479,2.0
1,C_ID_3d0044924f,2017-01-01 00:00:00,1.0,1.0,1.0,0.392913,1.0,1.0,1454.0,1.0,1454.0,3.0,1.0,1.0,1.0,0.0,354,0.9697,1,35,0.0959,0,540,1.4795,1,448,1.22800,1,303,0.8300,1,392,1.0740,0,29,0.07947,0,389,1.0650,0,...,1.947000,2.445,36.0,6.000,6.0,6.0,0.00000,0.000,72.0,12.000,12.0,12.0,0.000,0.000,1.0,0.0000,0.0,0.0,4.0,1.500,0.0,4.0,5.0,11.164,6.0,17.0,4.0,9.000,5.0,13.0,4.0,13.500,1.0,30.0,56 days 13:40:32,2018-02-01 17:07:54,2018-03-30 06:48:26,15.0,2.500,2.0
2,C_ID_d639edf6cd,2016-08-01 00:00:00,1.0,1.0,1.0,0.688056,1.0,3.0,1607.0,3.0,1607.0,3.0,1.0,1.0,1.0,0.0,42,0.9546,1,0,0.0000,0,200,4.5470,5,0,0.00000,0,42,0.9546,1,16,0.3635,0,3,0.06820,0,16,0.3635,0,...,,,6.0,6.000,6.0,6.0,,,10.0,10.000,10.0,10.0,,,1.0,1.0000,1.0,1.0,1.0,5.000,5.0,5.0,1.0,17.000,17.0,17.0,1.0,17.000,17.0,17.0,1.0,28.000,28.0,28.0,0 days 00:00:00,2018-04-28 17:43:11,2018-04-28 17:43:11,4.0,4.000,1.0
3,C_ID_186d6a6901,2017-09-01 00:00:00,1.0,1.0,1.0,0.142495,1.0,3.0,1211.0,3.0,1211.0,3.0,1.0,1.0,1.0,0.0,89,1.0000,1,13,0.1461,0,322,3.6170,4,101,1.13500,1,59,0.6630,1,143,1.6060,0,2,0.02248,0,142,1.5960,0,...,0.112500,1.933,42.0,6.000,6.0,6.0,0.00000,0.000,84.0,12.000,12.0,12.0,0.000,0.000,2.0,0.4285,0.0,1.0,4.0,3.285,1.0,6.0,5.0,13.000,7.0,21.0,5.0,13.860,10.0,16.0,7.0,13.140,4.0,24.0,41 days 23:05:05,2018-03-07 11:55:06,2018-04-18 11:00:11,26.0,3.715,2.0
4,C_ID_cdbd2c0db2,2017-11-01 00:00:00,1.0,1.0,1.0,-0.159749,1.0,4.0,1150.0,4.0,1150.0,3.0,1.0,1.0,1.0,0.0,138,0.9650,1,16,0.1119,0,554,3.8750,4,150,1.04900,1,107,0.7480,1,275,1.9230,3,7,0.04895,0,278,1.9440,3,...,21.580000,4.210,215.0,5.973,5.0,6.0,0.02777,-6.000,425.0,11.805,5.0,12.0,1.361,-6.000,2.0,0.3333,0.0,1.0,7.0,3.277,0.0,6.0,14.0,14.720,5.0,23.0,8.0,13.360,9.0,17.0,22.0,14.586,1.0,31.0,57 days 06:54:42,2018-03-02 11:55:43,2018-04-28 18:50:25,128.0,3.555,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2017-09-01 00:00:00,1.0,1.0,1.0,-2.740821,1.0,3.0,1211.0,3.0,1211.0,3.0,1.0,1.0,1.0,0.0,53,0.8984,1,0,0.0000,0,123,2.0840,3,2,0.03390,0,32,0.5425,1,122,2.0680,2,17,0.28800,0,117,1.9830,2,...,,,6.0,6.000,6.0,6.0,,,12.0,12.000,12.0,12.0,,,1.0,1.0000,1.0,1.0,1.0,6.000,6.0,6.0,1.0,5.000,5.0,5.0,1.0,10.000,10.0,10.0,1.0,11.000,11.0,11.0,0 days 00:00:00,2018-03-11 05:42:27,2018-03-11 05:42:27,3.0,3.000,1.0
201913,C_ID_1314773c0b,2015-10-01 00:00:00,1.0,1.0,1.0,0.312917,1.0,4.0,1912.0,4.0,1912.0,3.0,1.0,1.0,1.0,0.0,40,0.8510,1,0,0.0000,0,47,1.0000,1,1,0.02127,0,46,0.9785,1,99,2.1050,1,46,0.97850,1,91,1.9370,1,...,,,6.0,6.000,6.0,6.0,,,12.0,12.000,12.0,12.0,,,1.0,0.0000,0.0,0.0,1.0,1.000,1.0,1.0,1.0,13.000,13.0,13.0,1.0,12.000,12.0,12.0,1.0,20.000,20.0,20.0,0 days 00:00:00,2018-03-20 13:17:07,2018-03-20 13:17:07,3.0,3.000,1.0
201914,C_ID_7666735b3d,2017-08-01 00:00:00,1.0,1.0,1.0,0.093494,1.0,3.0,1242.0,3.0,1242.0,3.0,1.0,1.0,1.0,0.0,83,0.9326,1,8,0.0899,0,301,3.3830,3,101,1.13500,1,65,0.7305,1,145,1.6290,1,23,0.25850,0,150,1.6860,2,...,0.000525,1.732,18.0,6.000,6.0,6.0,0.00000,0.000,36.0,12.000,12.0,12.0,0.000,0.000,1.0,0.0000,0.0,0.0,3.0,1.667,0.0,3.0,3.0,14.000,10.0,21.0,2.0,12.664,12.0,13.0,3.0,25.330,22.0,28.0,5 days 14:20:58,2018-03-22 21:23:21,2018-03-28 11:44:19,9.0,3.000,1.0
201915,C_ID_73f5a0efd0,2016-07-01 00:00:00,1.0,1.0,1.0,-4.676589,1.0,3.0,1638.0,3.0,1638.0,3.0,1.0,1.0,1.0,0.0,25,0.8930,1,0,0.0000,0,28,1.0000,1,0,0.00000,0,24,0.8570,1,27,0.9644,0,1,0.03570,0,26,0.9287,0,...,324.000000,2.230,30.0,6.000,6.0,6.0,0.00000,0.000,60.0,12.000,12.0,12.0,0.000,0.000,1.0,0.0000,0.0,0.0,3.0,2.000,1.0,4.0,4.0,14.200,11.0,20.0,5.0,48.200,45.0,52.0,5.0,12.400,1.0,27.0,49 days 18:59:03,2017-11-07 16:20:46,2017-12-27 11:19:49,58.0,11.600,2.0


In [None]:
# Load the augmented test set from Google Drive.
# NOTE(review): presumably the file written by the merge cell above as
# 'augmented_test.csv' — confirm the working directory matches this
# absolute Colab path.
test_df = pd.read_csv('/content/drive/MyDrive/data/augmented_test.csv')

In [None]:
# Display the augmented test frame (same feature columns as train, without 'target').
test_df

Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,outliers,quarter,elapsed_time,quarter_first_active_month,first_active_month_diff_from_today,feature_sum,feature_mean,feature_max,feature_min,feature_std,authorized_flag_sum_x,authorized_flag_mean_x,authorized_flag_mode_x,category_1_sum_x,category_1_mean_x,category_1_mode_x,category_2_sum_x,category_2_mean_x,category_2_mode_x,category_3_sum_x,category_3_mean_x,category_3_mode_x,category_1_merchants_t_sum_x,category_1_merchants_t_mean_x,category_1_merchants_t_mode_x,most_recent_sales_range_merchants_t_sum_x,most_recent_sales_range_merchants_t_mean_x,most_recent_sales_range_merchants_t_mode_x,category_4_merchants_t_sum_x,category_4_merchants_t_mean_x,category_4_merchants_t_mode_x,most_recent_purchases_range_merchants_t_sum_x,most_recent_purchases_range_merchants_t_mean_x,most_recent_purchases_range_merchants_t_mode_x,category_2_merchants_t_sum_x,...,numerical_1_merchants_t_var_y,numerical_1_merchants_t_skew_y,active_months_lag6_merchants_t_sum_y,active_months_lag6_merchants_t_mean_y,active_months_lag6_merchants_t_min_y,active_months_lag6_merchants_t_max_y,active_months_lag6_merchants_t_var_y,active_months_lag6_merchants_t_skew_y,active_months_lag12_merchants_t_sum_y,active_months_lag12_merchants_t_mean_y,active_months_lag12_merchants_t_min_y,active_months_lag12_merchants_t_max_y,active_months_lag12_merchants_t_var_y,active_months_lag12_merchants_t_skew_y,weekend_nunique_y,weekend_mean_y,weekend_min_y,weekend_max_y,weekday_nunique_y,weekday_mean_y,weekday_min_y,weekday_max_y,hour_nunique_y,hour_mean_y,hour_min_y,hour_max_y,weekofyear_nunique_y,weekofyear_mean_y,weekofyear_min_y,weekofyear_max_y,day_nunique_y,day_mean_y,day_min_y,day_max_y,purchase_date_ptp_y,purchase_date_min_y,purchase_date_max_y,month_sum_y,month_mean_y,month_nunique_y
0,C_ID_0ab67a22ab,2017-04-01 00:00:00,1.0,1.0,1.0,,2.0,1364.0,2.0,1364.0,3.0,1.0,1.0,1.0,0.0,47,0.6620,1,23,0.32400,0,186,2.620,1,93,1.310,1,42,0.5913,1,116,1.6340,0,36,0.50700,1,118,1.662,0,216,...,0.000131,1.732,18.0,6.0,6.0,6.0,0.0,0.0,36.0,12.000,12.0,12.0,0.000,0.000,2.0,0.3333,0.0,1.0,3.0,3.666,2.0,5.0,3.0,13.664,9.0,19.0,3.0,7.00,5.0,9.0,3.0,15.664,3.0,28.0,25 days 09:24:23,2018-02-03 09:44:29,2018-02-28 19:08:52,6.0,2.000,1.0
1,C_ID_130fd0cbdd,2017-01-01 00:00:00,1.0,1.0,1.0,,1.0,1454.0,1.0,1454.0,3.0,1.0,1.0,1.0,0.0,77,0.9873,1,2,0.02563,0,315,4.040,4,80,1.025,1,76,0.9740,1,130,1.6670,2,75,0.96140,1,127,1.628,2,316,...,1.010000,3.162,60.0,6.0,6.0,6.0,0.0,0.0,111.0,11.100,7.0,12.0,3.656,-1.846,2.0,0.3000,0.0,1.0,6.0,2.700,0.0,6.0,8.0,15.700,9.0,23.0,6.0,12.30,9.0,16.0,7.0,11.400,3.0,20.0,48 days 05:41:29,2018-03-03 12:18:48,2018-04-20 18:00:17,34.0,3.400,2.0
2,C_ID_b709037bc5,2017-08-01 00:00:00,1.0,1.0,1.0,,3.0,1242.0,3.0,1242.0,3.0,1.0,1.0,1.0,0.0,9,0.6924,1,1,0.07690,0,62,4.770,5,28,2.154,2,12,0.9230,1,25,1.9230,2,2,0.15380,0,35,2.691,3,66,...,0.053530,,12.0,6.0,6.0,6.0,0.0,,24.0,12.000,12.0,12.0,0.000,,1.0,0.0000,0.0,0.0,2.0,2.000,1.0,3.0,2.0,13.500,13.0,14.0,2.0,10.00,9.0,11.0,2.0,7.000,1.0,13.0,11 days 22:19:13,2018-03-01 14:51:33,2018-03-13 13:10:46,6.0,3.000,1.0
3,C_ID_d27d835a9f,2017-12-01 00:00:00,1.0,1.0,1.0,,4.0,1120.0,4.0,1120.0,3.0,1.0,1.0,1.0,0.0,26,1.0000,1,0,0.00000,0,26,1.000,1,37,1.423,1,26,1.0000,1,35,1.3460,2,0,0.00000,0,36,1.385,0,26,...,0.458500,2.053,60.0,6.0,6.0,6.0,0.0,0.0,120.0,12.000,12.0,12.0,0.000,0.000,2.0,0.3000,0.0,1.0,5.0,3.200,1.0,6.0,7.0,18.200,11.0,21.0,7.0,12.10,9.0,16.0,8.0,13.600,4.0,31.0,44 days 10:15:54,2018-03-04 11:06:29,2018-04-17 21:22:23,33.0,3.300,2.0
4,C_ID_2b5e3df5c2,2015-12-01 00:00:00,1.0,1.0,1.0,,4.0,1851.0,4.0,1851.0,3.0,1.0,1.0,1.0,0.0,90,0.7964,1,0,0.00000,0,426,3.770,4,120,1.062,1,107,0.9470,1,252,2.2300,3,7,0.06195,0,257,2.273,3,461,...,1.551000,2.180,36.0,6.0,6.0,6.0,0.0,0.0,67.0,11.164,8.0,12.0,2.566,-2.148,2.0,0.3333,0.0,1.0,4.0,4.000,2.0,6.0,5.0,8.000,0.0,16.0,3.0,10.50,9.0,15.0,5.0,7.332,4.0,12.0,39 days 09:11:34,2018-03-04 03:01:37,2018-04-12 12:13:11,19.0,3.166,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123618,C_ID_7a239d2eda,2017-10-01 00:00:00,1.0,1.0,1.0,,4.0,1181.0,4.0,1181.0,3.0,1.0,1.0,1.0,0.0,77,0.9624,1,0,0.00000,0,450,5.625,6,0,0.000,0,68,0.8500,1,145,1.8125,1,68,0.85000,1,143,1.787,1,480,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
123619,C_ID_75ace375ae,2017-09-01 00:00:00,1.0,1.0,1.0,,3.0,1211.0,3.0,1211.0,3.0,1.0,1.0,1.0,0.0,9,1.0000,1,0,0.00000,0,13,1.444,1,0,0.000,0,4,0.4443,0,17,1.8890,0,4,0.44430,0,16,1.777,2,34,...,0.363500,1.911,24.0,6.0,6.0,6.0,0.0,0.0,48.0,12.000,12.0,12.0,0.000,0.000,2.0,0.7500,0.0,1.0,3.0,4.500,2.0,6.0,2.0,15.750,9.0,18.0,2.0,10.75,9.0,16.0,3.0,7.000,3.0,18.0,45 days 15:54:17,2018-03-03 18:04:31,2018-04-18 09:58:48,13.0,3.250,2.0
123620,C_ID_21d56d950c,2016-09-01 00:00:00,1.0,1.0,1.0,,3.0,1576.0,3.0,1576.0,3.0,1.0,1.0,1.0,0.0,36,0.9730,1,3,0.08105,0,184,4.973,5,50,1.352,1,32,0.8647,1,61,1.6480,0,3,0.08105,0,62,1.676,0,190,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
123621,C_ID_6c46fc5a9d,2017-06-01 00:00:00,1.0,1.0,1.0,,2.0,1303.0,2.0,1303.0,3.0,1.0,1.0,1.0,0.0,46,0.7075,1,0,0.00000,0,181,2.785,3,99,1.523,2,59,0.9077,1,117,1.8000,1,54,0.83060,1,107,1.646,1,213,...,28.940000,1.958,36.0,6.0,6.0,6.0,0.0,0.0,72.0,12.000,12.0,12.0,0.000,0.000,2.0,0.3333,0.0,1.0,5.0,3.166,0.0,6.0,4.0,8.000,0.0,18.0,3.0,45.16,44.0,47.0,5.0,10.336,1.0,26.0,25 days 00:00:00,2017-11-01 00:00:00,2017-11-26 00:00:00,66.0,11.000,1.0


In [None]:
# Load the raw test set for comparison with the augmented one. The path is
# relative, so this relies on the working directory set earlier in the notebook.
test = pd.read_csv("test.csv")

In [None]:
# Display the raw test frame.
test

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1
...,...,...,...,...,...
123618,2017-10,C_ID_7a239d2eda,2,2,0
123619,2017-09,C_ID_75ace375ae,3,1,1
123620,2016-09,C_ID_21d56d950c,5,1,1
123621,2017-06,C_ID_6c46fc5a9d,2,1,0


## Start Exploring 

In [None]:
# Count the missing values in each column of the augmented training frame.
train_df.isna().sum()

card_id                    0
first_active_month         0
feature_1                  0
feature_2                  0
feature_3                  0
                       ...  
purchase_date_min_y    21931
purchase_date_max_y    21931
month_sum_y            21931
month_mean_y           21931
month_nunique_y        21931
Length: 330, dtype: int64

### Checking for percentage of null values 

In [None]:
# Names of all columns that contain at least one missing value, in the
# frame's column order.
columns_with_na = train_df.columns[train_df.isnull().any()].tolist()

In [None]:
# Number of columns that contain at least one missing value.
len(columns_with_na)

168

In [None]:
# Share of missing observations per column, for the columns that have any.
# Note: despite its name, 'percentage_na' holds a fraction between 0 and 1
# (the mean of the null mask), not a value out of 100.
data_na = (
    train_df[columns_with_na]
    .isnull()
    .mean()                                   # fraction of nulls per column
    .rename_axis('col')                       # column names become the 'col' field
    .reset_index(name='percentage_na')        # Series -> two-column DataFrame
    .sort_values(by='percentage_na', ascending=False)  # most incomplete first
)

# Show the resulting table.
data_na

Unnamed: 0,col,percentage_na
87,avg_sales_lag3_merchants_t_skew_y,0.344117
117,avg_sales_lag12_merchants_t_skew_y,0.344117
105,avg_sales_lag6_merchants_t_skew_y,0.344117
123,avg_purchases_lag12_merchants_t_skew_y,0.344052
141,active_months_lag12_merchants_t_skew_y,0.344052
...,...,...
9,active_months_lag6_merchants_t_skew_x,0.000168
7,avg_purchases_lag12_merchants_t_skew_x,0.000168
8,numerical_1_merchants_t_skew_x,0.000168
10,active_months_lag12_merchants_t_skew_x,0.000168


In [None]:
# Columns where more than 10% of the rows are missing ('percentage_na' is a
# fraction, so the cut-off is 0.1).
print(data_na.loc[data_na['percentage_na'] > 0.1])

                                         col  percentage_na
87         avg_sales_lag3_merchants_t_skew_y       0.344117
117       avg_sales_lag12_merchants_t_skew_y       0.344117
105        avg_sales_lag6_merchants_t_skew_y       0.344117
123   avg_purchases_lag12_merchants_t_skew_y       0.344052
141   active_months_lag12_merchants_t_skew_y       0.344052
..                                       ...            ...
54        subsector_id_merchants_t_nunique_y       0.108614
55           subsector_id_merchants_t_mode_y       0.108614
56             city_id_merchants_t_nunique_y       0.108614
57                city_id_merchants_t_mode_y       0.108614
53   merchant_category_id_merchants_t_mode_y       0.108614

[157 rows x 2 columns]
