<a href="https://colab.research.google.com/github/human-ai2025/Elo-Merchant-Recommendation/blob/master/Preprocessing%20and%20Feature%20Engineering/mean_median_based_imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [None]:
#Import Libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import gc
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import datetime
import time
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import mode

### Mount the drive 

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The data-loading functions below read their CSV files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Setup the current path 

In [None]:
!ls

drive  sample_data


In [None]:
#Setting up workspace directory
%cd /content/drive/MyDrive/data 

/content/drive/MyDrive/data


In [None]:
!ls

 augmented_test.csv		  LR_cat_2.sav
 augmented_test_try1.csv	  mer_authorized_flag_enc.npy
 augmented_train.csv		  mer_category_1_enc.npy
 augmented_train_try1.csv	  merchant_category_1_enc.npy
 clf_cat2.sav			  merchant_category_4_enc.npy
 clf_cat3.sav			  merchant_most_recent_purchases_range_enc.npy
 clf_name.sav			  merchant_most_recent_sales_range_enc.npy
'Data Dictionary.xlsx'		  merchants.csv
 Data_Dictionary.xlsx		  merged_transactions_with_merchants.csv
 file_name.csv			  new_merchant_authorized_flag_enc.npy
 hist_authorized_flag_enc.npy	  new_merchant_category_1_enc.npy
 hist_category_1_enc.npy	  new_merchant_df.pkl
 hist_category_3_enc.npy	  new_merchant_transactions.csv
 historical_transactions.csv	  sample_submission.csv
 historical_transactions_df.pkl   test.csv
 knn_sal_12.sav			  train.csv
 knn_sal_3.sav			  transactions_raw_merged.csv
 knn_sal_6.sav			  transactions_refined_1_merged_WON.csv


## Timer 

In [None]:
#refer:-https://www.youtube.com/watch?v=vOMtQ4ocMGI
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    ARGS:
        title: label printed in front of the elapsed time.

    YIELDS:
        None. When the ``with`` block is left, a line of the form
        ``"<title> - done in <seconds>s"`` is printed, with the elapsed
        wall-clock time rounded to whole seconds.
    """
    t0 = time.time()
    try:
        yield
    finally:
        # Report the duration even when the timed block raises, so a failed
        # step still shows how long it ran before the error propagates.
        print("{} - done in {:.0f}s".format(title, time.time() - t0))

## Memory Reduction 

In [None]:
#https://www.kaggle.com/fabiendaniel/elo-world
#Function to load data into pandas and reduce memory usage

def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns to the smallest dtype that can hold their value range.

    The frame is modified in place and also returned.

    ARGS:
        df: DataFrame whose int16/int32/int64/float16/float32/float64
            columns should be downcast. Other dtypes are left untouched.
        verbose: when True, print the final memory usage and the reduction.
        use_float16: when True (default, the historical behaviour) float
            columns may be stored as float16. float16 keeps only about three
            significant decimal digits, so pass False to stop at float32 when
            the values feed later arithmetic.

    RETURNS:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        # Only numeric columns benefit from downcasting.
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a type's limit still fits.
            # With strict bounds an int16 column containing -32768 skipped the
            # int16 branch and was widened to int32 instead.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column yields NaN bounds, matches nothing and is left as is.
        else:
            # Fall back to float64 when no smaller float type covers the range
            # (this is also the path for columns whose min/max is NaN or inf).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## FE on Train Test

In [None]:
def train_test(num_rows=None):

    """
    FUNCTION:
        Load train.csv and test.csv, stack them into one frame indexed by
        card_id and add basic features:
          - an `outliers` flag on the training rows,
          - quarter / days-since features from `first_active_month`,
          - feature_1..3 replaced by the outlier rate of their category,
          - row-wise sum / mean / max / min / std of those three features.

    ARGS:
        num_rows: number of rows to read from each file (debugging).
                  Default None reads everything.

    RETURNS:
        The stacked train + test DataFrame. Test rows have target = NaN and
        outliers = NaN.
    """

    # load csv
    train_df = pd.read_csv('/content/drive/MyDrive/data/train.csv', index_col=['card_id'], nrows=num_rows)
    test_df = pd.read_csv('/content/drive/MyDrive/data/test.csv', index_col=['card_id'], nrows=num_rows)

    print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))

    # Outliers are the rows whose target lies far BELOW zero. The previous
    # test `target < 30` flagged practically every row, which made the
    # outlier-rate encoding of feature_1..3 below a constant.
    OUTLIER_THRESHOLD = 30
    train_df['outliers'] = np.where(train_df['target'] < -OUTLIER_THRESHOLD, 1, 0)

    # The test set has no target; mark it with NaN so it can be split off later.
    test_df['target'] = np.nan

    # Stack train on top of test (DataFrame.append was removed in pandas 2.0).
    df = pd.concat([train_df, test_df])

    del train_df, test_df
    gc.collect()

    # to datetime
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])

    # Read "today" once so both day-difference columns use the same reference.
    # NOTE: these features depend on the day the notebook is run.
    today = datetime.datetime.today()
    days_since_first_active = (today - df['first_active_month']).dt.days

    # datetime features (two pairs hold identical values; both names are
    # kept so the produced column set stays the same)
    df['quarter'] = df['first_active_month'].dt.quarter
    df['elapsed_time'] = days_since_first_active
    df['quarter_first_active_month'] = df['quarter']
    df['first_active_month_diff_from_today'] = days_since_first_active

    # Replace each category of feature_1..3 by the mean outlier flag of the
    # training rows in that category (test rows carry NaN and are ignored by mean()).
    feature_cols = ['feature_1', 'feature_2', 'feature_3']
    for col in feature_cols:
        order_label = df.groupby(col)['outliers'].mean()
        df[col] = df[col].map(order_label)

    # Some basic statistics transformations over the feature_i columns
    df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
    df['feature_mean'] = df['feature_sum'] / 3
    df['feature_max'] = df[feature_cols].max(axis=1)
    df['feature_min'] = df[feature_cols].min(axis=1)
    df['feature_std'] = df[feature_cols].std(axis=1)

    return df

## Imputations on transactions data 

In [None]:
def transactions_imputations(df_name='hist',nrows = None):

    """
    FUNCTION:
          Load one of the two transaction tables and clean it:
            - fill missing category_2 / category_3 / merchant_id,
            - treat installments of -1 and 999 as missing and fill with the mode,
            - map the categorical columns to integers,
            - cap purchase_amount at 0.8 and derive `price`.

    ARGS:
          df_name: 'hist' loads historical_transactions.csv (default);
                   any other value loads new_merchant_transactions.csv.
          nrows: number of rows to read (debugging). Default None reads all.

    RETURNS:
          The cleaned DataFrame after reduce_mem_usage().
    """

    if df_name == 'hist':
        df = pd.read_csv('/content/drive/MyDrive/data/historical_transactions.csv', nrows=nrows)
    else:
        df = pd.read_csv('/content/drive/MyDrive/data/new_merchant_transactions.csv', nrows=nrows)

    # Results are assigned back to the column instead of calling
    # `df[col].fillna(..., inplace=True)`: the chained in-place form acts on a
    # temporary object and does not reliably update the frame.
    df['merchant_id'] = df['merchant_id'].fillna('M_ID_00a6ca8a8a')

    # -1 and 999 are treated as "unknown" installments and replaced by the
    # most frequent value.
    installments = df['installments'].replace([-1, 999], np.nan)
    most_common = installments.mode()
    if len(most_common) > 0:  # mode() is empty when every value is missing
        installments = installments.fillna(most_common.iloc[0])
    df['installments'] = installments

    # mapping categorical to numerical
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0}).astype(int)

    # category_3: A/B/C -> 0/1/2, missing -> 3. Only genuinely missing entries
    # get the extra code; an unexpected label still becomes NaN and makes
    # astype(int) fail, as before.
    raw_category_3 = df['category_3']
    df['category_3'] = (
        raw_category_3.map({'A': 0, 'B': 1, 'C': 2})
        .where(raw_category_3.notna(), 3)
        .astype(int)
    )

    # category_2: 1..5 stay as they are, missing -> 6.
    df['category_2'] = (
        df['category_2'].fillna(6)
        .map({1.0 : 1, 2.0 : 2, 3.0 : 3,4.0 : 4, 5.0 : 5, 6 : 6})
        .astype(int)
    )

    # Cap purchase_amount at 0.8 (vectorised equivalent of min(x, 0.8) per row).
    df['purchase_amount'] = df['purchase_amount'].clip(upper=0.8)
    # The small epsilon avoids division by zero for 0 installments; note that
    # it makes `price` roughly 1000 x purchase_amount for those rows.
    df['price'] = df['purchase_amount'] / (df['installments'] + 0.001)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df


## Imputations on Merchant Data

In [None]:
def merchant_imputations(num_rows=None):
    """
    FUNCTION:
          Load merchants.csv and clean it:
            - keep one row per merchant_id (the last occurrence),
            - map the categorical columns to integers, with an extra code
              for missing values,
            - fill missing id columns with -1111,
            - replace +inf in the avg_purchases_lag* / avg_sales_lag* columns
              by the column's largest finite value,
            - fill missing avg_sales_lag* values with the column mean,
            - suffix every column except merchant_id with "_merchants_t".

    ARGS:
          num_rows: number of rows to read (debugging). Default None reads all.

    RETURNS:
          The cleaned DataFrame after reduce_mem_usage().
    """

    df = pd.read_csv('/content/drive/MyDrive/data/merchants.csv', nrows=num_rows)

    # Keep one row per merchant. The result has to be assigned back:
    # drop_duplicates() returns a new frame, so the bare call left the
    # duplicates in place (and a later left-merge on merchant_id would then
    # duplicate transaction rows).
    df = df.drop_duplicates(subset=['merchant_id'], keep='last').reset_index(drop=True)

    def encode(column, mapping, missing_code):
        """Map labels to ints; only genuinely missing entries get `missing_code`."""
        raw = df[column]
        return raw.map(mapping).where(raw.notna(), missing_code).astype(int)

    yes_no = {'Y': 0, 'N': 1}
    sales_ranges = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

    df['category_1'] = encode('category_1', yes_no, 2)

    # most_recent_sales_range / most_recent_purchases_range hold A..E or null
    df['most_recent_sales_range'] = encode('most_recent_sales_range', sales_ranges, 5)
    df['most_recent_purchases_range'] = encode('most_recent_purchases_range', sales_ranges, 5)

    # category_4 holds Y, N or null
    df['category_4'] = encode('category_4', yes_no, 2)

    # category_2: 1..5 stay as they are, missing -> 6
    df['category_2'] = (
        df['category_2'].fillna(6)
        .map({1.0 : 1, 2.0 : 2, 3.0 : 3,4.0 : 4, 5.0 : 5, 6 : 6})
        .astype(int)
    )

    # missing ids are filled with the sentinel -1111
    for col in ['merchant_group_id', 'state_id', 'merchant_category_id', 'subsector_id', 'city_id']:
        df[col] = df[col].fillna(-1111)

    # Replace +inf by the largest finite value of the column. Series.max()
    # skips NaN, whereas the built-in max() over a Series gives an
    # order-dependent result as soon as a NaN is present.
    features_inf = ["avg_purchases_lag3","avg_purchases_lag6","avg_purchases_lag12"]
    for col in features_inf:
        is_inf = df[col] == np.inf
        df.loc[is_inf, col] = df.loc[~is_inf, col].max()

    # Same inf handling, then fill the remaining missing values with the mean.
    features_missing = ['avg_sales_lag3','avg_sales_lag6','avg_sales_lag12']
    for col in features_missing:
        is_inf = df[col] == np.inf
        df.loc[is_inf, col] = df.loc[~is_inf, col].max()
        df[col] = df[col].fillna(df[col].mean())

    # Suffix the merchant columns so they do not collide with the transaction
    # columns of the same name after a merge on merchant_id.
    df.columns = [col+"_merchants_t" if col!="merchant_id" else col for col in df.columns]

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

## Basic FE and Grouping on card id 

In [None]:
def group_on_card_id_withMer1(df):
    """
    FUNCTION:
            Add date/time features to a transaction table that has already
            been merged with the merchant table, then aggregate everything
            to one row per card_id.

    ARGS:
            DF:- transactions joined with the "_merchants_t" merchant columns.
                 The frame is modified in place: the date/time columns are
                 added to it before aggregation.

    RETURNS:
            The aggregated frame with two-level columns (source column,
            statistic) and card_id as a regular column, after
            reduce_mem_usage().

    """

    # Date and time features
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['month'] = df['purchase_date'].dt.month
    df['day'] = df['purchase_date'].dt.day
    df['hour'] = df['purchase_date'].dt.hour
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is its
    # replacement. The cast to int assumes purchase_date has no missing values.
    df['weekofyear'] = df['purchase_date'].dt.isocalendar().week.astype(int)
    df['weekday'] = df['purchase_date'].dt.weekday
    df['weekend'] = (df['purchase_date'].dt.weekday >= 5).astype(int)
    # month_diff is written to the input frame but is not part of the
    # aggregation below.
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

    def mode(series):
        """Most common element in a series (NaN if the series has none)."""
        tmode = series.mode()
        if len(tmode) == 0:
            return np.nan
        return tmode[0]

    def ptp(series):
        """Range (max - min) of a series; keeps the 'ptp' column name of np.ptp."""
        return series.max() - series.min()

    categorical_stats = ['sum', 'mean', mode]
    id_stats = ['nunique', mode]
    transaction_stats = ['sum', 'mean', 'min', 'max', 'std', 'var']
    merchant_stats = ['sum', 'mean', 'min', 'max', 'var', 'skew']
    calendar_stats = ['nunique', 'mean', 'min', 'max']

    # Insertion order fixes the column order of the result, so the groups are
    # added in the same order as before.
    agg_fun = {}

    #CATEGORICAL VALUES
    for col in ['authorized_flag', 'category_1', 'category_2', 'category_3',
                'category_1_merchants_t', 'most_recent_sales_range_merchants_t',
                'category_4_merchants_t', 'most_recent_purchases_range_merchants_t',
                'category_2_merchants_t']:
        agg_fun[col] = categorical_stats

    # ID VALUES
    for col in ['city_id', 'state_id', 'subsector_id', 'merchant_category_id',
                'merchant_id', 'merchant_group_id_merchants_t', 'state_id_merchants_t',
                'merchant_category_id_merchants_t', 'subsector_id_merchants_t',
                'city_id_merchants_t']:
        agg_fun[col] = id_stats

    #NUMERICAL
    for col in ['month_lag', 'installments', 'purchase_amount']:
        agg_fun[col] = transaction_stats
    # ('active_months_lag12_merchants_t' used to be listed twice; a repeated
    # dict key just overwrites itself, so listing it once is equivalent.)
    for col in ['numerical_2_merchants_t', 'avg_sales_lag3_merchants_t',
                'avg_purchases_lag3_merchants_t', 'active_months_lag3_merchants_t',
                'avg_sales_lag6_merchants_t', 'avg_purchases_lag6_merchants_t',
                'avg_sales_lag12_merchants_t', 'avg_purchases_lag12_merchants_t',
                'numerical_1_merchants_t', 'active_months_lag6_merchants_t',
                'active_months_lag12_merchants_t']:
        agg_fun[col] = merchant_stats

    #DATE TIME
    for col in ['weekend', 'weekday', 'hour', 'weekofyear', 'day']:
        agg_fun[col] = calendar_stats
    agg_fun['purchase_date'] = [ptp, 'min', 'max']
    agg_fun['month'] = ['sum', 'mean', 'nunique']

    df = df.groupby("card_id",as_index=False).agg(agg_fun)
    print(df.card_id)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

In [None]:
def group_on_card_id_withOUTMer1(df):
    """
    FUNCTION:
            Add date/time features to a transaction table and aggregate it to
            one row per card_id. Only the transaction columns are used here;
            none of the "_merchants_t" merchant columns are aggregated.

    ARGS:
            DF:- the transaction table. The frame is modified in place: the
                 date/time columns are added to it before aggregation.

    RETURNS:
            The aggregated frame with two-level columns (source column,
            statistic) and card_id as a regular column, after
            reduce_mem_usage().

    """

    # Date and time features
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['month'] = df['purchase_date'].dt.month
    df['day'] = df['purchase_date'].dt.day
    df['hour'] = df['purchase_date'].dt.hour
    # `.dt.weekofyear` was removed in pandas 2.0; isocalendar().week is its
    # replacement. The cast to int assumes purchase_date has no missing values.
    df['weekofyear'] = df['purchase_date'].dt.isocalendar().week.astype(int)
    df['weekday'] = df['purchase_date'].dt.weekday
    df['weekend'] = (df['purchase_date'].dt.weekday >= 5).astype(int)
    # month_diff is written to the input frame but is not part of the
    # aggregation below.
    df['month_diff'] = ((datetime.datetime.today() - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

    def mode(series):
        """Most common element in a series (NaN if the series has none)."""
        tmode = series.mode()
        if len(tmode) == 0:
            return np.nan
        return tmode[0]

    def ptp(series):
        """Range (max - min) of a series; keeps the 'ptp' column name of np.ptp."""
        return series.max() - series.min()

    categorical_stats = ['sum', 'mean', mode]
    id_stats = ['nunique', mode]
    transaction_stats = ['sum', 'mean', 'min', 'max', 'std', 'var']
    calendar_stats = ['nunique', 'mean', 'min', 'max']

    # Insertion order fixes the column order of the result, so the groups are
    # added in the same order as before.
    agg_fun = {}

    #CATEGORICAL VALUES
    for col in ['authorized_flag', 'category_1', 'category_2', 'category_3']:
        agg_fun[col] = categorical_stats

    # ID VALUES
    for col in ['city_id', 'state_id', 'subsector_id', 'merchant_category_id', 'merchant_id']:
        agg_fun[col] = id_stats

    #NUMERICAL
    for col in ['month_lag', 'installments', 'purchase_amount']:
        agg_fun[col] = transaction_stats

    #DATE TIME
    for col in ['weekend', 'weekday', 'hour', 'weekofyear', 'day']:
        agg_fun[col] = calendar_stats
    agg_fun['purchase_date'] = [ptp, 'min', 'max']
    agg_fun['month'] = ['sum', 'mean', 'nunique']

    df = df.groupby("card_id",as_index=False).agg(agg_fun)
    print(df.card_id)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

## Generating the augmented dataset

In [None]:
def imputations_merAndtrans(debug=False):

    """
    FUNCTION:
      Load the three raw tables and apply the imputation functions
      STEPS:-
      1. Load and impute historical transactions
      2. Load and impute new merchant transactions
      3. Load and impute merchants

    ARGS:
     Debug:- when True only the first 10000 rows of each file are read, for
             faster iteration while debugging

    RETURNS:
      The tuple (new_merchant_df, historical_transactions_df, merchant_df)

    """

    num_rows = 10000 if debug else None
    with timer("load and impute transactions ...."):
        # load AND IMPUTE
        print("[INFO] LOADING AND IMPUTING HISTORICAL TRANSACTIONS ....")
        historical_transactions_df = transactions_imputations(df_name='hist',nrows = num_rows)
        # This step loads the new-merchant file; the message used to repeat
        # the historical one.
        print("[INFO] LOADING AND IMPUTING NEW MERCHANT TRANSACTIONS ....")
        new_merchant_df = transactions_imputations(df_name='mer',nrows = num_rows)

        #unique values of column of card id
        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        print("FOR NEW MERCHANT ",new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ",historical_transactions_df.card_id.nunique())
        print("[INFO] Check for Nan")
        # Report missing values for BOTH tables (the historical frame used to
        # be printed twice and the new-merchant frame never).
        print(historical_transactions_df.isna().sum())
        print(new_merchant_df.isna().sum())

    with timer("load and impute merchants ...."):
        #load csv
        print("[INFO] LOADING MERCHANTS AND IMPUTATIONS  ....")
        merchant_df = merchant_imputations(num_rows=num_rows)

        print("[INFO] Check for Nan ...")
        print(merchant_df.isna().sum())

    return new_merchant_df, historical_transactions_df, merchant_df

In [None]:
def basicFeAndGrouping(new_merchant_df, historical_transactions_df, merchant_df):

    """
    FUNCTION:
      Perform merging of data, grouping and basic feature engineering
      STEPS:-
      1. Left-join the merchant table onto both transaction tables
      2. Aggregate both tables to one row per card_id and flatten the
         two-level column names to "<column>_<statistic>"
      3. Build the train + test frame, join both aggregates onto it,
         split it back into train / test and save both as CSV

    ARGS:
      new_merchant_df:- imputed new-merchant transactions
      historical_transactions_df:- imputed historical transactions
      merchant_df:- imputed merchant table (columns suffixed "_merchants_t")

    RETURNS:
      (train_df, test_df), the augmented train and test frames. They are also
      written to augmented_train_try1.csv / augmented_test_try1.csv in the
      current working directory.

    """

    with timer("Merging Data frames ...."):
        #Merge the data frames
        print("[INFO] MERGING THE DATA FRAMES ...")
        historical_transactions_df = pd.merge(historical_transactions_df, merchant_df, on='merchant_id', how = 'left')
        new_merchant_df = pd.merge(new_merchant_df, merchant_df, on='merchant_id',how = 'left')

        #DEL merchant_df
        del merchant_df

        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        #unique values of column of card id
        print("FOR NEW MERCHANT ",new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ",historical_transactions_df.card_id.nunique())

    with timer("group on card id ..."):
        #Group on card id
        print("[INFO] GROUPING ON CARD ID ...")
        new_merchant_df = group_on_card_id_withMer1(new_merchant_df)
        historical_transactions_df = group_on_card_id_withMer1(historical_transactions_df)

        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        #unique values of column of card id
        print("FOR NEW MERCHANT ",new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ",historical_transactions_df.card_id.nunique())

    with timer("Changing column names ...."):
        print("[INFO] Changing column names ....")
        # Iterating the two-level column index yields (column, statistic)
        # tuples directly, so no .ravel() is needed. The card_id key has an
        # empty second level and therefore comes out as "card_id_".
        historical_transactions_df.columns = ["_".join(col) for col in historical_transactions_df.columns]
        historical_transactions_df = historical_transactions_df.rename(columns={'card_id_':'card_id'})
        new_merchant_df.columns = ["_".join(col) for col in new_merchant_df.columns]
        new_merchant_df = new_merchant_df.rename(columns={'card_id_':'card_id'})

    with timer("Train test"):
        df = train_test(None)
        print(df)

    with timer("merge hist and new merchant with  df  "):
        print("[INFO] MERGING ....")
        # Both aggregates carry the same column names, so the second merge
        # adds pandas' default _x (historical) / _y (new merchant) suffixes.
        # NOTE(review): an outer join also keeps cards that occur only in the
        # transaction tables; such rows have a null target and would land in
        # the test split below. Confirm that 'outer' rather than 'left' is
        # intended.
        df = pd.merge(df, historical_transactions_df, on='card_id', how='outer')
        print(df)
        df = pd.merge(df, new_merchant_df, on='card_id', how='outer')
        print(df)

    with timer("split train & test"):
        print("[INFO] Split train and test ...")
        # Rows with a target are training rows; the rest is the test set.
        train_df = df[df['target'].notnull()]
        # drop() returns a new frame instead of deleting a column from a
        # filtered slice of df.
        test_df = df[df['target'].isnull()].drop(columns=['target'])

        del df
        gc.collect()

    with timer("Save train and test files"):
        print("[INFO] SAVING ....")
        train_df.to_csv('augmented_train_try1.csv', index=False)
        test_df.to_csv('augmented_test_try1.csv', index=False)

    return train_df,test_df

In [None]:
def basicFeAndGroupingwithoutMer(new_merchant_df, historical_transactions_df):

    """
    FUNCTION:
      Perform grouping and basic feature engineering without the merchant table
      STEPS:-
      1. Aggregate both transaction tables to one row per card_id and
         flatten the two-level column names to "<column>_<statistic>"
      2. Build the train + test frame and join both aggregates onto it
      3. Split it back into train / test and save both as CSV

    ARGS:
      new_merchant_df:- imputed new-merchant transactions
      historical_transactions_df:- imputed historical transactions

    RETURNS:
      (train_df, test_df), the augmented train and test frames. They are also
      written to augmented_train_try1_withoutMer.csv /
      augmented_test_try1_withoutMer.csv in the current working directory.

    """
    with timer("group on card id ..."):
        #Group on card id
        print("[INFO] GROUPING ON CARD ID ...")
        new_merchant_df = group_on_card_id_withOUTMer1(new_merchant_df)
        historical_transactions_df = group_on_card_id_withOUTMer1(historical_transactions_df)

        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        #unique values of column of card id
        print("FOR NEW MERCHANT ",new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ",historical_transactions_df.card_id.nunique())

    with timer("Changing column names ...."):
        print("[INFO] Changing column names ....")
        # Iterating the two-level column index yields (column, statistic)
        # tuples directly, so no .ravel() is needed. The card_id key has an
        # empty second level and therefore comes out as "card_id_".
        historical_transactions_df.columns = ["_".join(col) for col in historical_transactions_df.columns]
        historical_transactions_df = historical_transactions_df.rename(columns={'card_id_':'card_id'})
        new_merchant_df.columns = ["_".join(col) for col in new_merchant_df.columns]
        new_merchant_df = new_merchant_df.rename(columns={'card_id_':'card_id'})

    with timer("Train test"):
        df = train_test(None)
        print(df)

    with timer("merge hist and new merchant with  df  "):
        print("[INFO] MERGING ....")
        # Both aggregates carry the same column names, so the second merge
        # adds pandas' default _x (historical) / _y (new merchant) suffixes.
        # NOTE(review): an outer join also keeps cards that occur only in the
        # transaction tables; such rows have a null target and would land in
        # the test split below. Confirm that 'outer' rather than 'left' is
        # intended.
        df = pd.merge(df, historical_transactions_df, on='card_id', how='outer')
        print(df)
        df = pd.merge(df, new_merchant_df, on='card_id', how='outer')
        print(df)

    with timer("split train & test"):
        print("[INFO] Split train and test ...")
        # Rows with a target are training rows; the rest is the test set.
        train_df = df[df['target'].notnull()]
        # drop() returns a new frame instead of deleting a column from a
        # filtered slice of df.
        test_df = df[df['target'].isnull()].drop(columns=['target'])

        del df
        gc.collect()

    with timer("Save train and test files"):
        print("[INFO] SAVING ....")
        train_df.to_csv('augmented_train_try1_withoutMer.csv', index=False)
        test_df.to_csv('augmented_test_try1_withoutMer.csv', index=False)

    return train_df,test_df

## Main Method

In [None]:
# Entry point: run the full pipeline on the complete data (debug=False reads every row).
if __name__ == "__main__":
  # Step 1: load the three raw tables and impute their missing values.
  new_merchant_df,historical_transactions_df,merchant_df = imputations_merAndtrans(debug=False)
  # Step 2: merge, aggregate per card_id and write the augmented train/test CSV files.
  train_df,test_df = basicFeAndGrouping(new_merchant_df, historical_transactions_df, merchant_df)


[INFO] LOADING AND IMPUTING HISTORICAL TRANSACTIONS ....
Mem. usage decreased to 1138.31 Mb (65.8% reduction)
[INFO] LOADING AND IMPUTING HISTORICAL TRANSACTIONS ....
Mem. usage decreased to 76.76 Mb (65.8% reduction)
[INFO] THE UNIQUE VALUES OF CARD ID ...
FOR NEW MERCHANT  290001
FOR HISTORICAL TRANSACTIONS  325540
[INFO] Check for Nan
authorized_flag         0
card_id                 0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
merchant_id             0
month_lag               0
purchase_amount         0
purchase_date           0
category_2              0
state_id                0
subsector_id            0
price                   0
dtype: int64
authorized_flag         0
card_id                 0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
merchant_id             0
month_lag               0
purchase_amount        

In [None]:
new_merchant_df

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,price
0,1,C_ID_415bb3a509,107,0,1.0,1,307,M_ID_b0c793002c,1,-0.557617,2018-03-11 14:57:36,1,9,19,-0.557129
1,1,C_ID_415bb3a509,140,0,1.0,1,307,M_ID_88920c89e8,1,-0.569336,2018-03-19 18:53:37,1,9,19,-0.568848
2,1,C_ID_415bb3a509,330,0,1.0,1,507,M_ID_ad5237ef6b,2,-0.551270,2018-04-26 14:08:44,1,9,14,-0.550293
3,1,C_ID_415bb3a509,-1,1,1.0,1,661,M_ID_9e84cda3b1,1,-0.671875,2018-03-07 09:43:21,6,-1,8,-0.671387
4,1,C_ID_ef55cf8d4b,-1,1,1.0,1,166,M_ID_3c86fa3831,1,-0.659668,2018-03-22 21:07:53,6,-1,29,-0.659180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1963026,1,C_ID_1320dee851,142,0,0.0,0,309,M_ID_7754b67f3b,2,-0.701660,2018-04-06 14:36:52,3,19,21,-702.000000
1963027,1,C_ID_f112aa3381,158,0,0.0,0,560,M_ID_da063195b7,2,-0.694336,2018-03-07 13:19:18,1,15,34,-694.500000
1963028,1,C_ID_bd97b86450,69,0,1.0,1,278,M_ID_9a9ccb6544,1,-0.621094,2018-03-05 12:04:56,1,9,37,-0.620605
1963029,1,C_ID_c0513fd84f,130,0,0.0,0,367,M_ID_40c28d596f,1,-0.656738,2018-03-09 14:47:05,3,7,16,-656.500000


In [None]:
historical_transactions_df

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,price
0,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_e020e9b302,-8,-0.703125,2017-06-25 15:33:07,1,16,37,-703.500000
1,1,C_ID_4e6213e9bc,88,0,0.0,0,367,M_ID_86ec983688,-7,-0.732910,2017-07-15 12:10:45,1,16,16,-733.000000
2,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_979ed661fc,-6,-0.720215,2017-08-09 22:04:29,1,16,37,-720.500000
3,1,C_ID_4e6213e9bc,88,0,0.0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1,16,34,-735.500000
4,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_e020e9b302,-11,-0.722656,2017-03-10 01:14:19,1,16,37,-723.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29112356,1,C_ID_2863d2fa95,-1,1,1.0,1,360,M_ID_edd92b6720,-1,-0.632812,2017-01-20 08:52:04,6,-1,34,-0.631836
29112357,1,C_ID_2863d2fa95,-1,1,1.0,1,360,M_ID_edd92b6720,0,-0.632812,2017-02-20 04:40:50,6,-1,34,-0.631836
29112358,1,C_ID_5c240d6e3c,3,0,0.0,0,278,M_ID_9cdcfe8673,0,-0.657715,2017-12-26 18:37:51,1,16,37,-657.500000
29112359,1,C_ID_5c240d6e3c,331,0,0.0,0,514,M_ID_1a75f94f92,-1,0.770508,2017-11-24 14:18:15,1,16,9,770.500000


In [None]:
merchant_df

Unnamed: 0,merchant_id,merchant_group_id_merchants_t,merchant_category_id_merchants_t,subsector_id_merchants_t,numerical_1_merchants_t,numerical_2_merchants_t,category_1_merchants_t,most_recent_sales_range_merchants_t,most_recent_purchases_range_merchants_t,avg_sales_lag3_merchants_t,avg_purchases_lag3_merchants_t,active_months_lag3_merchants_t,avg_sales_lag6_merchants_t,avg_purchases_lag6_merchants_t,active_months_lag6_merchants_t,avg_sales_lag12_merchants_t,avg_purchases_lag12_merchants_t,active_months_lag12_merchants_t,category_4_merchants_t,city_id_merchants_t,state_id_merchants_t,category_2_merchants_t
0,M_ID_838061e48c,8353,792,9,-0.057465,-0.057465,1,4,4,-0.400000,9.664062,3,-2.250000,18.671875,6,-2.320000,13.914062,12,1,242,9,1
1,M_ID_9339d880ad,3184,840,20,-0.057465,-0.057465,1,4,4,-0.720000,1.750000,3,-0.740000,1.291992,6,-0.570000,1.687500,12,1,22,16,1
2,M_ID_e726bbae1e,447,690,1,-0.057465,-0.057465,1,4,4,-82.129997,260.000000,2,-82.129997,260.000000,2,-82.129997,260.000000,2,1,-1,5,5
3,M_ID_a70e9c5f81,5026,792,9,-0.057465,-0.057465,0,4,4,13.832993,1.666992,3,21.650787,4.667969,6,25.227709,3.833984,12,0,-1,-1,6
4,M_ID_64456c37ce,2228,222,21,-0.057465,-0.057465,0,4,4,13.832993,0.500000,3,21.650787,0.361084,6,25.227709,0.347168,12,0,-1,-1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334691,M_ID_1f4773aa76,1145,705,33,3.173828,-0.047546,1,0,0,1.000000,1.022461,3,0.990000,1.019531,6,1.000000,1.024414,12,0,69,9,1
334692,M_ID_725a60d404,35,544,29,-0.057465,-0.057465,0,0,0,0.890000,0.927734,3,0.780000,0.813477,6,0.590000,0.606934,12,0,-1,-1,6
334693,M_ID_f2045dd267,35,561,7,-0.057465,-0.057465,1,0,0,0.960000,0.982910,3,0.900000,0.924805,6,0.740000,0.750977,8,0,160,21,5
334694,M_ID_9139332ccc,35,511,7,-0.057465,-0.057465,0,0,0,0.940000,0.919434,3,0.820000,0.783203,6,0.650000,0.583984,12,0,-1,-1,6


In [None]:
# List the files in the current working directory.
!ls

 augmented_test.csv
 augmented_test_try1.csv
 augmented_test_try1_model_imputation.csv
 augmented_train.csv
 augmented_train_try1.csv
 augmented_train_try1_model_imputation.csv
 clf_cat2.sav
 clf_cat3.sav
 clf_name.sav
'Data Dictionary.xlsx'
 Data_Dictionary.xlsx
 file_name.csv
 hist_authorized_flag_enc.npy
 hist_category_1_enc.npy
 hist_category_3_enc.npy
 historical_transactions.csv
 historical_transactions_df.pkl
 knn_sal_12.sav
 knn_sal_3.sav
 knn_sal_6.sav
 LR_cat_2.sav
 mer_authorized_flag_enc.npy
 mer_category_1_enc.npy
 mer_category_3_enc.npy
 merchant_category_1_enc.npy
 merchant_category_4_enc.npy
 merchant_most_recent_purchases_range_enc.npy
 merchant_most_recent_sales_range_enc.npy
 merchants.csv
 merged_transactions_with_merchants.csv
 new_merchant_authorized_flag_enc.npy
 new_merchant_category_1_enc.npy
 new_merchant_df.pkl
 new_merchant_transactions.csv
 sample_submission.csv
 test.csv
 train.csv
 transactions_raw_merged.csv
 transactions_refined_1_merged_WON.csv


## Load the dataset 

In [None]:
# Preview train_df (its columns include `target`).
train_df

Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,target,outliers,quarter,elapsed_time,quarter_first_active_month,first_active_month_diff_from_today,feature_sum,feature_mean,feature_max,feature_min,feature_std,authorized_flag_sum_x,authorized_flag_mean_x,authorized_flag_mode_x,category_1_sum_x,category_1_mean_x,category_1_mode_x,category_2_sum_x,category_2_mean_x,category_2_mode_x,category_3_sum_x,category_3_mean_x,category_3_mode_x,category_1_merchants_t_sum_x,category_1_merchants_t_mean_x,category_1_merchants_t_mode_x,most_recent_sales_range_merchants_t_sum_x,most_recent_sales_range_merchants_t_mean_x,most_recent_sales_range_merchants_t_mode_x,category_4_merchants_t_sum_x,category_4_merchants_t_mean_x,category_4_merchants_t_mode_x,most_recent_purchases_range_merchants_t_sum_x,most_recent_purchases_range_merchants_t_mean_x,most_recent_purchases_range_merchants_t_mode_x,...,numerical_1_merchants_t_var_y,numerical_1_merchants_t_skew_y,active_months_lag6_merchants_t_sum_y,active_months_lag6_merchants_t_mean_y,active_months_lag6_merchants_t_min_y,active_months_lag6_merchants_t_max_y,active_months_lag6_merchants_t_var_y,active_months_lag6_merchants_t_skew_y,active_months_lag12_merchants_t_sum_y,active_months_lag12_merchants_t_mean_y,active_months_lag12_merchants_t_min_y,active_months_lag12_merchants_t_max_y,active_months_lag12_merchants_t_var_y,active_months_lag12_merchants_t_skew_y,weekend_nunique_y,weekend_mean_y,weekend_min_y,weekend_max_y,weekday_nunique_y,weekday_mean_y,weekday_min_y,weekday_max_y,hour_nunique_y,hour_mean_y,hour_min_y,hour_max_y,weekofyear_nunique_y,weekofyear_mean_y,weekofyear_min_y,weekofyear_max_y,day_nunique_y,day_mean_y,day_min_y,day_max_y,purchase_date_ptp_y,purchase_date_min_y,purchase_date_max_y,month_sum_y,month_mean_y,month_nunique_y
0,C_ID_92a2005557,2017-06-01,1.0,1.0,1.0,-0.820283,1.0,2.0,1310.0,2.0,1310.0,3.0,1.0,1.0,1.0,0.0,257.0,0.948242,1,0.0,0.000000,0,283.0,1.043945,1,4.0,0.014763,0,235.0,0.867188,1,451.0,1.664062,0,12.0,0.044281,0,390.0,1.439453,0,...,3156.000000,inf,137.0,5.957031,5.0,6.0,0.043488,-4.796875,267.0,11.609375,5.0,12.0,2.248047,-4.304688,2.0,0.260986,0.0,1.0,7.0,3.130859,0.0,6.0,8.0,12.867188,8.0,16.0,7.0,13.304688,10.0,17.0,17.0,16.437500,5.0,31.0,54 days 21:18:29,2018-03-05 14:04:36,2018-04-29 11:23:05,80.0,3.478516,2.0
1,C_ID_3d0044924f,2017-01-01,1.0,1.0,1.0,0.392913,1.0,1.0,1461.0,1.0,1461.0,3.0,1.0,1.0,1.0,0.0,354.0,0.969727,1,35.0,0.095886,0,540.0,1.479492,1,448.0,1.227539,1,303.0,0.830078,1,392.0,1.074219,0,29.0,0.079468,0,389.0,1.065430,0,...,1.946289,2.447266,36.0,6.000000,6.0,6.0,0.000000,0.000000,72.0,12.000000,12.0,12.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,4.0,1.500000,0.0,4.0,5.0,11.164062,6.0,17.0,4.0,9.000000,5.0,13.0,4.0,13.500000,1.0,30.0,56 days 13:40:32,2018-02-01 17:07:54,2018-03-30 06:48:26,15.0,2.500000,2.0
2,C_ID_d639edf6cd,2016-08-01,1.0,1.0,1.0,0.688056,1.0,3.0,1614.0,3.0,1614.0,3.0,1.0,1.0,1.0,0.0,42.0,0.954590,1,0.0,0.000000,0,200.0,4.546875,5,0.0,0.000000,0,42.0,0.954590,1,16.0,0.363525,0,3.0,0.068176,0,16.0,0.363525,0,...,,,6.0,6.000000,6.0,6.0,,,10.0,10.000000,10.0,10.0,,,1.0,1.000000,1.0,1.0,1.0,5.000000,5.0,5.0,1.0,17.000000,17.0,17.0,1.0,17.000000,17.0,17.0,1.0,28.000000,28.0,28.0,0 days 00:00:00,2018-04-28 17:43:11,2018-04-28 17:43:11,4.0,4.000000,1.0
3,C_ID_186d6a6901,2017-09-01,1.0,1.0,1.0,0.142495,1.0,3.0,1218.0,3.0,1218.0,3.0,1.0,1.0,1.0,0.0,89.0,1.000000,1,13.0,0.146118,0,322.0,3.617188,4,101.0,1.134766,1,59.0,0.663086,1,143.0,1.606445,0,2.0,0.022476,0,142.0,1.595703,0,...,0.112488,1.932617,42.0,6.000000,6.0,6.0,0.000000,0.000000,84.0,12.000000,12.0,12.0,0.000000,0.000000,2.0,0.428467,0.0,1.0,4.0,3.285156,1.0,6.0,5.0,13.000000,7.0,21.0,5.0,13.859375,10.0,16.0,7.0,13.140625,4.0,24.0,41 days 23:05:05,2018-03-07 11:55:06,2018-04-18 11:00:11,26.0,3.714844,2.0
4,C_ID_cdbd2c0db2,2017-11-01,1.0,1.0,1.0,-0.159749,1.0,4.0,1157.0,4.0,1157.0,3.0,1.0,1.0,1.0,0.0,138.0,0.964844,1,16.0,0.111877,0,554.0,3.875000,4,150.0,1.048828,1,107.0,0.748047,1,275.0,1.922852,3,7.0,0.048950,0,278.0,1.944336,3,...,21.562500,4.210938,215.0,5.972656,5.0,6.0,0.027771,-6.000000,425.0,11.804688,5.0,12.0,1.361328,-6.000000,2.0,0.333252,0.0,1.0,7.0,3.277344,0.0,6.0,14.0,14.718750,5.0,23.0,8.0,13.359375,9.0,17.0,22.0,14.585938,1.0,31.0,57 days 06:54:42,2018-03-02 11:55:43,2018-04-28 18:50:25,128.0,3.554688,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201912,C_ID_963962de2c,2017-09-01,1.0,1.0,1.0,-2.740821,1.0,3.0,1218.0,3.0,1218.0,3.0,1.0,1.0,1.0,0.0,53.0,0.898438,1,0.0,0.000000,0,123.0,2.083984,3,2.0,0.033905,0,32.0,0.542480,1,122.0,2.068359,2,17.0,0.288086,0,117.0,1.983398,2,...,,,6.0,6.000000,6.0,6.0,,,12.0,12.000000,12.0,12.0,,,1.0,1.000000,1.0,1.0,1.0,6.000000,6.0,6.0,1.0,5.000000,5.0,5.0,1.0,10.000000,10.0,10.0,1.0,11.000000,11.0,11.0,0 days 00:00:00,2018-03-11 05:42:27,2018-03-11 05:42:27,3.0,3.000000,1.0
201913,C_ID_1314773c0b,2015-10-01,1.0,1.0,1.0,0.312917,1.0,4.0,1919.0,4.0,1919.0,3.0,1.0,1.0,1.0,0.0,40.0,0.851074,1,0.0,0.000000,0,47.0,1.000000,1,1.0,0.021271,0,46.0,0.978516,1,99.0,2.105469,1,46.0,0.978516,1,91.0,1.936523,1,...,,,6.0,6.000000,6.0,6.0,,,12.0,12.000000,12.0,12.0,,,1.0,0.000000,0.0,0.0,1.0,1.000000,1.0,1.0,1.0,13.000000,13.0,13.0,1.0,12.000000,12.0,12.0,1.0,20.000000,20.0,20.0,0 days 00:00:00,2018-03-20 13:17:07,2018-03-20 13:17:07,3.0,3.000000,1.0
201914,C_ID_7666735b3d,2017-08-01,1.0,1.0,1.0,0.093494,1.0,3.0,1249.0,3.0,1249.0,3.0,1.0,1.0,1.0,0.0,83.0,0.932617,1,8.0,0.089905,0,301.0,3.382812,3,101.0,1.134766,1,65.0,0.730469,1,145.0,1.628906,1,23.0,0.258545,0,150.0,1.685547,2,...,0.000524,1.728516,18.0,6.000000,6.0,6.0,0.000000,0.000000,36.0,12.000000,12.0,12.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,3.0,1.666992,0.0,3.0,3.0,14.000000,10.0,21.0,2.0,12.664062,12.0,13.0,3.0,25.328125,22.0,28.0,5 days 14:20:58,2018-03-22 21:23:21,2018-03-28 11:44:19,9.0,3.000000,1.0
201915,C_ID_73f5a0efd0,2016-07-01,1.0,1.0,1.0,-4.676589,1.0,3.0,1645.0,3.0,1645.0,3.0,1.0,1.0,1.0,0.0,25.0,0.893066,1,0.0,0.000000,0,28.0,1.000000,1,0.0,0.000000,0,24.0,0.856934,1,27.0,0.964355,0,1.0,0.035706,0,26.0,0.928711,0,...,324.000000,2.232422,30.0,6.000000,6.0,6.0,0.000000,0.000000,60.0,12.000000,12.0,12.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,3.0,2.000000,1.0,4.0,4.0,14.203125,11.0,20.0,5.0,48.187500,45.0,52.0,5.0,12.398438,1.0,27.0,49 days 18:59:03,2017-11-07 16:20:46,2017-12-27 11:19:49,58.0,11.601562,2.0


In [None]:
# Preview test_df.
test_df

Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,outliers,quarter,elapsed_time,quarter_first_active_month,first_active_month_diff_from_today,feature_sum,feature_mean,feature_max,feature_min,feature_std,authorized_flag_sum_x,authorized_flag_mean_x,authorized_flag_mode_x,category_1_sum_x,category_1_mean_x,category_1_mode_x,category_2_sum_x,category_2_mean_x,category_2_mode_x,category_3_sum_x,category_3_mean_x,category_3_mode_x,category_1_merchants_t_sum_x,category_1_merchants_t_mean_x,category_1_merchants_t_mode_x,most_recent_sales_range_merchants_t_sum_x,most_recent_sales_range_merchants_t_mean_x,most_recent_sales_range_merchants_t_mode_x,category_4_merchants_t_sum_x,category_4_merchants_t_mean_x,category_4_merchants_t_mode_x,most_recent_purchases_range_merchants_t_sum_x,most_recent_purchases_range_merchants_t_mean_x,most_recent_purchases_range_merchants_t_mode_x,category_2_merchants_t_sum_x,...,numerical_1_merchants_t_var_y,numerical_1_merchants_t_skew_y,active_months_lag6_merchants_t_sum_y,active_months_lag6_merchants_t_mean_y,active_months_lag6_merchants_t_min_y,active_months_lag6_merchants_t_max_y,active_months_lag6_merchants_t_var_y,active_months_lag6_merchants_t_skew_y,active_months_lag12_merchants_t_sum_y,active_months_lag12_merchants_t_mean_y,active_months_lag12_merchants_t_min_y,active_months_lag12_merchants_t_max_y,active_months_lag12_merchants_t_var_y,active_months_lag12_merchants_t_skew_y,weekend_nunique_y,weekend_mean_y,weekend_min_y,weekend_max_y,weekday_nunique_y,weekday_mean_y,weekday_min_y,weekday_max_y,hour_nunique_y,hour_mean_y,hour_min_y,hour_max_y,weekofyear_nunique_y,weekofyear_mean_y,weekofyear_min_y,weekofyear_max_y,day_nunique_y,day_mean_y,day_min_y,day_max_y,purchase_date_ptp_y,purchase_date_min_y,purchase_date_max_y,month_sum_y,month_mean_y,month_nunique_y
201917,C_ID_0ab67a22ab,2017-04-01,1.0,1.0,1.0,,2.0,1371.0,2.0,1371.0,3.0,1.0,1.0,1.0,0.0,47.0,0.662109,1,23.0,0.323975,0,186.0,2.619141,1,93.0,1.309570,1,42.0,0.591309,1,116.0,1.633789,0,36.0,0.506836,1,118.0,1.662109,0,216.0,...,0.000131,1.730469,18.0,6.0,6.0,6.0,0.0,0.0,36.0,12.000000,12.0,12.0,0.000000,0.000000,2.0,0.333252,0.0,1.0,3.0,3.666016,2.0,5.0,3.0,13.664062,9.0,19.0,3.0,7.000000,5.0,9.0,3.0,15.664062,3.0,28.0,25 days 09:24:23,2018-02-03 09:44:29,2018-02-28 19:08:52,6.0,2.000000,1.0
201918,C_ID_130fd0cbdd,2017-01-01,1.0,1.0,1.0,,1.0,1461.0,1.0,1461.0,3.0,1.0,1.0,1.0,0.0,77.0,0.987305,1,2.0,0.025635,0,315.0,4.039062,4,80.0,1.025391,1,76.0,0.974121,1,130.0,1.666992,2,75.0,0.961426,1,127.0,1.627930,2,316.0,...,1.009766,3.162109,60.0,6.0,6.0,6.0,0.0,0.0,111.0,11.101562,7.0,12.0,3.656250,-1.845703,2.0,0.300049,0.0,1.0,6.0,2.699219,0.0,6.0,8.0,15.703125,9.0,23.0,6.0,12.296875,9.0,16.0,7.0,11.398438,3.0,20.0,48 days 05:41:29,2018-03-03 12:18:48,2018-04-20 18:00:17,34.0,3.400391,2.0
201919,C_ID_b709037bc5,2017-08-01,1.0,1.0,1.0,,3.0,1249.0,3.0,1249.0,3.0,1.0,1.0,1.0,0.0,9.0,0.692383,1,1.0,0.076904,0,62.0,4.769531,5,28.0,2.154297,2,12.0,0.922852,1,25.0,1.922852,2,2.0,0.153809,0,35.0,2.691406,3,66.0,...,0.053528,,12.0,6.0,6.0,6.0,0.0,,24.0,12.000000,12.0,12.0,0.000000,,1.0,0.000000,0.0,0.0,2.0,2.000000,1.0,3.0,2.0,13.500000,13.0,14.0,2.0,10.000000,9.0,11.0,2.0,7.000000,1.0,13.0,11 days 22:19:13,2018-03-01 14:51:33,2018-03-13 13:10:46,6.0,3.000000,1.0
201920,C_ID_d27d835a9f,2017-12-01,1.0,1.0,1.0,,4.0,1127.0,4.0,1127.0,3.0,1.0,1.0,1.0,0.0,26.0,1.000000,1,0.0,0.000000,0,26.0,1.000000,1,37.0,1.422852,1,26.0,1.000000,1,35.0,1.345703,2,0.0,0.000000,0,36.0,1.384766,0,26.0,...,0.458496,2.052734,60.0,6.0,6.0,6.0,0.0,0.0,120.0,12.000000,12.0,12.0,0.000000,0.000000,2.0,0.300049,0.0,1.0,5.0,3.199219,1.0,6.0,7.0,18.203125,11.0,21.0,7.0,12.101562,9.0,16.0,8.0,13.601562,4.0,31.0,44 days 10:15:54,2018-03-04 11:06:29,2018-04-17 21:22:23,33.0,3.300781,2.0
201921,C_ID_2b5e3df5c2,2015-12-01,1.0,1.0,1.0,,4.0,1858.0,4.0,1858.0,3.0,1.0,1.0,1.0,0.0,90.0,0.796387,1,0.0,0.000000,0,426.0,3.769531,4,120.0,1.061523,1,107.0,0.946777,1,252.0,2.230469,3,7.0,0.061951,0,257.0,2.273438,3,461.0,...,1.550781,2.179688,36.0,6.0,6.0,6.0,0.0,0.0,67.0,11.164062,8.0,12.0,2.566406,-2.148438,2.0,0.333252,0.0,1.0,4.0,4.000000,2.0,6.0,5.0,8.000000,0.0,16.0,3.0,10.500000,9.0,15.0,5.0,7.332031,4.0,12.0,39 days 09:11:34,2018-03-04 03:01:37,2018-04-12 12:13:11,19.0,3.166016,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325535,C_ID_7a239d2eda,2017-10-01,1.0,1.0,1.0,,4.0,1188.0,4.0,1188.0,3.0,1.0,1.0,1.0,0.0,77.0,0.962402,1,0.0,0.000000,0,450.0,5.625000,6,0.0,0.000000,0,68.0,0.850098,1,145.0,1.812500,1,68.0,0.850098,1,143.0,1.787109,1,480.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,,,
325536,C_ID_75ace375ae,2017-09-01,1.0,1.0,1.0,,3.0,1218.0,3.0,1218.0,3.0,1.0,1.0,1.0,0.0,9.0,1.000000,1,0.0,0.000000,0,13.0,1.444336,1,0.0,0.000000,0,4.0,0.444336,0,17.0,1.888672,0,4.0,0.444336,0,16.0,1.777344,2,34.0,...,0.363281,1.911133,24.0,6.0,6.0,6.0,0.0,0.0,48.0,12.000000,12.0,12.0,0.000000,0.000000,2.0,0.750000,0.0,1.0,3.0,4.500000,2.0,6.0,2.0,15.750000,9.0,18.0,2.0,10.750000,9.0,16.0,3.0,7.000000,3.0,18.0,45 days 15:54:17,2018-03-03 18:04:31,2018-04-18 09:58:48,13.0,3.250000,2.0
325537,C_ID_21d56d950c,2016-09-01,1.0,1.0,1.0,,3.0,1583.0,3.0,1583.0,3.0,1.0,1.0,1.0,0.0,36.0,0.973145,1,3.0,0.081055,0,184.0,4.972656,5,50.0,1.351562,1,32.0,0.864746,1,61.0,1.648438,0,3.0,0.081055,0,62.0,1.675781,0,190.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NaT,NaT,NaT,,,
325538,C_ID_6c46fc5a9d,2017-06-01,1.0,1.0,1.0,,2.0,1310.0,2.0,1310.0,3.0,1.0,1.0,1.0,0.0,46.0,0.707520,1,0.0,0.000000,0,181.0,2.785156,3,99.0,1.523438,2,59.0,0.907715,1,117.0,1.799805,1,54.0,0.830566,1,107.0,1.646484,1,213.0,...,28.921875,1.958008,36.0,6.0,6.0,6.0,0.0,0.0,72.0,12.000000,12.0,12.0,0.000000,0.000000,2.0,0.333252,0.0,1.0,5.0,3.166016,0.0,6.0,4.0,8.000000,0.0,18.0,3.0,45.156250,44.0,47.0,5.0,10.335938,1.0,26.0,25 days 00:00:00,2017-11-01 00:00:00,2017-11-26 00:00:00,66.0,11.000000,1.0


## Start Exploring 

In [None]:
# Number of missing values in each column of train_df
train_df.isna().sum()

card_id                    0
first_active_month         0
feature_1                  0
feature_2                  0
feature_3                  0
                       ...  
purchase_date_min_y    21931
purchase_date_max_y    21931
month_sum_y            21931
month_mean_y           21931
month_nunique_y        21931
Length: 330, dtype: int64

### Checking the fraction of null values per column

In [None]:
# Names of every train_df column that contains at least one missing value,
# in the same order as the columns appear in train_df.
columns_with_na = train_df.columns[train_df.isnull().any()].tolist()

In [None]:
# How many columns contain at least one missing value
len(columns_with_na)

168

In [None]:
# Share of missing observations per column, worst offenders first.
# NOTE: `percentage_na` holds fractions in [0, 1] (the mean of the null mask),
# not percents; the column name is kept because later cells read it.
data_na = (
    train_df[columns_with_na]
    .isnull()
    .mean()                                  # fraction of missing rows per column
    .rename_axis('col')                      # label the column-name index
    .reset_index(name='percentage_na')       # Series -> two-column DataFrame
    .sort_values(by='percentage_na', ascending=False)
)

# Show the result
data_na

Unnamed: 0,col,percentage_na
123,avg_purchases_lag12_merchants_t_skew_y,0.385956
111,avg_purchases_lag6_merchants_t_skew_y,0.385223
129,numerical_1_merchants_t_skew_y,0.349005
81,numerical_2_merchants_t_skew_y,0.348802
93,avg_purchases_lag3_merchants_t_skew_y,0.345627
...,...,...
10,active_months_lag12_merchants_t_skew_x,0.000168
9,active_months_lag6_merchants_t_skew_x,0.000168
6,avg_sales_lag12_merchants_t_skew_x,0.000168
4,avg_sales_lag6_merchants_t_skew_x,0.000168


In [None]:
# Names of the columns whose missing fraction is above 0.1 (more than 10% of rows missing)
na = data_na.loc[data_na['percentage_na'] > 0.1, 'col']

In [None]:
# Number of column names selected into `na`
print(len(na))

159
