<a href="https://colab.research.google.com/github/human-ai2025/Elo-Merchant-Recommendation/blob/master/Preprocessing%20and%20Feature%20Engineering/Preprocessing_ModelBased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries

In [51]:
#Import Libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import gc
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import datetime
import time
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import mode
import pickle
from sklearn.linear_model import LogisticRegression

## Mounting Drive

In [2]:
#Mounting drive 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Set up the path

In [None]:
%cd /content/drive/MyDrive/data

/content/drive/MyDrive/data


### View Current Items in the folder

In [None]:
!ls

## Helper Functions

### Timer

In [3]:
#refer:-https://www.youtube.com/watch?v=vOMtQ4ocMGI
@contextmanager
def timer(title):
    """ used to calculate time for each function"""
    t0 = time.time()
    yield
    print("{} - done in {:.000f}s".format(title, time.time() - t0))

### Memory Reduction

In [70]:

#https://www.kaggle.com/fabiendaniel/elo-world
#Function to load data into pandas and reduce memory usage

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    #loop for alll the columns in the dataframe 
    for col in df.columns:
        #get the datatype of the column
        col_type = df[col].dtypes
        #if the data type is numeric then only start changing the datatype
        #as it isnt much helpful for other data types 
        if col_type in numerics:
            #stores the min value of the column 
            c_min = df[col].min()
            #stores the maximum value of the column
            c_max = df[col].max()
            #for int type numerics
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            #for float type numerics 
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Imputation 

### Problem Summary 
1. There are two main datasets that contain the card id and the target variable loyality score to predict - Train and Test
2. There are two datasets that contain informations about all the transactions buying from different merchants - Historical Transations and New merchants transactions. 
3. There is 1 dataset that contains informations about all the Merchants - Merchant 

If we need to prepare a dataset for machine learning we need to fix missing values, and we can fix missing values by applying machine learning to that dataset! If we consider a column with missing data as our target variable, and existing columns with complete data as our predictor variables, then we may construct a machine learning model using complete records as our train and test datasets and the records with incomplete data as our generalization target

### Train and Test
1. From Eda we saw that there is no null value in train or test so no imputation required 


### New Merchants Transactions Data
1. In New Merchant Transactions we saw that there are columns with missing values such as category 3, category 2 and merchant id



In [71]:
def preprocessing_new_mer_transactions(nrows = None):

    """
    FUNCTION:
          PERFORMS THE IMPUTATIONS FOR THE TRANSACTIONS DATA 
          IT HANDLES MISSING VALUES 
          IT HANDLES MISSING VALUES BY MODEL IMPUTATIONS 
          IT DOES MAPPING OF CATEGORICAL DATA TO NUMERICAL 

    ARGS:
          NUM_ROWS: TO LOAD THE NUMBER OF ROWS FOR THE DATAFRAME(DEBUGGING)
          DEFAULT IS NONE

    RETURNS:
          THE MODIFIED DATA FRAME WITH IMPUTATIONS 

    REFER:-https://medium.com/towards-artificial-intelligence/handling-missing-data-for-advanced-machine-learning-b6eb89050357
    """

    #load the dataset 
    df = pd.read_csv('/content/drive/MyDrive/data/new_merchant_transactions.csv', nrows=nrows)


    print(df.isna().sum())

    #category 2 and 3 have missing values and we will handle them via model based impuation 

    #creating a dataframe 
    temp = pd.DataFrame()

    #get the card id
    temp['card_id'] = df['card_id']

    #get the merchant id
    temp['merchant_id'] = df['merchant_id']

    #get the purchase date
    temp['purchase_date'] = df['purchase_date']

    #drop the columns from the new merchant data frame 
    df.drop(['card_id', 'merchant_id', 'purchase_date'], axis=1, inplace=True)

    gc.collect()

    #store all the columns as features 
    feat = df.columns

    #print the dataframe 
    print(df.sample(10))

    # having missing values 
    cols = ['category_2', 'category_3']

    #label encode the variables
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0})
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0})
    df['category_3'] = df['category_3'].map({'A': 0, 'B': 1,'C': 2})

    print(df.sample(10))

    #list to hold the null values
    list_no_nan = []

    #select only columns which doesn't have any null values
    for c in feat: # from all the features in the dataframe 
      if c not in cols: #cols have missing value 
        list_no_nan.append(c)

    #create a test set by selecting only rows which are having null values
    #we will predict the values of categority 2
    test = df[df['category_2'].isna()]

    #create train set by selecting rows which doesn't have any null values
    train = df.dropna()
    
    df_name = 'new_mer '
    #fit the classifier to the train data
    clf_name = df_name + 'LR1'
    print("[INFO] NAME OF CLF ",clf_name )
    clf_name = LogisticRegression()

    # fit on data 
    # list no nan doesnot have category 2 and 3, we need to predict the values of 
    # we need to predict the values of category 2 here 
    # so we train on all the data having no null values and we have to predict the category 2 here 
    # in train category 2 we have no null value 
    clf_name.fit(train[list_no_nan], train['category_2'])

    #save the model in pickel 
    pickle.dump(clf_name, open('clf_name.sav', 'wb'))

    print("[INFO] Imputing Categorical 2 values ....")
    #make prediction only for the rows with null value
    # so now to ake a predictinon we had made a test set containing the rows where 
    # category 2 has na values 
    # so we will try to predict the na values from the trained model to make predictions 

    df.loc[df['category_2'].isna(), 'category_2'] = clf_name.predict(test[list_no_nan])

    # print the dataframe 
    print(df.sample(10))


    # Now the same way for category 3 variable as we did for category 2
    test = df[df['category_3'].isna()]
    train = df.dropna()

    clf_name = df_name + 'LR2'
    print("[INFO] NAME OF CLF ",clf_name )
    clf_name = LogisticRegression()
    clf_name.fit(train[list_no_nan], train['category_3'])
    #save the model in pickel
    pickle.dump(clf_name, open('clf_name.sav', 'wb'))

    print("[INFO] Imputing Categorical 3 values ....")
    df.loc[df['category_3'].isna(), 'category_3'] = clf_name.predict(test[list_no_nan])

    print(df.sample(10))

    df['card_id'] = temp['card_id']
    df['merchant_id'] = temp['merchant_id']
    df['purchase_date'] = temp['purchase_date']

    print(df.isna().sum())

    #FILL MISSING MERCHANT IDS 
    df['merchant_id'].fillna('M_ID_0000000000', inplace=True)

    print(df.isna().sum())

    # reduce memory usage
    df = reduce_mem_usage(df)


    return df

### Historical Transactions data 
1. In Historical Transactions data we saw that there are columns with missing values such as category 3, category 2 and merchant id

In [72]:
def preprocessing_hist_transactions(nrows = None):

    """
    FUNCTION:
          PERFORMS THE IMPUTATIONS FOR THE TRANSACTIONS DATA 
          IT HANDLES MISSING VALUES 
          IT HANDLES MISSING VALUES BY MODEL IMPUTATIONS 
          IT DOES MAPPING OF CATEGORICAL DATA TO NUMERICAL 

    ARGS:
           
          NUM_ROWS: TO LOAD THE NUMBER OF ROWS FOR THE DATAFRAME(DEBUGGING)
          DEFAULT IS NONE

    RETURNS:
          THE MODIFIED DATA FRAME WITH IMPUTATIONS 

    REFER:-https://medium.com/towards-artificial-intelligence/handling-missing-data-for-advanced-machine-learning-b6eb89050357
    """

    #load the dataset 
    df = pd.read_csv('/content/drive/MyDrive/data/historical_transactions.csv', nrows=None)

    print(df.isna().sum())

    #category 2 and 3 have missing values and we will handle them via model based impuation 

    #creating a dataframe 
    temp = pd.DataFrame()

    #get the card id
    temp['card_id'] = df['card_id']

    #get the merchant id
    temp['merchant_id'] = df['merchant_id']

    #get the purchase date
    temp['purchase_date'] = df['purchase_date']

    #drop the columns from the new merchant data frame 
    df.drop(['card_id', 'merchant_id', 'purchase_date'], axis=1, inplace=True)

    gc.collect()

    #store all the columns as features 
    feat = df.columns

    #print the dataframe 
    print(df.sample(10))

    # having missing values 
    cols = ['category_2', 'category_3']

    #label encode the variables(as int caused a lot of trouble)
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0})
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0})
    df['category_3'] = df['category_3'].map({'A': 0, 'B': 1,'C': 2})

    print(df.sample(10))

    #list to hold the null values
    list_no_nan = []

    #select only columns which doesn't have any null values
    for c in feat: # from all the features in the dataframe 
      if c not in cols: #cols have missing value 
        list_no_nan.append(c)

    #create a test set by selecting only rows which are having null values
    #we will predict the values of categority 2
    test = df[df['category_2'].isna()]

    #create train set by selecting rows which doesn't have any null values
    train = df.dropna()
    
    df_name = 'hist '
    #fit the classifier to the train data
    clf_name = df_name + 'LR1'
    print("[INFO] NAME OF CLF ",clf_name )
    clf_name = LogisticRegression()

    # fit on data 
    # list no nan doesnot have category 2 and 3, we need to predict the values of 
    # we need to predict the values of category 2 here 
    # so we train on all the data having no null values and we have to predict the category 2 here 
    # in train category 2 we have no null value 
    clf_name.fit(train[list_no_nan], train['category_2'])

    #save the model in pickel 
    pickle.dump(clf_name, open('clf_name.sav', 'wb'))

    print("[INFO] Imputing Categorical 2 values ....")
    #make prediction only for the rows with null value
    # so now to ake a predictinon we had made a test set containing the rows where 
    # category 2 has na values 
    # so we will try to predict the na values from the trained model to make predictions 

    df.loc[df['category_2'].isna(), 'category_2'] = clf_name.predict(test[list_no_nan])

    # print the dataframe 
    print(df.sample(10))


    # Now the same way for category 3 variable as we did for category 2
    test = df[df['category_3'].isna()]
    train = df.dropna()

    clf_name = df_name + 'LR2'
    print("[INFO] NAME OF CLF ",clf_name )
    clf_name = LogisticRegression()
    clf_name.fit(train[list_no_nan], train['category_3'])
    #save the model in pickel
    pickle.dump(clf_name, open('clf_name.sav', 'wb'))

    print("[INFO] Imputing Categorical 3 values ....")
    #loads nly the category 3 nan values 
    df.loc[df['category_3'].isna(), 'category_3'] = clf_name.predict(test[list_no_nan])

    print(df.sample(10))

    df['card_id'] = temp['card_id']
    df['merchant_id'] = temp['merchant_id']
    df['purchase_date'] = temp['purchase_date']

    print(df.isna().sum())

    #FILL MISSING MERCHANT IDS 
    df['merchant_id'].fillna('M_ID_0000000000', inplace=True)

    print(df.isna().sum())

    # reduce memory usage
    df = reduce_mem_usage(df)


    return df

### Merchants Data
1. In Merchants data we saw that average_sales_lag3, average_sales_lag6, average_sales_lag12 and category 2 have missing values 

In [73]:

def merchant_preprocessing(num_rows=None):
    """
    FUNCTION:
          PERFORMS THE IMPUTATIONS FOR THE TRANSACTIONS DATA 
          IT HANDLES MISSING VALUES 
          IT DOES MAPPING OF CATEGORICAL DATA TO NUMERICAL
          IT HANDLES THE INF VALUES  
          PHASE 1 OF NUMERICAL (SALES)
          PHASE 2 OF CATEGORICAL (CATRGORY 2 )

    ARGS:
          NUM_ROWS: TO LOAD THE NUMBER OF ROWS FOR THE DATAFRAME(DEBUGGING)
          DEFAULT IS NONE
    
    RETURNS:
          THE MODIFIED DATA FRAME WITH IMPUTATIONS 

    REFER:-https://medium.com/towards-artificial-intelligence/handling-missing-data-for-advanced-machine-learning-b6eb89050357
    """

    import pickle
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsRegressor

    df = pd.read_csv('/content/drive/MyDrive/data/merchants.csv', nrows=num_rows)


    features_inf = ["avg_purchases_lag3","avg_purchases_lag6","avg_purchases_lag12"]
    for col in features_inf:
        df.loc[df[col]==np.inf,col] = max(df.loc[df[col]!=np.inf,col])


    #creating a temporary dataframe 
    temp = pd.DataFrame()
    temp['merchant_id'] = df['merchant_id']

    # we use KNN for numerical and logisticregrssion for categorical 
    temp['category_2'] = df['category_2']

    #category 2 is needed for 2nd round for logistic regression
    # merchant id is not useful
    df.drop(['merchant_id', 'category_2'], axis=1, inplace=True)

    #categorical to numerical (do not do as int or face the wrath of nan error )
    df['category_1'] = df['category_1'].map({'Y': 0, 'N': 1})
    df['most_recent_sales_range'] = df['most_recent_sales_range'].map({'A': 0, 'B': 1,'C': 2, 'D': 3,'E' : 4})
    df['most_recent_purchases_range'] = df['most_recent_purchases_range'].map({'A': 0, 'B': 1,'C': 2, 'D': 3,'E' : 4})
    df['category_4'] = df['category_4'].map({'Y': 0, 'N': 1, 2 : 2})

    features = df.columns
    #sales have missing values 
    cols = ['avg_sales_lag3','avg_sales_lag6','avg_sales_lag12']
    list_no_nan = []

    #get the features not to predict and having no nan 
    for c in features:
      if c not in cols:
        list_no_nan.append(c)

    #have all nan as we need to predict this (avg_sales_lag3)
    test = df[df['avg_sales_lag3'].isna()]
    #have no nan to train 
    train = df.dropna()

    #initilize for knn 
    knn_sal_3 = KNeighborsRegressor(n_neighbors=5)

    #fit on data 
    #train on the feeatures and to predict on sales 
    knn_sal_3.fit(train[list_no_nan], train['avg_sales_lag3'])
    #fill the missing values 
    df.loc[df['avg_sales_lag3'].isna(), 'avg_sales_lag3'] = knn_sal_3.predict(test[list_no_nan])
    #save the model 
    pickle.dump(knn_sal_3, open('knn_sal_3.sav', 'wb'))

    #have all nan as we need to predict this (avg_sales_lag6)
    test = df[df['avg_sales_lag6'].isna()]
    train = df.dropna()

    #initilize the model
    knn_sal_6 = KNeighborsRegressor(n_neighbors=5)
    #fit the model 
    knn_sal_6.fit(train[list_no_nan], train['avg_sales_lag6'])
    #fill the missing values 
    df.loc[df['avg_sales_lag6'].isna(), 'avg_sales_lag6'] = knn_sal_6.predict(test[list_no_nan])
    #save the model 
    pickle.dump(knn_sal_6, open('knn_sal_6.sav', 'wb'))

    #have all nan as we need to predict this (avg_sales_lag12)
    test = df[df['avg_sales_lag12'].isna()]
    train = df.dropna()

    #initilize the model
    knn_sal_12 = KNeighborsRegressor(n_neighbors=5)
    #fit on data 
    knn_sal_12.fit(train[list_no_nan], train['avg_sales_lag12'])
    #fill the missing values 
    df.loc[df['avg_sales_lag12'].isna(), 'avg_sales_lag12'] = knn_sal_12.predict(test[list_no_nan])
    #save the model 
    pickle.dump(knn_sal_12, open('knn_sal_12.sav', 'wb'))






    #for category 2
    df['category_2'] = temp['category_2']

    feat = df.columns
    cols = ['category_2']
    list_no_nan = []

    #get the features not to predict and having no nan 
    for c in feat:
      if c not in cols:
        list_no_nan.append(c)

    #get all the missing values 
    test = df[df['category_2'].isna()]
    # have all the good values 
    train = df.dropna()

    LR_cat_2 = LogisticRegression()
    LR_cat_2.fit(train[list_no_nan], train['category_2'])
    df.loc[df['category_2'].isna(), 'category_2'] = LR_cat_2.predict(test[list_no_nan])
    pickle.dump(LR_cat_2, open('LR_cat_2.sav', 'wb'))

    df['merchant_id'] = temp['merchant_id']

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

## Main Function

In [74]:
def main(debug, csvConvert = True):
  if debug is True:
    num_rows = 1000
  else:
    num_rows = None
  with timer('new Merchants'):
    new_merchants = preprocessing_new_mer_transactions(num_rows)
  with timer('hist trans'):
    hist_transactions = preprocessing_hist_transactions(num_rows)
  with timer('merchants'):
    merchants = merchant_preprocessing(num_rows)

  if csvConvert == True:
    with timer('To CSV '):
      new_merchants.to_csv('newMerchants_trans_preprocessed_model_imputation.csv', index=False)
      hist_transactions.to_csv('hist_trans_preprocessed_model_imputation.csv', index=False)
      merchants.to_csv('merchants_preprocessed_model_imputation.csv', index=False)

  return new_merchants, hist_transactions, merchants
  

In [75]:
if __name__ == "__main__": 
  a, b, c = main(False)

authorized_flag              0
card_id                      0
city_id                      0
category_1                   0
installments                 0
category_3               55922
merchant_category_id         0
merchant_id              26216
month_lag                    0
purchase_amount              0
purchase_date                0
category_2              111745
state_id                     0
subsector_id                 0
dtype: int64
        authorized_flag  city_id category_1  ...  category_2 state_id  subsector_id
1886673               Y      310          N  ...         5.0        5            27
6833                  Y       56          N  ...         5.0       20            37
323010                Y      321          N  ...         1.0       12            27
1034649               Y      161          N  ...         3.0        3            25
825055                Y       69          N  ...         1.0        9            27
1102361               Y       57          N  ... 

In [76]:
!ls

clf_name.sav
drive
hist_trans_preprocessed_model_imputation.csv
knn_sal_12.sav
knn_sal_3.sav
knn_sal_6.sav
LR_cat_2.sav
merchants_preprocessed_model_imputation.csv
new_merchant_authorized_flag_enc.npy
new_merchant_category_1_enc.npy
newMerchants_trans_preprocessed_model_imputation.csv
sample_data


In [77]:
a

Unnamed: 0,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,month_lag,purchase_amount,category_2,state_id,subsector_id,card_id,merchant_id,purchase_date
0,1,107,0,1,1.0,307,1,-0.557617,1.0,9,19,C_ID_415bb3a509,M_ID_b0c793002c,2018-03-11 14:57:36
1,1,140,0,1,1.0,307,1,-0.569336,1.0,9,19,C_ID_415bb3a509,M_ID_88920c89e8,2018-03-19 18:53:37
2,1,330,0,1,1.0,507,2,-0.551270,1.0,9,14,C_ID_415bb3a509,M_ID_ad5237ef6b,2018-04-26 14:08:44
3,1,-1,1,1,1.0,661,1,-0.671875,1.0,-1,8,C_ID_415bb3a509,M_ID_9e84cda3b1,2018-03-07 09:43:21
4,1,-1,1,1,1.0,166,1,-0.659668,1.0,-1,29,C_ID_ef55cf8d4b,M_ID_3c86fa3831,2018-03-22 21:07:53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1963026,1,142,0,0,0.0,309,2,-0.701660,3.0,19,21,C_ID_1320dee851,M_ID_7754b67f3b,2018-04-06 14:36:52
1963027,1,158,0,0,0.0,560,2,-0.694336,1.0,15,34,C_ID_f112aa3381,M_ID_da063195b7,2018-03-07 13:19:18
1963028,1,69,0,1,1.0,278,1,-0.621094,1.0,9,37,C_ID_bd97b86450,M_ID_9a9ccb6544,2018-03-05 12:04:56
1963029,1,130,0,0,0.0,367,1,-0.656738,3.0,7,16,C_ID_c0513fd84f,M_ID_40c28d596f,2018-03-09 14:47:05


In [78]:
a.isna().sum()

authorized_flag         0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
month_lag               0
purchase_amount         0
category_2              0
state_id                0
subsector_id            0
card_id                 0
merchant_id             0
purchase_date           0
dtype: int64

In [79]:
b

Unnamed: 0,authorized_flag,city_id,category_1,installments,category_3,merchant_category_id,month_lag,purchase_amount,category_2,state_id,subsector_id,card_id,merchant_id,purchase_date
0,1,88,0,0,0.0,80,-8,-0.703331,1.0,16,37,C_ID_4e6213e9bc,M_ID_e020e9b302,2017-06-25 15:33:07
1,1,88,0,0,0.0,367,-7,-0.733128,1.0,16,16,C_ID_4e6213e9bc,M_ID_86ec983688,2017-07-15 12:10:45
2,1,88,0,0,0.0,80,-6,-0.720386,1.0,16,37,C_ID_4e6213e9bc,M_ID_979ed661fc,2017-08-09 22:04:29
3,1,88,0,0,0.0,560,-5,-0.735352,1.0,16,34,C_ID_4e6213e9bc,M_ID_e6d5ae8ea6,2017-09-02 10:06:26
4,1,88,0,0,0.0,80,-11,-0.722865,1.0,16,37,C_ID_4e6213e9bc,M_ID_e020e9b302,2017-03-10 01:14:19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29112356,1,-1,1,1,1.0,360,-1,-0.632706,1.0,-1,34,C_ID_2863d2fa95,M_ID_edd92b6720,2017-01-20 08:52:04
29112357,1,-1,1,1,1.0,360,0,-0.632706,1.0,-1,34,C_ID_2863d2fa95,M_ID_edd92b6720,2017-02-20 04:40:50
29112358,1,3,0,0,0.0,278,0,-0.657740,1.0,16,37,C_ID_5c240d6e3c,M_ID_9cdcfe8673,2017-12-26 18:37:51
29112359,1,331,0,0,0.0,514,-1,0.770620,1.0,16,9,C_ID_5c240d6e3c,M_ID_1a75f94f92,2017-11-24 14:18:15


In [80]:
b.isna().sum()

authorized_flag         0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
month_lag               0
purchase_amount         0
category_2              0
state_id                0
subsector_id            0
card_id                 0
merchant_id             0
purchase_date           0
dtype: int64

In [81]:
c

Unnamed: 0,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2,merchant_id
0,8353,792,9,-0.057465,-0.057465,1,4,4,-0.400000,9.664062,3,-2.250000,18.671875,6,-2.320000,13.914062,12,1,242,9,1.0,M_ID_838061e48c
1,3184,840,20,-0.057465,-0.057465,1,4,4,-0.720000,1.750000,3,-0.740000,1.291992,6,-0.570000,1.687500,12,1,22,16,1.0,M_ID_9339d880ad
2,447,690,1,-0.057465,-0.057465,1,4,4,-82.129997,260.000000,2,-82.129997,260.000000,2,-82.129997,260.000000,2,1,-1,5,5.0,M_ID_e726bbae1e
3,5026,792,9,-0.057465,-0.057465,0,4,4,1.244000,1.666992,3,2.660000,4.667969,6,5.232000,3.833984,12,0,-1,-1,1.0,M_ID_a70e9c5f81
4,2228,222,21,-0.057465,-0.057465,0,4,4,4.280000,0.500000,3,3.308000,0.361084,6,3.466000,0.347168,12,0,-1,-1,1.0,M_ID_64456c37ce
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334691,1145,705,33,3.173828,-0.047546,1,0,0,1.000000,1.022461,3,0.990000,1.019531,6,1.000000,1.024414,12,0,69,9,1.0,M_ID_1f4773aa76
334692,35,544,29,-0.057465,-0.057465,0,0,0,0.890000,0.927734,3,0.780000,0.813477,6,0.590000,0.606934,12,0,-1,-1,1.0,M_ID_725a60d404
334693,35,561,7,-0.057465,-0.057465,1,0,0,0.960000,0.982910,3,0.900000,0.924805,6,0.740000,0.750977,8,0,160,21,5.0,M_ID_f2045dd267
334694,35,511,7,-0.057465,-0.057465,0,0,0,0.940000,0.919434,3,0.820000,0.783203,6,0.650000,0.583984,12,0,-1,-1,1.0,M_ID_9139332ccc


In [82]:
c.isna().sum()

merchant_group_id              0
merchant_category_id           0
subsector_id                   0
numerical_1                    0
numerical_2                    0
category_1                     0
most_recent_sales_range        0
most_recent_purchases_range    0
avg_sales_lag3                 0
avg_purchases_lag3             0
active_months_lag3             0
avg_sales_lag6                 0
avg_purchases_lag6             0
active_months_lag6             0
avg_sales_lag12                0
avg_purchases_lag12            0
active_months_lag12            0
category_4                     0
city_id                        0
state_id                       0
category_2                     0
merchant_id                    0
dtype: int64

#References:
1. https://heartbeat.fritz.ai/data-handling-scenarios-part-2-working-with-missing-values-in-a-dataset-34b758cfc9fa
2. https://medium.com/towards-artificial-intelligence/handling-missing-data-for-advanced-machine-learning-b6eb89050357
