<a href="https://colab.research.google.com/github/human-ai2025/Elo-Merchant-Recommendation/blob/master/Preprocessing%20and%20Feature%20Engineering/Preprocessing_MeanMedian.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the libraries

In [1]:
#Import Libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import gc
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import datetime
import time
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import mode

## Mounting Drive

In [None]:
#Mounting drive 
from google.colab import drive
drive.mount('/content/drive')

### Set up the path

In [3]:
%cd /content/drive/MyDrive/data

/content/drive/MyDrive/data


### View Current Items in the folder

In [None]:
!ls

## Helper Functions

### Timer

In [5]:
#refer:-https://www.youtube.com/watch?v=vOMtQ4ocMGI
@contextmanager
def timer(title):
    """ used to calculate time for each function"""
    t0 = time.time()
    yield
    print("{} - done in {:.000f}s".format(title, time.time() - t0))

### Memory Reduction

In [6]:

#https://www.kaggle.com/fabiendaniel/elo-world
#Function to load data into pandas and reduce memory usage

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    #loop for alll the columns in the dataframe 
    for col in df.columns:
        #get the datatype of the column
        col_type = df[col].dtypes
        #if the data type is numeric then only start changing the datatype
        #as it isnt much helpful for other data types 
        if col_type in numerics:
            #stores the min value of the column 
            c_min = df[col].min()
            #stores the maximum value of the column
            c_max = df[col].max()
            #for int type numerics
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            #for float type numerics 
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Imputation 

### Problem Summary 
1. There are two main datasets that contain the card id and the target variable loyality score to predict - Train and Test
2. There are two datasets that contain informations about all the transactions buying from different merchants - Historical Transations and New merchants transactions. 
3. There is 1 dataset that contains informations about all the Merchants - Merchant 

### Train and Test
1. From Eda we saw that there is no null value in train or test so no imputation required 


### New Merchants Transactions Data
1. In New Merchant Transactions we saw that there are columns with missing values such as category 3, category 2 and merchant id



In [93]:
def transactions_preprocessingNEWMERCHANTS(nrows = None):

    """
    FUNCTION:
          PERFORMS THE IMPUTATIONS FOR THE TRANSACTIONS DATA 
          IT HANDLES MISSING VALUES 
          IT DOES MAPPING OF CATEGORICAL DATA TO NUMERICAL 

    ARGS:
           
          NUM_ROWS: TO LOAD THE NUMBER OF ROWS FOR THE DATAFRAME(DEBUGGING)
          DEFAULT IS NONE

    RETURNS:
          THE MODIFIED DATA FRAME WITH IMPUTATIONS 
    """

    def mode(series):
        """Most common element in a series"""
        tmode = series.mode()
        if len(tmode) == 0:
            return np.nan
        else:
            return tmode[0]

    df = pd.read_csv('/content/drive/MyDrive/data/new_merchant_transactions.csv', nrows=nrows)
    

    # fillna
    df.loc[df['category_2'].isna(),'category_2'] = mode(df['category_2'])
    df.loc[df['category_3'].isna(),'category_3'] = mode(df['category_3'])
    #no id of that merchant id is present in the entire dataset so this will be used for missing value 
    df['merchant_id'].fillna('M_ID_0000000000', inplace=True)


    # mapping categorical to numerical 
    # categorical are:- category_1, category_3, authorized_flag
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_3'] = df['category_3'].map({'A': 0, 'B': 1,'C': 2}).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

### Historical Transactions data 
1. In Historical Transactions data we saw that there are columns with missing values such as category 3, category 2 and merchant id

In [88]:
def transactions_preprocessingTRANSACTIONS(nrows = None):

    """
    FUNCTION:
          PERFORMS THE IMPUTATIONS FOR THE TRANSACTIONS DATA 
          IT HANDLES MISSING VALUES 
          IT DOES MAPPING OF CATEGORICAL DATA TO NUMERICAL 

    ARGS:
          
          NUM_ROWS: TO LOAD THE NUMBER OF ROWS FOR THE DATAFRAME(DEBUGGING)
          DEFAULT IS NONE

    RETURNS:
          THE MODIFIED DATA FRAME WITH IMPUTATIONS 
    """

    def mode(series):
        """Most common element in a series"""
        tmode = series.mode()
        if len(tmode) == 0:
            return np.nan
        else:
            return tmode[0]

    df = pd.read_csv('/content/drive/MyDrive/data/historical_transactions.csv', nrows=nrows)

    # fillna
    df.loc[df['category_2'].isna(),'category_2'] = mode(df['category_2'])
    df.loc[df['category_3'].isna(),'category_3'] = mode(df['category_3'])
    #no id of that merchant id is present in the entire dataset so this will be used for missing value
    df['merchant_id'].fillna('M_ID_0000000000', inplace=True)


    # mapping categorical to numerical 
    # categorical are:- category_1, category_3, authorized_flag
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_3'] = df['category_3'].map({'A': 0, 'B': 1,'C': 2}).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

### Merchants Data
1. In Merchants data we saw that average_sales_lag3, average_sales_lag6, average_sales_lag12 and category 2 have missing values 

In [105]:
def merchant_preprocessing(num_rows=None):
    """
    FUNCTION:
          PERFORMS THE IMPUTATIONS FOR THE TRANSACTIONS DATA 
          IT HANDLES MISSING VALUES 
          IT DOES MAPPING OF CATEGORICAL DATA TO NUMERICAL
          IT HANDLES THE INF VALUES  

    ARGS:
          NUM_ROWS: TO LOAD THE NUMBER OF ROWS FOR THE DATAFRAME(DEBUGGING)
          DEFAULT IS NONE
    
    RETURNS:
          THE MODIFIED DATA FRAME WITH IMPUTATIONS 
    """
    def mode(series):
        """Most common element in a series"""
        tmode = series.mode()
        if len(tmode) == 0:
            return np.nan
        else:
            return tmode[0]

    df = pd.read_csv('/content/drive/MyDrive/data/merchants.csv', nrows=num_rows)

    #drop duplicate merchant id
    df.drop_duplicates(subset=['merchant_id'], keep='last')
    #now every cloum represents one merchant 

    #handling categorical 
    df.loc[df['category_2'].isna(),'category_2'] = mode(df['category_2'])

    #imputing the inf values with max 
    # "avg_purchases_lag3","avg_purchases_lag6","avg_purchases_lag12"
    features_inf = ["avg_purchases_lag3","avg_purchases_lag6","avg_purchases_lag12"]
    for col in features_inf:
        df.loc[df[col]==np.inf,col] = max(df.loc[df[col]!=np.inf,col])
    
    # imputing the missing values with model 
    #'avg_sales_lag3','avg_sales_lag6','avg_sales_lag12'
    features_missing = ['avg_sales_lag3','avg_sales_lag6','avg_sales_lag12']
    for col in features_missing:
        df[col].fillna((df[col].mean()), inplace=True)


    #mapping categorical to numerical 
    #categorical is :- category_1,most_recent_sales_range, most_recent_purchases_range, active_months_lag12, category_4
    df['category_1'] = df['category_1'].map({'Y': 0, 'N': 1}).astype(int)
    df['most_recent_sales_range'] = df['most_recent_sales_range'].map({'A': 0, 'B': 1,'C': 2, 'D': 3,'E' : 4}).astype(int)
    df['most_recent_purchases_range'] = df['most_recent_purchases_range'].map({'A': 0, 'B': 1,'C': 2, 'D': 3,'E' : 4}).astype(int)
    df['category_4'] = df['category_4'].map({'Y': 0, 'N': 1, 2 : 2}).astype(int)

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

## Main Function

In [107]:
def main(debug, csvConvert = True):
  if debug is True:
    num_rows = 1000
  else:
    num_rows = None
  with timer('new Merchants'):
    new_merchants = transactions_preprocessingNEWMERCHANTS(num_rows)
  with timer('hist trans'):
    hist_transactions = transactions_preprocessingTRANSACTIONS(num_rows)
  with timer('merchants'):
    merchants = merchant_preprocessing(num_rows)

  if csvConvert == True:
    with timer('To CSV '):
      new_merchants.to_csv('newMerchants_trans_preprocessed.csv', index=False)
      hist_transactions.to_csv('hist_trans_preprocessed.csv', index=False)
      merchants.to_csv('merchants_preprocessed.csv', index=False)

  return new_merchants, hist_transactions, merchants
  

In [108]:
if __name__ == "__main__": 
  a, b, c = main(False)
  

Mem. usage decreased to 74.88 Mb (64.3% reduction)
new Merchants - done in 6s
Mem. usage decreased to 1166.08 Mb (62.5% reduction)
hist trans - done in 81s
Mem. usage decreased to 15.64 Mb (72.2% reduction)
merchants - done in 1s
To CSV  - done in 181s


In [109]:
!ls

 augmented_test.csv
 augmented_test_try1.csv
 augmented_test_try1_model_imputation.csv
 augmented_train.csv
 augmented_train_try1.csv
 augmented_train_try1_model_imputation.csv
 clf_cat2.sav
 clf_cat3.sav
 clf_name.sav
'Data Dictionary.xlsx'
 Data_Dictionary.xlsx
 file_name.csv
 hist_authorized_flag_enc.npy
 hist_category_1_enc.npy
 hist_category_3_enc.npy
 historical_transactions.csv
 historical_transactions_df.pkl
 hist_trans_preprocessed.csv
 knn_sal_12.sav
 knn_sal_3.sav
 knn_sal_6.sav
 LR_cat_2.sav
 mer_authorized_flag_enc.npy
 mer_category_1_enc.npy
 mer_category_3_enc.npy
 merchant_category_1_enc.npy
 merchant_category_4_enc.npy
 merchant_most_recent_purchases_range_enc.npy
 merchant_most_recent_sales_range_enc.npy
 merchants.csv
 merchants_preprocessed.csv
 merged_transactions_with_merchants.csv
 new_merchant_authorized_flag_enc.npy
 new_merchant_category_1_enc.npy
 new_merchant_df.pkl
 newMerchants_trans_preprocessed.csv
 new_merchant_transactions.csv
 sample_submission.csv
 t

In [110]:
a

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,1,C_ID_415bb3a509,107,0,1,1,307,M_ID_b0c793002c,1,-0.557617,2018-03-11 14:57:36,1.0,9,19
1,1,C_ID_415bb3a509,140,0,1,1,307,M_ID_88920c89e8,1,-0.569336,2018-03-19 18:53:37,1.0,9,19
2,1,C_ID_415bb3a509,330,0,1,1,507,M_ID_ad5237ef6b,2,-0.551270,2018-04-26 14:08:44,1.0,9,14
3,1,C_ID_415bb3a509,-1,1,1,1,661,M_ID_9e84cda3b1,1,-0.671875,2018-03-07 09:43:21,1.0,-1,8
4,1,C_ID_ef55cf8d4b,-1,1,1,1,166,M_ID_3c86fa3831,1,-0.659668,2018-03-22 21:07:53,1.0,-1,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1963026,1,C_ID_1320dee851,142,0,0,0,309,M_ID_7754b67f3b,2,-0.701660,2018-04-06 14:36:52,3.0,19,21
1963027,1,C_ID_f112aa3381,158,0,0,0,560,M_ID_da063195b7,2,-0.694336,2018-03-07 13:19:18,1.0,15,34
1963028,1,C_ID_bd97b86450,69,0,1,1,278,M_ID_9a9ccb6544,1,-0.621094,2018-03-05 12:04:56,1.0,9,37
1963029,1,C_ID_c0513fd84f,130,0,0,0,367,M_ID_40c28d596f,1,-0.656738,2018-03-09 14:47:05,3.0,7,16


In [111]:
a.isna().sum()

authorized_flag         0
card_id                 0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
merchant_id             0
month_lag               0
purchase_amount         0
purchase_date           0
category_2              0
state_id                0
subsector_id            0
dtype: int64

In [112]:
b

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,1,C_ID_4e6213e9bc,88,0,0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,1,C_ID_4e6213e9bc,88,0,0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,1,C_ID_4e6213e9bc,88,0,0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,1,C_ID_4e6213e9bc,88,0,0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,1,C_ID_4e6213e9bc,88,0,0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29112356,1,C_ID_2863d2fa95,-1,1,1,1,360,M_ID_edd92b6720,-1,-0.632706,2017-01-20 08:52:04,1.0,-1,34
29112357,1,C_ID_2863d2fa95,-1,1,1,1,360,M_ID_edd92b6720,0,-0.632706,2017-02-20 04:40:50,1.0,-1,34
29112358,1,C_ID_5c240d6e3c,3,0,0,0,278,M_ID_9cdcfe8673,0,-0.657740,2017-12-26 18:37:51,1.0,16,37
29112359,1,C_ID_5c240d6e3c,331,0,0,0,514,M_ID_1a75f94f92,-1,0.770620,2017-11-24 14:18:15,1.0,16,9


In [113]:
b.isna().sum()

authorized_flag         0
card_id                 0
city_id                 0
category_1              0
installments            0
category_3              0
merchant_category_id    0
merchant_id             0
month_lag               0
purchase_amount         0
purchase_date           0
category_2              0
state_id                0
subsector_id            0
dtype: int64

In [114]:
c

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057465,-0.057465,1,4,4,-0.400000,9.664062,3,-2.250000,18.671875,6,-2.320000,13.914062,12,1,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057465,-0.057465,1,4,4,-0.720000,1.750000,3,-0.740000,1.291992,6,-0.570000,1.687500,12,1,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057465,-0.057465,1,4,4,-82.129997,260.000000,2,-82.129997,260.000000,2,-82.129997,260.000000,2,1,-1,5,5.0
3,M_ID_a70e9c5f81,5026,792,9,-0.057465,-0.057465,0,4,4,13.832993,1.666992,3,21.650787,4.667969,6,25.227709,3.833984,12,0,-1,-1,1.0
4,M_ID_64456c37ce,2228,222,21,-0.057465,-0.057465,0,4,4,13.832993,0.500000,3,21.650787,0.361084,6,25.227709,0.347168,12,0,-1,-1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334691,M_ID_1f4773aa76,1145,705,33,3.173828,-0.047546,1,0,0,1.000000,1.022461,3,0.990000,1.019531,6,1.000000,1.024414,12,0,69,9,1.0
334692,M_ID_725a60d404,35,544,29,-0.057465,-0.057465,0,0,0,0.890000,0.927734,3,0.780000,0.813477,6,0.590000,0.606934,12,0,-1,-1,1.0
334693,M_ID_f2045dd267,35,561,7,-0.057465,-0.057465,1,0,0,0.960000,0.982910,3,0.900000,0.924805,6,0.740000,0.750977,8,0,160,21,5.0
334694,M_ID_9139332ccc,35,511,7,-0.057465,-0.057465,0,0,0,0.940000,0.919434,3,0.820000,0.783203,6,0.650000,0.583984,12,0,-1,-1,1.0


In [115]:
c.isna().sum()

merchant_id                    0
merchant_group_id              0
merchant_category_id           0
subsector_id                   0
numerical_1                    0
numerical_2                    0
category_1                     0
most_recent_sales_range        0
most_recent_purchases_range    0
avg_sales_lag3                 0
avg_purchases_lag3             0
active_months_lag3             0
avg_sales_lag6                 0
avg_purchases_lag6             0
active_months_lag6             0
avg_sales_lag12                0
avg_purchases_lag12            0
active_months_lag12            0
category_4                     0
city_id                        0
state_id                       0
category_2                     0
dtype: int64

#References:
1. https://heartbeat.fritz.ai/data-handling-scenarios-part-2-working-with-missing-values-in-a-dataset-34b758cfc9fa
