<a href="https://colab.research.google.com/github/human-ai2025/Elo-Merchant-Recommendation/blob/master/preprocessing/Different_feature_joining_modified_3_using_model_imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [23]:
#Import Libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import gc
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import datetime
import time
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder

### Mount the drive 

In [4]:
# Mount Google Drive inside the Colab VM. The CSV paths used by the loaders
# further down are all under /content/drive/MyDrive/data, so this cell has to
# run before any of them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Setup the current path 

In [5]:
!ls

drive  sample_data


In [6]:
#Setting up workspace directory
%cd /content/drive/MyDrive/data 

/content/drive/MyDrive/data


In [7]:
!ls

 augmented_test.csv
 augmented_test_try1.csv
 augmented_train.csv
 augmented_train_try1.csv
 clf_cat2.sav
 clf_cat3.sav
'Data Dictionary.xlsx'
 Data_Dictionary.xlsx
 historical_transactions.csv
 historical_transactions_df.pkl
 merchant_category_1_enc.npy
 merchant_category_4_enc.npy
 merchant_most_recent_purchases_range_enc.npy
 merchant_most_recent_sales_range_enc.npy
 merchants.csv
 merged_transactions_with_merchants.csv
 new_merchant_authorized_flag_enc.npy
 new_merchant_category_1_enc.npy
 new_merchant_df.pkl
 new_merchant_transactions.csv
 sample_submission.csv
 test.csv
 train.csv
 transactions_raw_merged.csv
 transactions_refined_1_merged_WON.csv


## Timer 

In [8]:
#refer:-https://www.youtube.com/watch?v=vOMtQ4ocMGI
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Args:
        title: Label printed in front of the elapsed time.

    Yields:
        None. The elapsed time (rounded to whole seconds) is printed when the
        ``with`` body finishes normally; nothing is printed if the body raises.
    """
    # perf_counter() is monotonic, so the measured interval cannot jump or go
    # negative when the system clock is adjusted, unlike time.time().
    started = time.perf_counter()
    yield
    # "{:.0f}" is the same precision the previous "{:.000f}" spec parsed to.
    print("{} - done in {:.0f}s".format(title, time.perf_counter() - started))

## Memory Reduction 

In [9]:
#https://www.kaggle.com/fabiendaniel/elo-world
#Function to load data into pandas and reduce memory usage

def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    The frame is modified in place (columns are reassigned) and also returned.

    Args:
        df: DataFrame whose int/float columns should be downcast. Columns of
            any other dtype are left untouched.
        verbose: When True, print the final memory footprint and the
            percentage saved.
        allow_float16: When True (default, the historical behaviour) float
            columns whose range fits are stored as float16. float16 keeps only
            about three significant decimal digits, so pass False to stop at
            float32 when the values must stay precise.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range covers the
    # column wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        # Only numeric columns are worth downcasting.
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's own min/max
                # (e.g. 127 for int8) is representable, so it must not push
                # the column into the next wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range (or min/max are NaN): keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Label encoder

In [10]:
def lab_enc(df, cols, prefix=''):
    """Replace the given categorical columns with integer label codes.

    Each column is cast to ``str`` first and then passed through a
    ``LabelEncoder``; the encoded values overwrite the column on ``df`` itself.

    Args:
        df: DataFrame to encode; modified in place.
        cols: Names of the columns to label-encode.
        prefix: Accepted for the callers' convenience but not used here.

    Returns:
        The same DataFrame, with the listed columns encoded.

    Reference:
        https://towardsdatascience.com/choosing-the-right-encoding-method-label-vs-onehot-encoder-a4434493149b
    """
    encoder = LabelEncoder()
    for name in cols:
        # The encoder is re-fitted for every column, so codes are per-column.
        as_text = df[name].astype(str)
        df[name] = encoder.fit_transform(as_text)
    return df

## Imputations on transactions data 

In [52]:
def transactions_imputations(df_name='hist', nrows=None):
    """
    FUNCTION:
          Loads a transactions CSV and fills the missing values of
          ``category_2`` and ``category_3`` with model-based imputation
          (one LogisticRegression per column, trained on the complete rows).
          ``authorized_flag`` and ``category_1`` are label-encoded first so
          they can be used as features.

    ARGS:
          df_name: 'hist' loads historical_transactions.csv; any other value
              loads new_merchant_transactions.csv. The value is also used as
              the prefix of the files written by this function.
          nrows: number of rows to read from the CSV (for debugging).
              None reads the whole file.

    RETURNS:
          The imputed DataFrame. ``card_id``, ``merchant_id`` and
          ``purchase_date`` are kept out of the models and re-attached
          unchanged as the last three columns, so missing ``merchant_id``
          values are NOT filled.

    SIDE EFFECTS (written to the current working directory):
          <df_name>LR1.sav - pickled classifier for category_2
          <df_name>LR2.sav - pickled classifier for category_3
          <df_name>.csv    - the imputed frame
          Earlier revisions wrote every run to the literal names
          'clf_name.sav' and 'file_name.csv', so each write overwrote the
          previous one.

    REFER:-https://medium.com/towards-artificial-intelligence/handling-missing-data-for-advanced-machine-learning-b6eb89050357
    """

    import pickle
    from sklearn.linear_model import LogisticRegression

    def _fit_save_and_fill(frame, target, feature_cols, model_tag):
        """Fit a classifier for ``target`` on complete rows, pickle it, fill NaNs."""
        missing = frame[target].isna()
        # Train only on rows with no missing value in any remaining column.
        complete_rows = frame.dropna()
        model = LogisticRegression()
        model.fit(complete_rows[feature_cols], complete_rows[target])
        # Context manager so the file handle is closed even if pickling fails.
        with open(model_tag + '.sav', 'wb') as model_file:
            pickle.dump(model, model_file)
        # predict() rejects an empty input, which happens when a small debug
        # sample has nothing missing in this column.
        if missing.any():
            frame.loc[missing, target] = model.predict(frame.loc[missing, feature_cols])

    if df_name == 'hist':
        csv_path = '/content/drive/MyDrive/data/historical_transactions.csv'
    else:
        csv_path = '/content/drive/MyDrive/data/new_merchant_transactions.csv'
    df = pd.read_csv(csv_path, nrows=nrows)

    # reduce memory usage
    df = reduce_mem_usage(df)

    print(df.isna().sum())

    # Identifier/date columns are not model features; set them aside and put
    # them back after imputation.
    id_cols = ['card_id', 'merchant_id', 'purchase_date']
    held_out = df[id_cols].copy()
    df = df.drop(columns=id_cols)

    gc.collect()

    # Columns with missing values that get imputed below.
    targets = ['category_2', 'category_3']

    # label encode the variables
    df = lab_enc(df, ['authorized_flag', 'category_1'], prefix=df_name)
    # sample() raises when asked for more rows than exist (tiny debug loads).
    print(df.sample(min(10, len(df))))

    # Features are every remaining column except the two being imputed.
    feature_cols = [c for c in df.columns if c not in targets]

    clf_name = df_name + 'LR1'
    print("[INFO] NAME OF CLF ", clf_name)
    print("[INFO] Imputing Categorical 2 values ....")
    _fit_save_and_fill(df, 'category_2', feature_cols, clf_name)

    # category_2 is complete now, so the second training set only loses the
    # rows whose category_3 is missing.
    clf_name = df_name + 'LR2'
    print("[INFO] NAME OF CLF ", clf_name)
    print("[INFO] Imputing Categorical 3 values ....")
    _fit_save_and_fill(df, 'category_3', feature_cols, clf_name)

    for col in id_cols:
        df[col] = held_out[col]

    print(df.isna().sum())
    print("[INFO] SAVING ...")
    df.to_csv(df_name + '.csv', index=False)

    return df


## Imputations on Merchant Data

In [53]:
def merchant_imputations(num_rows=None):
    """
    FUNCTION:
          Loads merchants.csv and fills its missing values with model-based
          imputation:
            * rows whose ``avg_purchases_lag3`` is +inf are dropped;
            * ``category_4``, ``category_1``, ``most_recent_sales_range`` and
              ``most_recent_purchases_range`` are label-encoded;
            * ``avg_sales_lag3/6/12`` are filled with a 5-neighbour
              KNeighborsRegressor each;
            * ``category_2`` is filled with a LogisticRegression that uses
              every other column (including the imputed sales) as features.

    ARGS:
          num_rows: number of rows to read from the CSV (for debugging).
              None reads the whole file.

    RETURNS:
          The imputed DataFrame, with ``category_2`` and ``merchant_id``
          re-attached as the last two columns.

    SIDE EFFECTS (written to the current working directory):
          knn_sal_3.sav, knn_sal_6.sav, knn_sal_12.sav, LR_cat_2.sav -
          the pickled fitted models.

    REFER:-https://medium.com/towards-artificial-intelligence/handling-missing-data-for-advanced-machine-learning-b6eb89050357
    """

    import pickle
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsRegressor

    def _fit_save_and_fill(frame, target, feature_cols, model, model_path):
        """Fit ``model`` for ``target`` on complete rows, fill the NaNs, pickle it."""
        missing = frame[target].isna()
        # Train only on rows with no missing value in any column.
        complete_rows = frame.dropna()
        model.fit(complete_rows[feature_cols], complete_rows[target])
        # predict() rejects an empty input, which happens when a small debug
        # sample has nothing missing in this column.
        if missing.any():
            frame.loc[missing, target] = model.predict(frame.loc[missing, feature_cols])
        # Context manager so the file handle is closed even if pickling fails.
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    df = pd.read_csv('/content/drive/MyDrive/data/merchants.csv', nrows=num_rows)
    df = reduce_mem_usage(df)

    # Drop the rows with an infinite avg_purchases_lag3. The explicit copy
    # makes the result an independent frame, so the column assignments below
    # are not chained assignments on a filtered view.
    df = df[df['avg_purchases_lag3'] != np.inf].copy()

    # merchant_id is not a feature, and category_2 is imputed in a second
    # round; set both aside for now.
    held_out = df[['merchant_id', 'category_2']].copy()
    df = df.drop(columns=['merchant_id', 'category_2'])

    # categorical to numerical
    df = lab_enc(df, ['category_4', 'category_1', 'most_recent_sales_range', 'most_recent_purchases_range'], prefix='merchant')

    # Round 1: the sales columns, predicted from every column other than the
    # three sales columns themselves.
    sales_cols = ['avg_sales_lag3', 'avg_sales_lag6', 'avg_sales_lag12']
    feature_cols = [c for c in df.columns if c not in sales_cols]
    for lag, target in zip((3, 6, 12), sales_cols):
        _fit_save_and_fill(
            df,
            target,
            feature_cols,
            KNeighborsRegressor(n_neighbors=5),
            'knn_sal_{}.sav'.format(lag),
        )

    # Round 2: category_2, predicted from every other column.
    df['category_2'] = held_out['category_2']
    feature_cols = [c for c in df.columns if c != 'category_2']
    _fit_save_and_fill(df, 'category_2', feature_cols, LogisticRegression(), 'LR_cat_2.sav')

    df['merchant_id'] = held_out['merchant_id']

    return df

## Generating the augmented dataset

In [64]:
def imputations_mer_trans(debug=False):

    """
    FUNCTION:
      Runs the model-based imputation on the three raw tables.
      STEPS:-
      1. Load and impute historical transactions
      2. Load and impute new-merchant transactions
      3. Load and impute merchants

    ARGS:
      debug: when True only the first 10000 rows of each CSV are loaded,
             for faster iteration while debugging.

    RETURNS:
      Tuple ``(new_merchant_df, historical_transactions_df, merchant_df)``
      of the imputed frames.
    """

    num_rows = 10000 if debug else None

    with timer("load and impute transactions ...."):
        print("[INFO] LOADING AND IMPUTING HISTORICAL TRANSACTIONS ....")
        historical_transactions_df = transactions_imputations(df_name='hist', nrows=num_rows)
        # This message used to repeat "HISTORICAL" for the new-merchant load.
        print("[INFO] LOADING AND IMPUTING NEW MERCHANT TRANSACTIONS ....")
        new_merchant_df = transactions_imputations(df_name='mer', nrows=num_rows)

        # unique values of column of card id
        print("[INFO] THE UNIQUE VALUES OF CARD ID ...")
        print("FOR NEW MERCHANT ", new_merchant_df.card_id.nunique())
        print("FOR HISTORICAL TRANSACTIONS ", historical_transactions_df.card_id.nunique())
        print("[INFO] Check for Nan")
        print(historical_transactions_df.isna().sum())
        # The second check used to print the historical frame again, so the
        # new-merchant frame was never inspected.
        print(new_merchant_df.isna().sum())

    with timer("load and impute merchants ...."):
        print("[INFO] LOADING MERCHANTS AND IMPUTATIONS  ....")
        merchant_df = merchant_imputations(num_rows=num_rows)

        print("[INFO] Check for Nan ...")
        print(merchant_df.isna().sum())

    return new_merchant_df, historical_transactions_df, merchant_df

## Main Method

In [66]:
if __name__ == "__main__":
    # Run the whole imputation pipeline and keep the three imputed frames
    # for inspection in the cells that follow.
    (
        new_merchant_df,
        historical_transactions_df,
        merchant_df,
    ) = imputations_mer_trans(debug=False)

[INFO] LOADING AND IMPUTING HISTORICAL TRANSACTIONS ....
Mem. usage decreased to 1749.11 Mb (43.7% reduction)
authorized_flag               0
card_id                       0
city_id                       0
category_1                    0
installments                  0
category_3               178159
merchant_category_id          0
merchant_id              138481
month_lag                     0
purchase_amount               0
purchase_date                 0
category_2              2652864
state_id                      0
subsector_id                  0
dtype: int64
          authorized_flag  city_id  ...  state_id  subsector_id
2163503                 1       20  ...        19            37
12951525                1      291  ...         9            27
9215813                 1       69  ...         9            34
15600977                1      158  ...        15            27
2312213                 1      313  ...         5            27
1712986                 1       69  ...      

In [67]:
# Missing values left per column in the imputed new-merchant transactions.
# merchant_id is set aside before imputation and re-attached unchanged, so
# its NaNs are still present.
new_merchant_df.isna().sum()

authorized_flag             0
city_id                     0
category_1                  0
installments                0
category_3                  0
merchant_category_id        0
month_lag                   0
purchase_amount             0
category_2                  0
state_id                    0
subsector_id                0
card_id                     0
merchant_id             26216
purchase_date               0
dtype: int64

In [69]:
# Missing values left per column in the imputed historical transactions.
# merchant_id is set aside before imputation and re-attached unchanged, so
# its NaNs are still present.
historical_transactions_df.isna().sum()

authorized_flag              0
city_id                      0
category_1                   0
installments                 0
category_3                   0
merchant_category_id         0
month_lag                    0
purchase_amount              0
category_2                   0
state_id                     0
subsector_id                 0
card_id                      0
merchant_id             138481
purchase_date                0
dtype: int64

In [70]:
# Missing values left per column in the imputed merchants table.
merchant_df.isna().sum()

merchant_group_id              0
merchant_category_id           0
subsector_id                   0
numerical_1                    0
numerical_2                    0
category_1                     0
most_recent_sales_range        0
most_recent_purchases_range    0
avg_sales_lag3                 0
avg_purchases_lag3             0
active_months_lag3             0
avg_sales_lag6                 0
avg_purchases_lag6             0
active_months_lag6             0
avg_sales_lag12                0
avg_purchases_lag12            0
active_months_lag12            0
category_4                     0
city_id                        0
state_id                       0
category_2                     0
merchant_id                    0
dtype: int64