<a href="https://colab.research.google.com/github/human-ai2025/Elo-Merchant-Recommendation/blob/master/2_merging_data_elo_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import Libraries 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import gc
from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
import datetime
import time
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import mode

In [2]:
#Mounting drive 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Setting up worksapce directory 
%cd /content/drive/MyDrive/data 

/content/drive/MyDrive/data


In [69]:
#refer:-https://www.youtube.com/watch?v=vOMtQ4ocMGI

@contextmanager
def timer(title):
    """ used to calculate time for each function"""
    t0 = time.time()
    yield
    print("{} - done in {:.000f}s".format(title, time.time() - t0))

In [70]:
#https://www.kaggle.com/fabiendaniel/elo-world
#Function to load data into pandas and reduce memory usage

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    #loop for alll the columns in the dataframe 
    for col in df.columns:
        #get the datatype of the column
        col_type = df[col].dtypes
        #if the data type is numeric then only start changing the datatype
        #as it isnt much helpful for other data types 
        if col_type in numerics:
            #stores the min value of the column 
            c_min = df[col].min()
            #stores the maximum value of the column
            c_max = df[col].max()
            #for int type numerics
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            #for float type numerics 
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [71]:
#Refer:-https://stackoverflow.com/questions/36631163/what-are-the-pros-and-cons-between-get-dummies-pandas-and-onehotencoder-sciki
def one_hot_encoder(df, nan_as_category=True):
    """used to create the one hot encoding of the categorical variables """
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    print("The columns on which one hot encoding is performed is ",categorical_columns)
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [72]:
def train_test(num_rows=None):

    # load csv
    train_df = pd.read_csv('/content/drive/MyDrive/data/train.csv', index_col=['card_id'], nrows=num_rows)
    test_df = pd.read_csv('/content/drive/MyDrive/data/test.csv', index_col=['card_id'], nrows=num_rows)

    print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))
    OUTLIER_THRESHOLD = 30
    # Create an outliers column set to 1 for
    train_df['outliers'] = np.where(train_df['target'] < OUTLIER_THRESHOLD, 1, 0)

    # set target as nan
    test_df['target'] = np.nan

    # merge
    df = train_df.append(test_df)

    del train_df, test_df
    gc.collect()

    # to datetime
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])

    # datetime features
    df['quarter'] = df['first_active_month'].dt.quarter
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days
    df['quarter_first_active_month'] = df['first_active_month'].dt.quarter
    df['first_active_month_diff_from_today'] = (datetime.datetime.today() - df['first_active_month']).dt.days


    # one hot encoding
    df, cols = one_hot_encoder(df, nan_as_category=False)

    for col in ['feature_1', 'feature_2', 'feature_3']:
        order_label = df.groupby(col)['outliers'].mean()
        df[col] = df[col].map(order_label)

    # Some basic statistics transformations over the feature_i columns
    df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
    df['feature_mean'] = df['feature_sum'] / 3
    df['feature_max'] = df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
    df['feature_min'] = df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
    df['feature_std'] = df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)

    return df

In [198]:
def transactions_imputations(df):

    # fillna
    df['category_2'].fillna(6, inplace=True)
    df['category_3'].fillna(3, inplace=True)
    df['merchant_id'].fillna('M_ID_00a6ca8a8a', inplace=True)
    df['installments'].replace(-1, np.nan, inplace=True)
    df['installments'].replace(999, np.nan, inplace=True)
    df['installments'].fillna(df['installments'].mode()[0], inplace=True)



    # mapping categorical to numerical 
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    df['category_3'] = df['category_3'].map({'A': 0, 'B': 1,'C': 2, 3:3}).astype(int)
    df['category_2'] = df['category_2'].map({1.0 : 1, 2.0 : 2, 3.0 : 3,4.0 : 4, 5.0 : 5, 6 : 6}).astype(int)
    #removing purchase amount outliner    
    df['purchase_amount'] = df['purchase_amount'].apply(lambda x: min(x, 0.8))
    df['price'] = df['purchase_amount'] / (df['installments'] + 0.001) #some epsilone for 0  installments 

    # reduce memory usage
    #df = reduce_mem_usage(df)

    return df


In [199]:
def merchant_imputations(num_rows=None):
    # load csv
    df = pd.read_csv('/content/drive/MyDrive/data/merchants.csv', nrows=num_rows)

    #drop duplicate merchant id
    df.drop_duplicates(subset=['merchant_id'], keep='last')

    df['category_1'] = df['category_1'].fillna(2)
    df['category_1'] = df['category_1'].map({'Y': 0, 'N': 1,2:2}).astype(int)

    #for most_recent_sales_range_merchants_t_merchants_t and most_recent_purchases_range_merchants_t_merchants_t
    #it has A,B,C,D,E and null values 
    df['most_recent_sales_range'] = df['most_recent_sales_range'].fillna(5)
    df['most_recent_purchases_range'] = df['most_recent_purchases_range'].fillna(5)
    df['most_recent_sales_range'] = df['most_recent_sales_range'].map({'A': 0, 'B': 1,'C': 2, 'D': 3,'E' : 4, 5 : 5}).astype(int)
    df['most_recent_purchases_range'] = df['most_recent_purchases_range'].map({'A': 0, 'B': 1,'C': 2, 'D': 3,'E' : 4, 5 : 5}).astype(int)
    

    #for category_4_merchants_t_merchants_t
    #it has Y,N and null values 
    df['category_4'] = df['category_4'].fillna(2)
    df['category_4'] = df['category_4'].map({'Y': 0, 'N': 1, 2 : 2}).astype(int)

    df['category_2'] = df['category_2'].fillna(6)
    df['category_2'] = df['category_2'].map({1.0 : 1, 2.0 : 2, 3.0 : 3,4.0 : 4, 5.0 : 5, 6 : 6}).astype(int)

    #for missing id we will use -1111 as filling value 
    #find the number of missing values 
    for col in df[['merchant_group_id','state_id',
                            'merchant_category_id','subsector_id','city_id']]:
                            df[col] = df[col].fillna(-1111)

    #imputing the inf values with max 
    features_inf = ["avg_purchases_lag3","avg_purchases_lag6","avg_purchases_lag12"]
    for col in features_inf:
        df.loc[df[col]==np.inf,col] = max(df.loc[df[col]!=np.inf,col])

    df.columns = [col+"_merchants_t" if col!="merchant_id" else col for col in df.columns]

    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

In [200]:
def group_on_card_id_with_mer(df):
    """
    FUNCTION:
    To group the data on card id  

    ARGS:
    df is the data frame on which grouping needs to be performed 

    RETURNS:
    returns the data frame after grouping 
    
    """

    #Some feature engineering on date and time  
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['month'] = df['purchase_date'].dt.month
    df['day'] = df['purchase_date'].dt.day
    df['hour'] = df['purchase_date'].dt.hour
    df['weekofyear'] = df['purchase_date'].dt.weekofyear
    df['weekday'] = df['purchase_date'].dt.weekday
    df['weekend'] = (df['purchase_date'].dt.weekday >= 5).astype(int)

    #for id 
    col_id = ['city_id','merchant_id','state_id',
                            'subsector_id','merchant_group_id_merchants_t','state_id_merchants_t',
                            'merchant_category_id_merchants_t','subsector_id_merchants_t','city_id_merchants_t']
    #for numerical 
    col_numerical = ['month_lag', 'purchase_amount', 'month', 'hour', 'weekofyear', 'weekday', 'day',
                      'numerical_2_merchants_t','avg_sales_lag3_merchants_t',
                      'avg_purchases_lag3_merchants_t', 'active_months_lag3_merchants_t',
                      'avg_sales_lag6_merchants_t', 'avg_purchases_lag6_merchants_t',
                      'active_months_lag6_merchants_t', 'avg_sales_lag12_merchants_t',
                      'avg_purchases_lag12_merchants_t', 'active_months_lag12_merchants_t']

    aggs = {}
    for col in col_id:
        aggs[col] = ['nunique']

    for col in col_numerical:
        aggs[col] = ['nunique', 'mean', 'min', 'max']

    aggs['purchase_amount'] = ['sum', 'max', 'min', 'mean']
    aggs['installments'] = ['sum', 'max', 'mean']
    aggs['purchase_date'] = ['max', 'min']
    aggs['month_lag'] = ['max', 'min', 'mean']
    aggs['authorized_flag'] = ['mean']
    aggs['category_1'] = ['mean']
    aggs['category_2'] = ['mean']
    aggs['category_3'] = ['mean']
    aggs['category_1_merchants_t'] = ['mean']
    aggs['category_2_merchants_t'] = ['mean']
    aggs['category_4_merchants_t'] = ['mean']
    aggs['most_recent_sales_range_merchants_t'] = ['mean']
    aggs['most_recent_purchases_range_merchants_t'] = ['mean']

    df = df.reset_index().groupby('card_id').agg(aggs)

    # change column name
    df.columns = pd.Index([e[0] + "_" + e[1] for e in df.columns.tolist()])
    df.columns = ['new_' + c for c in df.columns]
    
    # reduce memory usage
    df = reduce_mem_usage(df)

    return df

In [246]:
def group_on_card_id(df):
    """
    FUNCTION:
    To group the data on card id  

    ARGS: 
    df is the data frame on which grouping needs to be performed 

    RETURNS:
    returns the data frame after grouping 
    
    """

    #Some feature engineering on date and time  
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['month'] = df['purchase_date'].dt.month
    df['day'] = df['purchase_date'].dt.day
    df['hour'] = df['purchase_date'].dt.hour
    df['weekofyear'] = df['purchase_date'].dt.weekofyear
    df['weekday'] = df['purchase_date'].dt.weekday
    df['weekend'] = (df['purchase_date'].dt.weekday >= 5).astype(int)

    mode = lambda x: x.value_counts().index[0]
    range2 = lambda x: np.nanmax(x)-np.nanmin(x)

    date_range = lambda x: (x.max()-x.min())/np.timedelta64(1,'D')
    days_to_next_purchase_mean = lambda x: np.mean([(np.sort(x)[i+1]-np.sort(x)[i])/np.timedelta64(1,'D') \
                                                    for i in range(x.shape[0]-1)])
    days_to_next_purchase_std = lambda x: np.std([(np.sort(x)[i+1]-np.sort(x)[i])/np.timedelta64(1,'D') \
                                                  for i in range(x.shape[0]-1)])

    agg_fun = {"authorized_flag": ['sum', 'mean',mode],
    'category_1' : ['sum', 'mean',mode],
    'category_2' :['sum', 'mean',mode],
    'category_3' :['sum', 'mean',mode],
    'city_id' : ['nunique',mode],
    'state_id' : ['nunique',mode],
    'subsector_id' : ['nunique',mode],
    'merchant_category_id' : ['nunique',mode],
    'merchant_id': ['nunique',mode],
    'month_lag' : ['sum', 'mean', 'min', 'max', 'std','var'],
    'installments' : ['sum', 'mean', 'min', 'max', 'std','var'],
    'purchase_amount' : ['sum', 'mean', 'min', 'max', 'std','var'],
    'price': ['sum', 'mean', 'min', 'max', 'var', 'skew'],

    'weekend': ['sum', 'mean'],
    'weekday' : ['nunique', 'sum', 'mean'],
    'hour': ['nunique', 'mean', 'min', 'max'],
    'weekofyear': ['nunique', 'mean', 'min', 'max'],
    'day': ['nunique', 'sum', 'min'],
    #Refer:-https://numpy.org/doc/stable/reference/generated/numpy.ptp.html
    'purchase_date' : [np.ptp, 'min', 'max'],
    'month' : ['sum', 'mean', 'nunique']
    }
    df = df.groupby("card_id",as_index=False).agg(agg_fun)


    # change column name
    df.columns = pd.Index([e[0] + "_" + e[1]  for e in df.columns.tolist()])
    df.columns = ['df_' + c for c in df.columns]
    
    # reduce memory usage
    #df = reduce_mem_usage(df)
  

    return df

In [256]:
def generate_augmented_train_test_1(debug=False):
    """ Generate train and test augmented datasets. """
    num_rows = 100 if debug else None
    with timer("train & test"):
        df = train_test(num_rows).reset_index()
    with timer("transactions"):
        # load csv
        new_merchant_df = pd.read_csv('/content/drive/MyDrive/data/new_merchant_transactions.csv', nrows=num_rows)
        historical_transactions_df = pd.read_csv('/content/drive/MyDrive/data/historical_transactions.csv', nrows=num_rows)
        df_1 = pd.concat([historical_transactions_df, new_merchant_df], ignore_index=True)
        df_1 = transactions_imputations(df_1)
        df = pd.merge(df, df_1, right_on='card_id',left_on='df_card_id_', how='left')
        del new_merchant_df
        del historical_transactions_df
        gc.collect()

    with timer("split train & test"):
        train_df = df[df['target'].notnull()]
        test_df = df[df['target'].isnull()]
        del test_df['target']
        del df
        gc.collect()
    with timer("Save train and test files"):

        train_df.to_csv('/content/drive/MyDrive/data/augmented_train.csv', index=False)
        test_df.to_csv('/content/drive/MyDrive/data/augmented_test.csv', index=False)

In [265]:
def generate_augmented_train_test(debug=False):
    """ Generate train and test augmented datasets. """
    num_rows = 100 if debug else None
    with timer("transactions"):
        # load csv
        new_merchant_df = pd.read_csv('/content/drive/MyDrive/data/new_merchant_transactions.csv', nrows=num_rows)
        historical_transactions_df = pd.read_csv('/content/drive/MyDrive/data/historical_transactions.csv', nrows=num_rows)
        df = pd.concat([historical_transactions_df, new_merchant_df], ignore_index=True)
        del new_merchant_df
        del historical_transactions_df
        df = transactions_imputations(df)
        gc.collect()
    #with timer("merchants"):
    #    merchants_df = merchant_imputations(num_rows).reset_index()
    #    print(merchants_df.columns)
    #    print(df.columns)
    #    df = pd.merge(df, merchants_df, on='merchant_id', how='left')
    with timer("group_on_card_id"):
        df = group_on_card_id(df)
    with timer("train & test"):
        tt = train_test(num_rows).reset_index()
        print(tt.columns)
        print(df.columns)
        print(df.isnull().sum(axis = 0))
        df = pd.merge(df, tt, right_on='card_id',left_on='df_card_id_', how='right')
    with timer("split train & test"):
        train_df = df[df['target'].notnull()]
        test_df = df[df['target'].isnull()]
        del test_df['target']
        del df
        gc.collect()
    with timer("Save train and test files"):
        train_df.to_csv('/content/drive/MyDrive/data/augmented_train.csv', index=False)
        test_df.to_csv('/content/drive/MyDrive/data/augmented_test.csv', index=False)


In [266]:
if __name__ == "__main__":
  generate_augmented_train_test(True)

transactions - done in 0s
group_on_card_id - done in 0s
Train samples: 100, test samples: 100
The columns on which one hot encoding is performed is  []
Index(['card_id', 'first_active_month', 'feature_1', 'feature_2', 'feature_3',
       'target', 'outliers', 'quarter', 'elapsed_time', 'feature_sum',
       'feature_mean', 'feature_max', 'feature_min', 'feature_std'],
      dtype='object')
Index(['df_card_id_', 'df_authorized_flag_sum', 'df_authorized_flag_mean',
       'df_authorized_flag_<lambda_0>', 'df_category_1_sum',
       'df_category_1_mean', 'df_category_1_<lambda_0>', 'df_category_2_sum',
       'df_category_2_mean', 'df_category_2_<lambda_0>', 'df_category_3_sum',
       'df_category_3_mean', 'df_category_3_<lambda_0>', 'df_city_id_nunique',
       'df_city_id_<lambda_0>', 'df_state_id_nunique',
       'df_state_id_<lambda_0>', 'df_subsector_id_nunique',
       'df_subsector_id_<lambda_0>', 'df_merchant_category_id_nunique',
       'df_merchant_category_id_<lambda_0>', 'df_

In [267]:
train_df = pd.read_csv('/content/drive/MyDrive/data/augmented_train.csv', nrows=200)

In [268]:
train_df.isnull().sum(axis = 0)

df_card_id_                      100
df_authorized_flag_sum           100
df_authorized_flag_mean          100
df_authorized_flag_<lambda_0>    100
df_category_1_sum                100
                                ... 
feature_sum                        0
feature_mean                       0
feature_max                        0
feature_min                        0
feature_std                        0
Length: 83, dtype: int64

In [269]:
train_df

Unnamed: 0,df_card_id_,df_authorized_flag_sum,df_authorized_flag_mean,df_authorized_flag_<lambda_0>,df_category_1_sum,df_category_1_mean,df_category_1_<lambda_0>,df_category_2_sum,df_category_2_mean,df_category_2_<lambda_0>,df_category_3_sum,df_category_3_mean,df_category_3_<lambda_0>,df_city_id_nunique,df_city_id_<lambda_0>,df_state_id_nunique,df_state_id_<lambda_0>,df_subsector_id_nunique,df_subsector_id_<lambda_0>,df_merchant_category_id_nunique,df_merchant_category_id_<lambda_0>,df_merchant_id_nunique,df_merchant_id_<lambda_0>,df_month_lag_sum,df_month_lag_mean,df_month_lag_min,df_month_lag_max,df_month_lag_std,df_month_lag_var,df_installments_sum,df_installments_mean,df_installments_min,df_installments_max,df_installments_std,df_installments_var,df_purchase_amount_sum,df_purchase_amount_mean,df_purchase_amount_min,df_purchase_amount_max,df_purchase_amount_std,...,df_price_min,df_price_max,df_price_var,df_price_skew,df_weekend_sum,df_weekend_mean,df_weekday_nunique,df_weekday_sum,df_weekday_mean,df_hour_nunique,df_hour_mean,df_hour_min,df_hour_max,df_weekofyear_nunique,df_weekofyear_mean,df_weekofyear_min,df_weekofyear_max,df_day_nunique,df_day_sum,df_day_min,df_purchase_date_ptp,df_purchase_date_min,df_purchase_date_max,df_month_sum,df_month_mean,df_month_nunique,card_id,first_active_month,feature_1,feature_2,feature_3,target,outliers,quarter,elapsed_time,feature_sum,feature_mean,feature_max,feature_min,feature_std
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_92a2005557,2017-06-01,1.0,1.0,1.0,-0.820283,1.0,2,1291,3.0,1.0,1.0,1.0,0.0
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_3d0044924f,2017-01-01,1.0,1.0,1.0,0.392913,1.0,1,1442,3.0,1.0,1.0,1.0,0.0
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_d639edf6cd,2016-08-01,1.0,1.0,1.0,0.688056,1.0,3,1595,3.0,1.0,1.0,1.0,0.0
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_186d6a6901,2017-09-01,1.0,1.0,1.0,0.142495,1.0,3,1199,3.0,1.0,1.0,1.0,0.0
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_cdbd2c0db2,2017-11-01,1.0,1.0,1.0,-0.159749,1.0,4,1138,3.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_d9388844f8,2016-08-01,1.0,1.0,1.0,-0.693928,1.0,3,1595,3.0,1.0,1.0,1.0,0.0
96,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_24b7e16c4f,2017-01-01,1.0,1.0,1.0,-2.788379,1.0,1,1442,3.0,1.0,1.0,1.0,0.0
97,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_a8ad576b32,2017-03-01,1.0,1.0,1.0,-2.939943,1.0,1,1383,3.0,1.0,1.0,1.0,0.0
98,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,C_ID_26364d47bb,2014-08-01,1.0,1.0,1.0,-5.288426,1.0,3,2326,3.0,1.0,1.0,1.0,0.0


In [270]:
#Gives the name of all the variables with missing data 

columns_with_na = [var for var in train_df.columns if train_df[var].isnull().mean()  > 0]

In [271]:
len(columns_with_na)

69

In [254]:
#lets find out the percentage of observations missing per variable

#calculate the percentage of missing 
data_na = train_df[columns_with_na].isnull().mean()

#transform the array to dataframe 
data_na = pd.DataFrame(data_na.reset_index())

#add names to the dataframe 
data_na.columns = ['col','percentage_na']

#oreder the dataframe acc to percentage 
data_na.sort_values(by = 'percentage_na',ascending = False, inplace = True)

#show
data_na

Unnamed: 0,col,percentage_na
0,df_card_id_,1.0
44,df_price_max,1.0
50,df_weekday_sum,1.0
49,df_weekday_nunique,1.0
48,df_weekend_mean,1.0
...,...,...
29,df_installments_sum,1.0
30,df_installments_mean,1.0
31,df_installments_min,1.0
32,df_installments_max,1.0


In [None]:
def lgb_train_fn(train_df, target, trn_cols,  n_fold):
    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=4590)
    # initialise out of fold preds to 0s.
    oof = np.zeros(len(train_df))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df,train_df['outliers'].values)):
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][trn_cols], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][trn_cols], label=target.iloc[val_idx])

        num_round = 10000
        clf = lgb.train(lgb_param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=0, early_stopping_rounds = 200)
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][trn_cols], num_iteration=clf.best_iteration)

    print(np.sqrt(mean_squared_error(oof, target)), 'CV score')
    return np.sqrt(mean_squared_error(oof, target))

In [None]:
for c in cols_to_add:
    lgb_cols = final_cols + [c]
    print(len(lgb_cols), 'lg_cols', c)
    score = lgb_train_fn(x, y, lgb_cols, 5)
    delta = base_score - score
    fe_d[c] = delta
    if delta > 0:
        base_score = score
        selected_cols.append(c)
        print('Selected cols', c)
        print('Selected col delta', delta)
        print(' score with col', score)
        np.save('selecte_cols_extra', selected_cols)
        final_cols = final_cols + [c]