In [None]:
import pandas as pd
import numpy as np
import pickle
pd.set_option('display.max_columns',30)
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict
import random
from sklearn.metrics import f1_score,roc_auc_score
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
from sklearn.model_selection import StratifiedKFold,GroupKFold
from rfpimp import *
from xgboost import XGBClassifier
import networkx as nx #create and store graph
from node2vec import Node2Vec
from catboost import CatBoostClassifier

In [None]:
# Load the pickled training frames (demo table and history table).
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
with open('train_demo.pkl','rb') as f:
    df_train_demo = pickle.load(f)
with open('train_hist.pkl','rb') as f:
    df_train_hist = pickle.load(f)

In [None]:
# Load the pickled test frames (demo table and history table).
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
with open('test_demo.pkl','rb') as f:
    df_test_demo = pickle.load(f)
with open('test_hist.pkl','rb') as f:
    df_test_hist = pickle.load(f)

In [None]:
mlb = MultiLabelBinarizer()

In [None]:
# Earliest DisbursalDate per ID from the train demo table, attached to every
# history row of that ID (left join, so history rows without a demo match get NaT).
DisbursalDate=df_train_demo.groupby(['ID'],as_index=False)['DisbursalDate'].min()
train_bureau=df_train_hist.merge(DisbursalDate,on=['ID'],how='left')
# Train-only subset: history rows disbursed on or before that date.
train_bureau_updated=train_bureau[train_bureau['DISBURSED-DT'] <= train_bureau['DisbursalDate']]
# Same join for test; the name DisbursalDate is rebound to the test lookup here,
# and no date filter is applied to the test history in this cell.
DisbursalDate=df_test_demo.groupby(['ID'],as_index=False)['DisbursalDate'].min()
test_bureau=df_test_hist.merge(DisbursalDate,on=['ID'],how='left')

In [None]:
# Tag each history frame with its origin, then stack train on top of test.
train_bureau['train_or_test']='train'
test_bureau['train_or_test']='test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same stacked frame (original indices are kept).
df_bureau=pd.concat([train_bureau,test_bureau])

In [None]:
# Signed gap between each history row's DISBURSED-DT and the ID's DisbursalDate:
# first in days, then divided by 30.71 to approximate months.
# NOTE(review): 30.71 is repeated as a literal throughout the notebook; the mean
# Gregorian month is ~30.44 days — confirm the intended divisor.
df_bureau['Time_Difference']=(df_bureau['DISBURSED-DT']-df_bureau['DisbursalDate']).dt.days
df_bureau['Time_Difference']=df_bureau['Time_Difference']/30.71
# df_bureau1: rows at least 12 of those months after DisbursalDate.
df_bureau1=df_bureau[df_bureau['Time_Difference']>=12]
# df_bureau2: rows on or before DisbursalDate.
df_bureau2=df_bureau[df_bureau['Time_Difference']<=0]

In [None]:
#Train
train_with_future=df_train_demo[df_train_demo['ID'].isin(df_bureau1['ID'].unique())]
train_bureau_with_future=train_bureau[train_bureau['ID'].isin(df_bureau1['ID'].unique())]
train_without_future=df_train_demo.copy()
train_bureau_without_future=train_bureau_updated.copy()

In [None]:
#Test
test_with_future=df_test_demo[df_test_demo['ID'].isin(df_bureau1['ID'].unique())]
test_bureau_with_future=test_bureau[test_bureau['ID'].isin(df_bureau1['ID'].unique())]

test_without_future=df_test_demo[df_test_demo['ID'].isin(df_bureau2['ID'].unique())]
test_bureau_without_future=test_bureau[test_bureau['ID'].isin(df_bureau2['ID'].unique())]
test_without_future=df_test_demo[~df_test_demo['ID'].isin(df_bureau1['ID'].unique())]
test_bureau_without_future=test_bureau[~test_bureau['ID'].isin(df_bureau1['ID'].unique())]

In [None]:
# Sanity check: test rows belonging to neither split (expected to be empty).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_test_demo[~df_test_demo['ID'].isin(pd.concat([test_with_future,test_without_future])['ID'].unique())]

In [None]:
# Integer encodings applied later with Series.map to the demo columns
# Frequency, InstlmentMode, LoanStatus and Top-up Month.
# Values not present as keys map to NaN, so the keys must match the raw strings
# exactly — presumably including the 'Quatrly' spelling and the leading space
# in ' > 48 Months'; verify against the data before "correcting" them.
freq_dict = {'Half Yearly': 4, 'Monthly': 2, 'Quatrly': 3, 'BI-Monthly': 1}
instl_dict = {'Arrear': 0, 'Advance': 1}
loan_dict = {'Closed': 0, 'Active': 1}
# Target classes 0..6; top_up_dict_rev near the end of the notebook is the inverse.
top_up_dict = {'No Top-up Service': 0,
 ' > 48 Months': 6,
 '36-48 Months': 5,
 '24-30 Months': 3,
 '30-36 Months': 4,
 '18-24 Months': 2,
 '12-18 Months': 1}

In [None]:
# Stack the train and test "with future" subsets so that feature engineering is
# applied to both at once. pd.concat replaces DataFrame.append (removed in
# pandas 2.0) and yields the same row order and index.
df_final = pd.concat([train_with_future, test_with_future])
df_final_hist = pd.concat([train_bureau_with_future, test_bureau_with_future])

In [None]:
df_final.shape,df_final_hist.shape

In [None]:
df_final_hist[df_final_hist['ID'] == 3].sort_values(by = ['DISBURSED-DT'])

In [None]:
df_final[df_final['LoanStatus'] == 'Active']['Top-up Month'].value_counts()

In [None]:
df_final_hist[df_final_hist['ID'] == 3].sort_values(by='DISBURSED-DT')

In [None]:
df_final_hist['ACCOUNT-STATUS'].value_counts()

In [None]:
# Collapse the raw ACCOUNT-STATUS values into three groups: BadLoan, Closed and
# Active. Values not listed here are left unchanged.
# The result is assigned back instead of using inplace=True on the column
# selection: chained in-place updates raise warnings on recent pandas and do not
# modify the parent frame when copy-on-write is enabled.
df_final_hist['ACCOUNT-STATUS'] = df_final_hist['ACCOUNT-STATUS'].replace({
    'Delinquent':'BadLoan',
    'Suit Filed':'BadLoan',
    'Settled':'Closed',
    'SUIT FILED (WILFUL DEFAULT)':'BadLoan',
    'WILFUL DEFAULT':'BadLoan',
    'Cancelled':'BadLoan',
    'Restructured':'Active'})

In [None]:
# Label-encode the categorical demo columns.
# 'City' used to appear twice in this list, so it was encoded and then
# re-encoded from the string form of its own integer codes; each column is now
# encoded exactly once.
cols_encode = ['SEX','City','Area','BranchID','SupplierID','ZiPCODE']
for i in cols_encode:
    le = LabelEncoder()
    # astype(str) already turns missing values into the literal 'nan', which is
    # why the former fillna('NaN') after it never had any effect; missing values
    # therefore keep getting their own 'nan' label.
    df_final[i] = df_final[i].astype(str)
    df_final[i] = le.fit_transform(df_final[i])

In [None]:
# Replace the raw category strings with their integer codes. Values missing from
# a mapping become NaN (this is how unlabeled rows end up with a null target).
_ordinal_maps = {
    'Frequency': freq_dict,
    'InstlmentMode': instl_dict,
    'LoanStatus': loan_dict,
    'Top-up Month': top_up_dict,
}
for _column, _mapping in _ordinal_maps.items():
    df_final[_column] = df_final[_column].map(_mapping)

In [None]:
def evaluate_macroF1_lgb(truth, predictions):
    """Custom LightGBM eval metric: macro-averaged F1 for multi-class models.

    Parameters
    ----------
    truth : array-like
        True labels for the evaluation set.
    predictions : array-like
        Class scores. Either a flat array laid out class by class (every score
        for class 0, then class 1, ...) or a 2-D array with one row per sample
        and one column per class.

    Returns
    -------
    tuple
        ``('macroF1', score, True)``; the last element marks the metric as
        higher-is-better.
    """
    # this follows the discussion in https://github.com/Microsoft/LightGBM/issues/1483
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        # Already (n_samples, n_classes): pick the best class per row.
        pred_labels = predictions.argmax(axis=1)
    else:
        # Flat class-major layout: one row per class after the reshape.
        # The class count is taken from the labels present in `truth`.
        pred_labels = predictions.reshape(len(np.unique(truth)),-1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

In [None]:
# Hand-built loan features on the demo frame.
# Loan length in days, from disbursal to maturity.
df_final['Days_to_mat'] = (df_final['MaturityDAte'] - df_final['DisbursalDate']).dt.days
# Part of the asset cost not covered by the disbursed amount.
df_final['Asset_min_Dis'] = df_final['AssetCost'] - df_final['DisbursalAmount']
# NOTE: despite the "Perce_"/"Per_" prefixes, the next four are plain ratios, not
# percentages; a zero denominator yields inf/NaN rather than raising.
df_final['Perce_EMI_Asset'] = df_final['AssetCost'] / df_final['EMI']
df_final['Perce_EMI_Dis'] = df_final['DisbursalAmount'] / df_final['EMI']
df_final['Money_Paid_Per_Month'] = df_final['DisbursalAmount']/df_final['Tenure']
df_final['Per_EMI_Monthly_income'] = df_final['MonthlyIncome']/df_final['EMI']
# EMI as a percentage of monthly income.
df_final['FOIR'] = (df_final['EMI']/df_final['MonthlyIncome'])*100

In [None]:
df_final = pd.get_dummies(df_final,columns=['ManufacturerID','State','PaymentMode'])

In [None]:
def get_quarter(date):
    """Return the April-based quarter number of ``date``.

    Apr-Jun -> 1, Jul-Sep -> 2, Oct-Dec -> 3, Jan-Mar -> 4. Returns None when
    ``date.month`` is not one of 1..12 (for example for NaT).
    """
    quarter_by_month = {
        1: 4, 2: 4, 3: 4,
        4: 1, 5: 1, 6: 1,
        7: 2, 8: 2, 9: 2,
        10: 3, 11: 3, 12: 3,
    }
    return quarter_by_month.get(date.month)
    
def is_quarter_start(date):
    """Return 1 if ``date`` falls in January, April, July or October, else 0."""
    return 1 if date.month in (1, 4, 7, 10) else 0
    
def is_quarter_end(date):
    """Return 1 if ``date`` falls in March, June, September or December, else 0."""
    return 1 if date.month in (3, 6, 9, 12) else 0

def is_month(date,is_start=False):
    """Flag dates near the start or the end of a month.

    With ``is_start`` true, returns 1 when the day of month is 5 or lower.
    With ``is_start`` false (the default), returns 1 when the day of month is
    25 or higher. Returns 0 otherwise, and None if ``is_start`` equals neither
    True nor False.
    """
    if is_start == True:
        return 1 if date.day <= 5 else 0
    if is_start == False:
        return 1 if date.day >= 25 else 0
    return None

In [None]:
# Calendar features for each demo date column. Each entry maps a column-name
# suffix to the function applied to every timestamp of that column.
dates = ['DisbursalDate','MaturityDAte','AuthDate']
_demo_calendar_features = {
    '_year': lambda ts: ts.year,
    '_month': lambda ts: ts.month,
    '_day': lambda ts: ts.day,
    '_dayofweek': lambda ts: ts.dayofweek,
    '_quarter': get_quarter,
    '_is_quarter_end': is_quarter_end,
    '_is_quarter_start': is_quarter_start,
    '_is_month_start': lambda ts: is_month(ts, is_start=True),
    # Suffix kept as-is: this column holds the end-of-month flag (day >= 25).
    '_is_month_false': lambda ts: is_month(ts, is_start=False),
}
for i in dates:
    for _suffix, _extract in _demo_calendar_features.items():
        df_final[i + _suffix] = df_final[i].apply(_extract)

In [None]:
df_final_hist[df_final_hist['ID'] == 123].sort_values(by='DISBURSED-DT')

In [None]:
df_final['DisbursalDate_year']

In [None]:
# df_final[['DisbursalDate_year','Top-up Month']].sort_values(by='DisbursalDate_year').head(50)

In [None]:
df_final_hist.shape

In [None]:
# Missing TENURE counts as 0. The result is assigned back rather than using
# inplace=True on a column selection, which is unreliable under copy-on-write.
df_final_hist['TENURE'] = df_final_hist['TENURE'].fillna(0)
# Keep only history rows whose ID has a DisbursalDate, drop exact duplicate
# rows, and renumber. Chaining avoids in-place edits on a filtered slice
# (SettingWithCopyWarning). reset_index() without drop=True still stores the old
# index in an 'index' column, exactly as before.
df_final_hist = (
    df_final_hist[df_final_hist['DisbursalDate'].notnull()]
    .drop_duplicates()
    .reset_index()
)

In [None]:
# Calendar features for the history disbursal date. Each entry maps a
# column-name suffix to the function applied to every timestamp.
dates = ['DISBURSED-DT']
_hist_calendar_features = {
    '_year': lambda ts: ts.year,
    '_month': lambda ts: ts.month,
    '_day': lambda ts: ts.day,
    '_dayofweek': lambda ts: ts.dayofweek,
    '_quarter': get_quarter,
    '_is_quarter_end': is_quarter_end,
    '_is_quarter_start': is_quarter_start,
    '_is_month_start': lambda ts: is_month(ts, is_start=True),
    # Suffix kept as-is: this column holds the end-of-month flag (day >= 25).
    '_is_month_false': lambda ts: is_month(ts, is_start=False),
}
for i in dates:
    for _suffix, _extract in _hist_calendar_features.items():
        df_final_hist[i + _suffix] = df_final_hist[i].apply(_extract)

In [None]:
df_final_hist = df_final_hist.sort_values(by=['ID','DISBURSED-DT'])

In [None]:
df_final_hist

In [None]:
df_final_hist.reset_index(inplace=True,drop=True)

In [None]:
# Amount columns arrive as text with thousands separators (e.g. '1,23,456').
# Strip the commas and parse with pd.to_numeric. The previous eval() call would
# execute whatever expression a cell contained and also failed on harmless
# inputs such as leading zeros or stray whitespace.
list_cols = ['DISBURSED-AMT/HIGH CREDIT','CURRENT-BAL','CREDIT-LIMIT/SANC AMT','OVERDUE-AMT']
for i in list_cols:
    _amount_text = df_final_hist[i].fillna('0').astype(str).str.replace(',', '', regex=False)
    df_final_hist[i] = pd.to_numeric(_amount_text)

In [None]:
# INSTALLMENT-AMT is text; only the part before the first '/' is the amount.
# Take that part, strip thousands separators and parse with pd.to_numeric
# instead of eval(), which would execute arbitrary expressions found in the data.
_installment_text = df_final_hist['INSTALLMENT-AMT'].fillna('0').astype(str)
_installment_text = _installment_text.str.split('/').str[0].str.replace(',', '', regex=False)
df_final_hist['INSTALLMENT-AMT'] = pd.to_numeric(_installment_text)

In [None]:
def payment_date_diff(dis,close,last_payment):
    """Days from ``dis`` to the account's end date, or 0 if none is usable.

    Uses ``close`` when it is present and its year lies in 2010..2020. When
    ``close`` is missing, falls back to ``last_payment`` under the same year
    check. A present ``close`` outside that year range yields 0 (no fallback).
    """
    def _year_in_range(stamp):
        return 2010 <= stamp.year <= 2020

    if pd.isna(close):
        end_date = last_payment
    else:
        end_date = close
    if _year_in_range(end_date):
        return (end_date - dis).days
    return 0
                                    
# Row-wise: days from DisbursalDate to CLOSE-DT (or LAST-PAYMENT-DATE), as
# defined by payment_date_diff.
# NOTE(review): the start date passed in is DisbursalDate (the per-ID date merged
# in from the demo table), not this row's own DISBURSED-DT — confirm intended.
df_final_hist['Payment_date_diff'] = df_final_hist.apply(lambda x:payment_date_diff(x['DisbursalDate'],x['CLOSE-DT'],x['LAST-PAYMENT-DATE']),axis=1)

In [None]:
df_final_hist['SELF-INDICATOR_ACCT'] = df_final_hist['SELF-INDICATOR'].astype(str) + '_' + df_final_hist['ACCT-TYPE'].astype(str)
df_final_hist['SELF-INDICATOR_CONTRI'] = df_final_hist['SELF-INDICATOR'].astype(str) + '_' + df_final_hist['CONTRIBUTOR-TYPE'].astype(str)

In [None]:
df_final_hist['Report_Dis_sub'] = (df_final_hist['DATE-REPORTED'] - df_final_hist['DisbursalDate']).dt.days

In [None]:
df_final_hist = df_final_hist.sort_values(by=['DISBURSED-DT','ID'])

In [None]:
df_final_hist['DISBURSED-DT'] = df_final_hist['DISBURSED-DT'].astype(str)

In [None]:
# Per-ID aggregates of the history table, merged onto the demo frame.
# Rows are ordered by DISBURSED-DT (a string column at this point) before
# grouping, so the ','.join aggregates list values in that order.
# NOTE(review): ','.join raises TypeError if a group contains a non-string value
# (e.g. NaN) in ACCT-TYPE / CONTRIBUTOR-TYPE — confirm those columns are complete.
temp = df_final_hist.sort_values(by=['DISBURSED-DT','ID']).groupby('ID').agg({'ID':['count'],
    'MATCH-TYPE':['nunique'],
    'ACCT-TYPE':['nunique',','.join],
    'OWNERSHIP-IND':['nunique'],
    'CONTRIBUTOR-TYPE':['nunique',','.join],
    'ACCOUNT-STATUS':['nunique'],
    'DISBURSED-AMT/HIGH CREDIT':['min','max','sum','mean','std'],
    'CURRENT-BAL':['min','max','sum','mean','std'],
    'CREDIT-LIMIT/SANC AMT':['min','max','sum','mean','std'],
    'OVERDUE-AMT':['min','max','sum','mean','std'],
    'WRITE-OFF-AMT':['min','max','sum','mean','std'],
    'Payment_date_diff':['min','max','sum','mean','std'],
    'INSTALLMENT-FREQUENCY':['nunique'],
    'SELF-INDICATOR_ACCT':['nunique',','.join],
    'TENURE':['min','max','sum','mean','std'],
    'Report_Dis_sub':['min','max','sum','mean','std'],
    'SELF-INDICATOR_CONTRI':['nunique',','.join],
    'DISBURSED-DT':[','.join],                                                                                                                                       
})
# Flatten the (column, aggregate) MultiIndex into '<column>_ID_<aggregate>' names;
# later cells reference names such as 'ACCT-TYPE_ID_join' and 'DISBURSED-DT_ID_join'.
temp.columns = ['_ID_'.join(x) for x in temp.columns]
temp.reset_index(inplace=True)
# Last entry of the joined date string, parsed back to a timestamp.
temp['DisbursalDate_ID_join'] = temp['DISBURSED-DT_ID_join'].apply(lambda x:pd.to_datetime(x.split(',')[-1]))
# Default (inner) merge: demo rows whose ID has no history row are dropped.
df_final = pd.merge(df_final,temp,on='ID')

In [None]:
# agg_func = {
#     'next_1_year':['min','max','std'],
#     'next_2_year':['min','max','std'],
#     'next_1_day':['mean','std'],
#     'next_2_day':['mean','std'],
#     'next_1_month':['mean','std'],
#     'next_2_month':['mean','std'],}
# temp = df_final_hist.sort_values(by=['DISBURSED-DT','ID']).groupby('ID').agg(agg_func)
# temp.columns = ['_ID_'.join(x) for x in temp.columns]
# temp.reset_index(inplace=True)
# df_final = pd.merge(df_final,temp,on='ID')

In [None]:
# temp

In [None]:
df_final['Recency'] = (df_final['DisbursalDate_ID_join'] - df_final['DisbursalDate']).dt.days

In [None]:
# Turn each comma-joined per-ID string into one indicator column per distinct
# value. The source column is removed from df_final before the indicators are
# joined back on the same index.
cols_binarize = ['ACCT-TYPE_ID_join','CONTRIBUTOR-TYPE_ID_join','SELF-INDICATOR_ACCT_ID_join','SELF-INDICATOR_CONTRI_ID_join']
for i in cols_binarize:
    print(i)
    _label_lists = df_final.pop(i).fillna('NaN').apply(lambda joined: joined.split(','))
    _indicator_matrix = mlb.fit_transform(_label_lists)
    _indicators = pd.DataFrame(_indicator_matrix, columns=mlb.classes_, index=df_final.index)
    df_final = df_final.join(_indicators)

In [None]:
# Order the history by ID, then chronologically within each ID.
# DisbursalDate is a single value per ID (it was merged in per ID), so on its
# own it does not order rows inside an ID; DISBURSED-DT is added as an explicit
# tie-breaker because the later per-ID gap features (differences between
# consecutive DISBURSED-DT values) need chronological order rather than
# whatever order an earlier cell happened to leave behind.
df_final_hist = df_final_hist.sort_values(by=['ID','DisbursalDate','DISBURSED-DT'])

In [None]:
df_final_hist.reset_index(inplace=True,drop=True)

In [None]:
df_final_hist['DISBURSED-DT'] = pd.to_datetime(df_final_hist['DISBURSED-DT'])

In [None]:
def approx_dates_top_up(demo,hist):
    """Approximate the days between a loan and a later same-type account.

    Parameters
    ----------
    demo : DataFrame
        Demo rows for one customer; only the first row's ``ID`` and
        ``DisbursalDate`` are used. The frame is not modified.
    hist : DataFrame
        History rows with ``ID``, ``DISBURSED-DT``, ``ACCT-TYPE`` and
        ``SELF-INDICATOR`` columns.

    Returns
    -------
    tuple
        ``(top_up_days, 0)`` when the history contains a row disbursed on the
        loan's DisbursalDate, where ``top_up_days`` is the gap in days to the
        last (in history order) later row with the same ACCT-TYPE and
        ``SELF-INDICATOR == True``, or 0 if there is none.
        ``(nan, nan)`` when no history row matches the DisbursalDate.
    """
    # Read the first row positionally instead of resetting the caller's index
    # in place, which used to mutate the frame that was passed in.
    id_cust = demo['ID'].iloc[0]
    loan_date = demo['DisbursalDate'].iloc[0]
    temp = hist[hist['ID'] == id_cust].reset_index(drop=True)

    same_day = temp[temp['DISBURSED-DT'] == loan_date]
    if same_day.empty:
        return np.nan, np.nan

    # When several rows share the date, the last one is the reference.
    ref_row = same_day.iloc[-1]
    row_date = ref_row['DISBURSED-DT']
    later_same_type = temp[
        (temp['ACCT-TYPE'] == ref_row['ACCT-TYPE'])
        & (temp['DISBURSED-DT'] > row_date)
        & (temp['SELF-INDICATOR'] == True)
    ]
    top_up_days = 0
    if not later_same_type.empty:
        # NOTE(review): this keeps the LAST matching row in history order (as the
        # original loop did), not necessarily the nearest one — confirm intent.
        top_up_days = (later_same_type['DISBURSED-DT'].iloc[-1] - row_date).days
    return top_up_days, 0

In [None]:
def foir(demo,hist):
    """Sum INSTALLMENT-AMT over the customer's later Active accounts.

    Parameters
    ----------
    demo : DataFrame
        Demo rows for one customer; only the first row's ``ID`` and
        ``DisbursalDate`` are used. The frame is not modified.
    hist : DataFrame
        History rows with ``ID``, ``DISBURSED-DT``, ``ACCOUNT-STATUS`` and
        ``INSTALLMENT-AMT`` columns.

    Returns
    -------
    number
        Total INSTALLMENT-AMT of this ID's rows that were disbursed strictly
        after the loan's DisbursalDate and have status 'Active'; 0 if none.
    """
    # Positional access replaces the former in-place reset of the caller's index.
    id_cust = demo['ID'].iloc[0]
    loan_date = demo['DisbursalDate'].iloc[0]
    temp = hist[hist['ID'] == id_cust]
    # Whether or not a history row matches the DisbursalDate exactly, the cutoff
    # is the same date, so a single mask covers both former branches.
    later_active = (temp['DISBURSED-DT'] > loan_date) & (temp['ACCOUNT-STATUS'] == 'Active')
    # skipna=False keeps NaN propagation identical to a plain Python sum.
    return temp.loc[later_active, 'INSTALLMENT-AMT'].sum(skipna=False)
                

In [None]:
def cltv(demo,hist):
    """Sum DISBURSED-AMT/HIGH CREDIT over the customer's later Active accounts.

    Parameters
    ----------
    demo : DataFrame
        Demo rows for one customer; only the first row's ``ID`` and
        ``DisbursalDate`` are used. The frame is not modified.
    hist : DataFrame
        History rows with ``ID``, ``DISBURSED-DT``, ``ACCOUNT-STATUS`` and
        ``DISBURSED-AMT/HIGH CREDIT`` columns.

    Returns
    -------
    number
        Total DISBURSED-AMT/HIGH CREDIT of this ID's rows that were disbursed
        strictly after the loan's DisbursalDate and have status 'Active';
        0 if none.
    """
    # Positional access replaces the former in-place reset of the caller's index.
    id_cust = demo['ID'].iloc[0]
    loan_date = demo['DisbursalDate'].iloc[0]
    temp = hist[hist['ID'] == id_cust]
    # The cutoff is the DisbursalDate whether or not a history row matches it
    # exactly, so a single mask covers both former branches.
    later_active = (temp['DISBURSED-DT'] > loan_date) & (temp['ACCOUNT-STATUS'] == 'Active')
    # skipna=False keeps NaN propagation identical to a plain Python sum.
    return temp.loc[later_active, 'DISBURSED-AMT/HIGH CREDIT'].sum(skipna=False)

In [None]:
def current_bal(demo,hist):
    """Sum CURRENT-BAL over the customer's later Active accounts.

    Parameters
    ----------
    demo : DataFrame
        Demo rows for one customer; only the first row's ``ID`` and
        ``DisbursalDate`` are used. The frame is not modified.
    hist : DataFrame
        History rows with ``ID``, ``DISBURSED-DT``, ``ACCOUNT-STATUS`` and
        ``CURRENT-BAL`` columns.

    Returns
    -------
    number
        Total CURRENT-BAL of this ID's rows that were disbursed strictly after
        the loan's DisbursalDate and have status 'Active'; 0 if none.
    """
    # Positional access replaces the former in-place reset of the caller's index.
    id_cust = demo['ID'].iloc[0]
    loan_date = demo['DisbursalDate'].iloc[0]
    temp = hist[hist['ID'] == id_cust]
    # The cutoff is the DisbursalDate whether or not a history row matches it
    # exactly, so a single mask covers both former branches.
    later_active = (temp['DISBURSED-DT'] > loan_date) & (temp['ACCOUNT-STATUS'] == 'Active')
    # skipna=False keeps NaN propagation identical to a plain Python sum.
    return temp.loc[later_active, 'CURRENT-BAL'].sum(skipna=False)
        

In [None]:
def check_accs_open_closed(demo,hist):
    """Count the customer's later accounts by status.

    Parameters
    ----------
    demo : DataFrame
        Demo rows for one customer; only the first row's ``ID`` and
        ``DisbursalDate`` are used. The frame is not modified.
    hist : DataFrame
        History rows with ``ID``, ``DISBURSED-DT`` and ``ACCOUNT-STATUS``.

    Returns
    -------
    tuple of int
        ``(closed, active, other)`` counts over this ID's rows disbursed
        strictly after the loan's DisbursalDate, where ``other`` is every
        status that is neither 'Closed' nor 'Active' (missing values included).
    """
    # Positional access replaces the former in-place reset of the caller's index.
    id_cust = demo['ID'].iloc[0]
    loan_date = demo['DisbursalDate'].iloc[0]
    temp = hist[hist['ID'] == id_cust]
    # The cutoff is the DisbursalDate whether or not a history row matches it
    # exactly, so both former branches reduce to this one selection.
    later_status = temp.loc[temp['DISBURSED-DT'] > loan_date, 'ACCOUNT-STATUS']
    accs_closed = int((later_status == 'Closed').sum())
    accs_open = int((later_status == 'Active').sum())
    accs_other = len(later_status) - accs_closed - accs_open
    return accs_closed, accs_open, accs_other

In [None]:
# final_dict = {}
# for index,row in tqdm(df_final.iterrows()):
#     test_demo = df_final[df_final['ID'] == row['ID']]
#     test_hist=df_final_hist[df_final_hist['ID'] == row['ID']]
#     top_up_days,zero_top_up = approx_dates_top_up(test_demo,test_hist)
#     if top_up_days !=0:
#         final_dict[row['ID']] = top_up_days
#     else:
#         final_dict[row['ID']] = zero_top_up

In [None]:
# final_dict_foir = {}
# for index,row in tqdm(df_final.iterrows()):
#     test_demo = df_final[df_final['ID'] == row['ID']]
#     test_hist=df_final_hist[df_final_hist['ID'] == row['ID']]
#     emi = foir(test_demo,test_hist)
#     if emi !=0:
#         final_dict_foir[row['ID']] = emi 
#     else:
#         final_dict_foir[row['ID']] = -9999

In [None]:
# final_dict_dis = {}
# for index,row in tqdm(df_final.iterrows()):
#     test_demo = df_final[df_final['ID'] == row['ID']]
#     test_hist=df_final_hist[df_final_hist['ID'] == row['ID']]
#     emi= cltv(test_demo,test_hist)
#     if emi !=0:
#         final_dict_dis[row['ID']] = emi 
#     else:
#         final_dict_dis[row['ID']] = -9999

In [None]:
# final_dict_curr = {}
# for index,row in tqdm(df_final.iterrows()):
#     test_demo = df_final[df_final['ID'] == row['ID']]
#     test_hist=df_final_hist[df_final_hist['ID'] == row['ID']]
#     emi = current_bal(test_demo,test_hist)
#     if emi !=0:
#         final_dict_curr[row['ID']] = emi 
#     else:
#         final_dict_curr[row['ID']] = -9999

In [None]:
# final_dict_acc = {}
# for index,row in tqdm(df_final.iterrows()):
#     test_demo = df_final[df_final['ID'] == row['ID']]
#     test_hist=df_final_hist[df_final_hist['ID'] == row['ID']]
#     accs_closed,accs_open,accs_other = check_accs_open_closed(test_demo,test_hist)
#     final_dict_acc[row['ID']] = [accs_closed,accs_open,accs_other] 

In [None]:
# df_approx_top = pd.DataFrame.from_dict(final_dict,orient='index',columns=['Approx_Days_to_top_up'])
# df_foir = pd.DataFrame.from_dict(final_dict_foir,orient='index',columns=['Active_FOIR'])
# df_dis_cltv = pd.DataFrame.from_dict(final_dict_dis,orient='index',columns=['Active_dis_cltv'])
# df_curr_cltv = pd.DataFrame.from_dict(final_dict_curr,orient='index',columns=['Active_curr_cltv'])
# df_acc = pd.DataFrame.from_dict(final_dict_acc,orient='index',columns=['Acc_closed','Acc_Open','Acc_other'])
# df_acc.reset_index(inplace=True)
# df_acc.rename(columns= {'index':'ID'},inplace=True)

In [None]:
# df_approx_top.reset_index(inplace=True)
# df_foir.reset_index(inplace=True)
# df_dis_cltv.reset_index(inplace=True)
# df_curr_cltv.reset_index(inplace=True)

In [None]:
# df_approx_top.rename(columns= {'index':'ID'},inplace=True)
# df_foir.rename(columns= {'index':'ID'},inplace=True)
# # df_dis_cltv.rename(columns= {'index':'ID'},inplace=True)
# df_curr_cltv.rename(columns= {'index':'ID'},inplace=True)

In [None]:
# df_foir['Active_FOIR'].replace({-9999:np.nan},inplace=True)
# df_dis_cltv['Active_dis_cltv'].replace({-9999:np.nan},inplace=True)
# df_curr_cltv['Active_curr_cltv'].replace({-9999:np.nan},inplace=True)

In [None]:
def get_avg_dates_next_loans(dates):
    """Mean gap in days between consecutive entries of ``dates``.

    ``dates`` is a Series of timestamps, taken in its given order. Returns 0
    when it holds fewer than two entries.
    """
    stamps = dates.tolist()
    if len(stamps) <= 1:
        return 0
    gaps_in_days = [(later - earlier).days for earlier, later in zip(stamps, stamps[1:])]
    return np.mean(gaps_in_days)

def get_next_loan_dates(dates):
    """Gaps between consecutive entries of ``dates``, in months of 30.71 days.

    ``dates`` is a Series of timestamps, taken in its given order. Returns a
    list with one gap per consecutive pair, or the integer 0 when there are
    fewer than two entries.
    """
    stamps = dates.tolist()
    if len(stamps) <= 1:
        return 0
    return [((later - earlier).days/30.71) for earlier, later in zip(stamps, stamps[1:])]

def calculate_month_less_than_12(timedifference):
    """Return 1 if ``timedifference`` (months) is below 12, else 0."""
    return 1 if timedifference < 12 else 0
def calculate_between_12_18(timedifference):
    """Return 1 if ``timedifference`` (months) lies in [12, 18), else 0.

    The lower bound is inclusive, like every other bucket in this family.
    Previously a gap of exactly 12 fell into no bucket at all, because the
    "less than 12" flag uses ``< 12`` and this one used ``> 12``.
    """
    return 1 if 12 <= timedifference < 18 else 0
def calculate_between_18_24(timedifference):
    """Return 1 if ``timedifference`` (months) lies in [18, 24), else 0."""
    return 1 if 18 <= timedifference < 24 else 0
def calculate_between_24_30(timedifference):
    """Return 1 if ``timedifference`` (months) lies in [24, 30), else 0."""
    return 1 if 24 <= timedifference < 30 else 0
def calculate_between_30_36(timedifference):
    """Return 1 if ``timedifference`` (months) lies in [30, 36), else 0."""
    return 1 if 30 <= timedifference < 36 else 0
def calculate_between_36_48(timedifference):
    """Return 1 if ``timedifference`` (months) lies in [36, 48], else 0.

    Unlike the shorter buckets, the upper bound here is inclusive.
    """
    return 1 if 36 <= timedifference <= 48 else 0
def calculate_greater_than_48(timedifference):
    """Return 1 if ``timedifference`` (months) is strictly above 48, else 0."""
    return 1 if timedifference > 48 else 0

In [None]:
def calculate_month_less_than_12_amts(timedifference,i):
    """Return the amount ``i`` if ``timedifference`` (months) is below 12, else NaN."""
    return i if timedifference < 12 else np.nan
def calculate_between_12_18_amts(timedifference,i):
    """Return the amount ``i`` if ``timedifference`` (months) lies in [12, 18), else NaN.

    The lower bound is inclusive, like every other bucket in this family.
    Previously a gap of exactly 12 was assigned to no bucket, because the
    "less than 12" variant uses ``< 12`` and this one used ``> 12``.
    """
    return i if 12 <= timedifference < 18 else np.nan
def calculate_between_18_24_amts(timedifference,i):
    """Return the amount ``i`` if ``timedifference`` (months) lies in [18, 24), else NaN."""
    return i if 18 <= timedifference < 24 else np.nan
def calculate_between_24_30_amts(timedifference,i):
    """Return the amount ``i`` if ``timedifference`` (months) lies in [24, 30), else NaN."""
    return i if 24 <= timedifference < 30 else np.nan
def calculate_between_30_36_amts(timedifference,i):
    """Return the amount ``i`` if ``timedifference`` (months) lies in [30, 36), else NaN."""
    return i if 30 <= timedifference < 36 else np.nan
def calculate_between_36_48_amts(timedifference,i):
    """Return the amount ``i`` if ``timedifference`` (months) lies in [36, 48], else NaN.

    Unlike the shorter buckets, the upper bound here is inclusive.
    """
    return i if 36 <= timedifference <= 48 else np.nan
def calculate_greater_than_48_amts(timedifference,i):
    """Return the amount ``i`` if ``timedifference`` (months) is strictly above 48, else NaN."""
    return i if timedifference > 48 else np.nan

In [None]:
def calculate_tdf(timedifference,status,diff_1,diff_2):
    """Return 1 if ``timedifference`` lies in [diff_1, diff_2) and ``status`` equals True, else 0."""
    inside_window = diff_1 <= timedifference < diff_2
    if inside_window and status == True:
        return 1
    return 0

In [None]:
df_final_hist['SELF-INDICATOR']

In [None]:
# Signed gap in (30.71-day) months between each history account and the ID's loan.
df_final_hist['Time_Difference']=(df_final_hist['DISBURSED-DT']-df_final_hist['DisbursalDate']).dt.days
df_final_hist['Time_Difference']=df_final_hist['Time_Difference']/30.71

# Bucket column -> function flagging whether a gap falls into that bucket.
_bucket_flag_funcs = {
    'Less_than_12_month': calculate_month_less_than_12,
    '12-18': calculate_between_12_18,
    '18-24': calculate_between_18_24,
    '24-30': calculate_between_24_30,
    '30-36': calculate_between_30_36,
    '36-48': calculate_between_36_48,
    '48_plus': calculate_greater_than_48,
}
# Amount column -> function keeping the disbursed amount when the gap is in the bucket.
_bucket_amt_funcs = {
    'Less_than_12_month_amts': calculate_month_less_than_12_amts,
    '12-18_amts': calculate_between_12_18_amts,
    '18-24_amts': calculate_between_18_24_amts,
    '24-30_amts': calculate_between_24_30_amts,
    '30-36_amts': calculate_between_30_36_amts,
    '36-48_amts': calculate_between_36_48_amts,
    '48_plus_amts': calculate_greater_than_48_amts,
}

# Per-row flags. Series.apply on the single input column replaces the row-wise
# DataFrame.apply, which built a full row object just to read one value.
for _bucket_col, _flag_func in _bucket_flag_funcs.items():
    df_final_hist[_bucket_col] = df_final_hist['Time_Difference'].apply(_flag_func)

# Broadcast "this ID has at least one account in the bucket" to all of its rows.
# Each column is reduced from ITSELF; previously '30-36' was filled from the
# '36-48' column, so the 30-36 flag was silently a copy of the 36-48 flag.
for _bucket_col in _bucket_flag_funcs:
    df_final_hist[_bucket_col] = df_final_hist.groupby('ID')[_bucket_col].transform('max')

# Per-row disbursed amount when the row falls in the bucket, NaN otherwise.
for _amt_col, _amt_func in _bucket_amt_funcs.items():
    df_final_hist[_amt_col] = [
        _amt_func(_gap, _amount)
        for _gap, _amount in zip(df_final_hist['Time_Difference'],
                                 df_final_hist['DISBURSED-AMT/HIGH CREDIT'])
    ]




In [None]:
# df_final_hist['SELF-INDICATOR']

In [None]:
# Replace each bucket amount by its per-ID mean, broadcast to every row of that ID.
_amount_bucket_cols = [
    'Less_than_12_month_amts',
    '12-18_amts',
    '18-24_amts',
    '24-30_amts',
    '30-36_amts',
    '36-48_amts',
    '48_plus_amts',
]
for _amt_col in _amount_bucket_cols:
    df_final_hist[_amt_col] = df_final_hist.groupby('ID')[_amt_col].transform('mean')

In [None]:
cols_join = ['ID','DISBURSED-DT','Less_than_12_month','12-18','18-24','24-30','30-36','36-48','48_plus','Less_than_12_month_amts',
        '12-18_amts','18-24_amts','24-30_amts','30-36_amts','36-48_amts','48_plus_amts']

In [None]:
df_final = pd.merge(df_final,df_final_hist[cols_join],left_on=['ID','DisbursalDate'],right_on=['ID','DISBURSED-DT'],how='left')

In [None]:
df_final.drop_duplicates(inplace=True)

In [None]:
df_final[df_final.columns[-7:]]

In [None]:
# Per-ID mean gap (in days) between consecutive DISBURSED-DT values, taken in
# the frame's current row order.
_avg_gap_spec = {'DISBURSED-DT': [get_avg_dates_next_loans]}
loans_next_avg = df_final_hist.groupby('ID').agg(_avg_gap_spec)
# Flatten (column, function) into 'DISBURSED-DT_loans_get_avg_dates_next_loans'.
loans_next_avg.columns = ['_loans_'.join(name_parts) for name_parts in loans_next_avg.columns]
loans_next_avg = loans_next_avg.reset_index()
 

In [None]:
# Per-ID list of gaps (in 30.71-day months) between consecutive DISBURSED-DT
# values, taken in the frame's current row order.
_gap_list_spec = {'DISBURSED-DT': [get_next_loan_dates]}
loans_next = df_final_hist.groupby('ID').agg(_gap_list_spec)
# Flatten (column, function) into 'DISBURSED-DT_loans_get_next_loan_dates'.
loans_next.columns = ['_loans_'.join(name_parts) for name_parts in loans_next.columns]
loans_next = loans_next.reset_index()

In [None]:
# Load precomputed per-ID feature tables from Data_Future/.
# NOTE(review): the commented-out cells earlier in the notebook build frames with
# these same names from approx_dates_top_up / foir / cltv / current_bal /
# check_accs_open_closed; presumably these CSVs were exported from them, but the
# export step is not in this notebook — a fresh run needs the files to exist.
df_approx_top=pd.read_csv('Data_Future/df_approx_top.csv')
df_foir= pd.read_csv('Data_Future/df_foir.csv')
df_dis_cltv=pd.read_csv('Data_Future/df_dis_cltv.csv')
df_curr_cltv=pd.read_csv('Data_Future/df_curr_cltv.csv')
df_acc = pd.read_csv('Data_Future/df_acc.csv')

In [None]:
# Attach the per-ID gap features and the first two cached tables. Each merge is
# an inner join on ID, so an ID missing from any table is dropped.
for _per_id_table in (loans_next_avg, loans_next, df_approx_top, df_foir):
    df_final = df_final.merge(_per_id_table, on='ID')

In [None]:
# Attach the remaining cached tables. Each merge is an inner join on ID, so an
# ID missing from any table is dropped.
for _per_id_table in (df_dis_cltv, df_curr_cltv, df_acc):
    df_final = df_final.merge(_per_id_table, on='ID')

In [None]:
# Ratios built from the cached per-ID features.
# Active_FOIR as a percentage of monthly income.
df_final['Active_FOIR_Actual'] = (df_final['Active_FOIR']/df_final['MonthlyIncome'])*100
# Days converted to months with the same 30.71 divisor used elsewhere.
df_final['Approx_Days_to_top_up_Month'] = df_final['Approx_Days_to_top_up']/30.71
# Absolute disbursed / current-balance totals as a percentage of the asset cost.
df_final['Active_cltv_dis_act'] =np.abs((df_final['Active_dis_cltv']/df_final['AssetCost']))*100
df_final['Active_cltv_current_act'] =np.abs((df_final['Active_curr_cltv']/df_final['AssetCost']))*100

In [None]:
# Treat a zero top-up gap as "no value". The result is assigned back rather than
# using inplace=True on a column selection, which warns on recent pandas and
# does not modify df_final when copy-on-write is enabled.
for _top_up_col in ('Approx_Days_to_top_up', 'Approx_Days_to_top_up_Month'):
    df_final[_top_up_col] = df_final[_top_up_col].replace({0:np.nan})

In [None]:
df_final['Active_FOIR'].value_counts()

In [None]:
df_final[df_final.columns[-20:]]

In [None]:
# Rows with a target form the training set; rows without one are the test set.
# drop(columns=...) returns new frames, so nothing is modified in place on a
# filtered slice of df_final (the former inplace drops raised
# SettingWithCopyWarning).
_has_target = df_final['Top-up Month'].notnull()
y_train = df_final.loc[_has_target, 'Top-up Month']
X_train = df_final[_has_target].drop(columns=['Top-up Month'])
df_test = df_final[~_has_target].drop(columns=['Top-up Month'])

In [None]:
# Columns dropped from the feature matrix before fitting and predicting.
# Despite the name this is not only dates: it also lists the identifiers
# (ID, AssetID), the comma-joined date string and the list-valued gap column.
date_cols = ['DisbursalDate','MaturityDAte','AuthDate','ID','AssetID','DisbursalDate_ID_join','DISBURSED-DT_ID_join',
             'DISBURSED-DT_loans_get_next_loan_dates','DISBURSED-DT']

In [None]:
# param = {'num_leaves': 64,
#          'min_data_in_leaf': 5, 
#          'max_depth': -1,
#          'learning_rate': 0.1,
#          "boosting": "gbdt",
#          "feature_fraction": 0.405,
#          "lambda_l1": 1,
#          "lambda_l2": 4,
#          "verbosity": -1,
#          'two_round': True,
#          'cat_smooth': 0,
#           'cat_l2': 1}

In [None]:
# 5-fold stratified CV with LightGBM: each fold is early-stopped on macro-F1 and
# the test-set class probabilities are averaged over the folds.
n_folds=5
folds=StratifiedKFold(n_splits=n_folds,shuffle=True,random_state=22)
avg_cv = []
# 7 = number of target classes (codes 0..6 of the Top-up Month encoding).
final_preds = np.zeros((len(df_test), 7))
final_preds_imp = pd.DataFrame()
oof_preds = np.zeros((len(X_train),7))

# The feature matrices do not change between folds, so build them once instead
# of re-dropping the excluded columns three times per fold.
X_features = X_train.drop(date_cols,axis=1)
X_test_features = df_test.drop(date_cols,axis=1)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, y_train.values)):
    print("Fold {}".format(fold_))
    X_trn,y_trn = X_features.iloc[trn_idx],y_train.iloc[trn_idx]
    X_val,y_val = X_features.iloc[val_idx],y_train.iloc[val_idx]
    clf = lgb.LGBMClassifier(random_state=22,n_jobs=-1,n_estimators=5000,metric='custom',class_weight='balanced')
#     clf = XGBClassifier(random_state=22,n_jobs=-1,n_estimators=2000,class_weight='balanced')
#     clf.fit(X_trn,y_trn,eval_set=[(X_val,y_val)],early_stopping_rounds=100)
    # NOTE(review): the early_stopping_rounds / verbose fit arguments were removed
    # in LightGBM 4.0 (callbacks replace them); this call targets LightGBM 3.x.
    clf.fit(X_trn, y_trn,eval_metric=evaluate_macroF1_lgb,eval_set=[(X_val,y_val)],verbose=False,early_stopping_rounds=100)

#     imp = importances(clf,X_val,y_val)
#     imp.reset_index(inplace=True)
#     final_preds_imp['Feature'] = X_val.columns
#     imp.rename(columns = {'Importance':f'importances{fold_}'},inplace=True)
#     final_preds_imp = pd.merge(final_preds_imp,imp,on='Feature')

    # Out-of-fold probabilities; oof_preds was allocated but never filled before.
    oof_preds[val_idx] = clf.predict_proba(X_val)
    avg_cv.append(f1_score(y_pred=clf.predict(X_val),y_true=y_val,average='macro'))
    print(f'CV_FOLD_{fold_} : {avg_cv[fold_]}')
    final_preds += clf.predict_proba(X_test_features)

final_preds = final_preds/n_folds
print(sum(avg_cv)/n_folds)

In [None]:
# Predicted class code = column index of the highest averaged probability.
df_test['Top-up Month'] = final_preds.argmax(axis=1)
df_test['Top-up Month'].value_counts()

In [None]:
# Inverse of top_up_dict: class code -> original label string. Derived from the
# forward mapping so that the two can never drift apart.
top_up_dict_rev = {code: label for label, code in top_up_dict.items()}

In [None]:
# Convert the predicted class codes back to their label strings and write the
# (ID, Top-up Month) predictions for this subset to Data_Future/.
# Not idempotent: re-running this cell maps the label strings again and yields NaN.
df_test['Top-up Month'] = df_test['Top-up Month'].map(top_up_dict_rev)
df_test[['ID','Top-up Month']].to_csv('Data_Future/Future_data_preds.csv',index=False)