In [1]:
import os
import pandas as pd 
import glob
import random 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import math

In [2]:
# Raise pandas' display limits so wide and long frames are shown in full.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 680

## Read in data: 

In [3]:
# List the files in the current working directory to locate the input CSVs.
os.listdir()

['.ipynb_checkpoints',
 '1_Pure_Truncation_Start_Bal_Ella_Version2-editedJulia.ipynb',
 '2_loan_outcome_determination.ipynb',
 '3_individual_outcome_determination.ipynb',
 '4_merge_predictors_with_outcome.ipynb',
 '5_format_data.ipynb',
 '6_train_test_split.ipynb',
 '7_MLexplore-Ella.ipynb',
 '7_MLexplore.ipynb',
 'all_data.csv',
 'binary_test.csv',
 'binary_train.csv',
 'binary_x_test.csv',
 'binary_x_train.csv',
 'binary_y_test.csv',
 'binary_y_train.csv',
 'data_merged_nocatvars.csv',
 'data_merged_noMissingData.csv',
 'diaries_trx_trunc_loans_start_bal.csv',
 'heatmap.jpg',
 'individual_outcomes_33_150.csv',
 'loan_outcomes_33_150.csv',
 'Pure_Truncation_Start_Bal_Ella_Version2.ipynb']

Step 1: Inspect data

In [4]:
# Load the loan transactions. `new_account_ids` is forced to dtype 'object'
# so the ids are kept as read instead of being inferred as numbers.
trx = pd.read_csv(
    'diaries_trx_trunc_loans_start_bal.csv',
    dtype={'new_account_ids': 'object'},
    low_memory=False,
)

In [5]:
# (rows, columns) of the loaded transaction table.
trx.shape

(1563, 64)

In [6]:
# Count transaction-type descriptions over the rows where unique_accnts == 1.
trx[trx['unique_accnts'].eq(1)]['trx_type_desc'].value_counts()

Borrowing from an informal Group             104
Consumer/ personal loan (not payday loan)     26
Individual Business or Agriculture Loan       25
Joint liability loan                          13
Name: trx_type_desc, dtype: int64

In [7]:
# Same count as above, but by transaction-type code instead of description.
trx[trx['unique_accnts'].eq(1)]['trx_type_code'].value_counts()

3395    104
2762     26
2760     25
3247     13
Name: trx_type_code, dtype: int64

Step 2: Let's look at regularity a bit

In [8]:
# for idx, acct_id in enumerate(list(trx.account_ids.unique())):
#     print(idx, acct_id)
#     loan_df = trx.loc[trx['account_ids']==acct_id]    
#     loan_df = loan_df.copy()
#     loan_df.sort_values("trx_stdtime_days_acc", inplace=True)
    
#     #subset down only to payments
#     payment_idxs = [idx for idx, purpose in enumerate(list(loan_df['trx_prx_purpose'])) if 'Payment' in purpose]
#     payment_df= loan_df.iloc[payment_idxs]
    
#     #figure out a vector of time between payments
#     time_btw_payments = np.diff(np.array(payment_df.trx_stdtime_days_acc))
    
#     print('time btw payments', time_btw_payments)
#     print('std of time btw payments', np.std(time_btw_payments))
#     print('mean of payment amount', np.mean(payment_df.trx_value_kes))
#     print('-------------------------')

Step 3: Test out a single loan: 

In [9]:
# Take the first distinct account id as the example loan.
acct_id = trx['new_account_ids'].unique()[0]

In [10]:
# All transactions of the example loan, ordered by days since the account's start.
loan_df = (
    trx[trx['new_account_ids'] == acct_id]
    .sort_values('trx_stdtime_days_acc')
)

In [11]:
# Largest trx_stdtime_days_acc value recorded for the example loan.
loan_df.trx_stdtime_days_acc.max()

350

In [12]:
# (rows, columns) of the example loan's transactions.
loan_df.shape

(3, 64)

In [13]:
# Column names available on the example loan's frame.
loan_df.columns

Index(['Unnamed: 0', 'hh_ids', 'unique_hhs', 'first_trx_date_hh',
       'last_trx_date_hh', 'tot_hh_daysofobs', 'tot_hh_monthsofobs',
       'interview_designation', 'int_date', 'int_month', 'int_year',
       'int_yr_mo', 'first_int_date', 'account_ids', 'new_account_ids',
       'unique_accnts', 'm_ids_owner', 'unique_hm_owner',
       'account_bsheet_desig', 'account_startclose_balance', 'account_formal',
       'account_liquid', 'first_trx_date_acc', 'last_trx_date_acc',
       'tot_acc_daysofobs', 'tot_acc_monthsofobs', 'trx_id', 'm_ids_trx',
       'trx_date', 'trx_month', 'trx_year', 'trx_yr_mo', 'trx_dq_round',
       'trx_stdtime_days_hh', 'trx_stdtime_mnths_hh', 'trx_stdtime_days_acc',
       'trx_stdtime_mnths_acc', 'trx_class_code', 'trx_class_desc',
       'trx_family_code', 'trx_family_desc', 'trx_type_code', 'trx_type_desc',
       'trx_prx_purpose', 'trx_prx_purpose_fd', 'trx_fee',
       'trx_bsheet_direction', 'trx_mode_code', 'trx_mode_desc',
       'trx_place_incom

In [14]:
# Length of the observation window, in days.
x_days = 150
# Span between the loan's first and last recorded transaction day.
days_observed = loan_df['trx_stdtime_days_acc'].max() - loan_df['trx_stdtime_days_acc'].min()

In [15]:
# Restrict the loan to its first x_days: when it was observed for at least
# x_days, keep only transactions on or before (first day + x_days); otherwise
# keep everything. The cutoff uses x_days rather than a literal 150 so this
# cell stays correct if x_days is changed above.
if days_observed >= x_days:
    thresh_day = loan_df.trx_stdtime_days_acc.min() + x_days
    loan_x_days = loan_df.loc[loan_df['trx_stdtime_days_acc'] <= thresh_day].copy()
else:
    loan_x_days = loan_df.copy()

In [16]:
# Is there at least one balance observation inside the window?
len(loan_x_days['bal_evol']) > 0

True

In [17]:
# Balance after the last transaction that falls inside the window.
balance_at_x_days = loan_x_days['bal_evol'].tolist()[-1]
balance_at_x_days

0.0

In [18]:
# A balance of zero or less at the end of the window counts as paid back.
balance_at_x_days <= 0

True

Step 4: Put it into a function: has a loan been paid back in x_days? 

In [19]:
def find_loans_paid_back_in_x_days(df, x_days):
    """Flag, for every loan account, whether it was paid back within ``x_days``.

    For each distinct ``new_account_ids`` value the loan's transactions are
    ordered by ``trx_stdtime_days_acc``. The loan counts as paid back when the
    ``bal_evol`` value of the last transaction on or before
    (first transaction day + ``x_days``) is ``<= 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan transactions. Must contain the columns ``new_account_ids``,
        ``m_ids_owner``, ``trx_stdtime_days_acc`` and ``bal_evol``.
    x_days : int
        Length, in days, of the window that starts at the loan's first
        transaction.

    Returns
    -------
    pandas.DataFrame
        One row per account, in order of first appearance in ``df``, with
        columns ``m_ids_owner``, ``new_account_ids`` (as ``str``) and
        ``paid_back_in_<x_days>_days``. The flag is ``1`` (paid back), ``0``
        (not paid back) or ``'too_few_borrowings'`` when no transaction falls
        inside the window. An empty ``df`` yields an empty frame that still
        has these three columns.

    Notes
    -----
    Rows whose ``new_account_ids`` is missing are skipped (``groupby`` drops
    NaN keys by default).
    """
    key = 'paid_back_in_' + str(x_days) + '_days'
    records = []

    # sort=False keeps the accounts in order of first appearance.
    for acct_id, loan_df in df.groupby('new_account_ids', sort=False):
        # Stable sort: transactions recorded on the same day keep their
        # original relative order, so the "last" balance is deterministic.
        loan_df = loan_df.sort_values('trx_stdtime_days_acc', kind='mergesort')
        owner_id = loan_df['m_ids_owner'].iloc[0]

        # Keep only the transactions inside the window. The cutoff is derived
        # from x_days (it used to be a literal 150, which silently ignored the
        # argument). Loans observed for fewer than x_days keep all their rows
        # because every row is already on or before the cutoff.
        thresh_day = loan_df['trx_stdtime_days_acc'].min() + x_days
        loan_x_days = loan_df.loc[loan_df['trx_stdtime_days_acc'] <= thresh_day]

        if len(loan_x_days) > 0:
            # Balance after the last transaction inside the window.
            balance_at_x_days = loan_x_days['bal_evol'].iloc[-1]
            outcome = 1 if balance_at_x_days <= 0 else 0
        else:
            outcome = 'too_few_borrowings'

        records.append({
            'm_ids_owner': owner_id,
            'new_account_ids': str(acct_id),
            key: outcome,
        })

    # Build the frame once instead of appending row by row; the column order
    # matches the table displayed for this function's result in the notebook.
    return pd.DataFrame(records, columns=['m_ids_owner', 'new_account_ids', key])

In [20]:
# Confirm the transaction table still has its original shape before running the function.
trx.shape

(1563, 64)

In [21]:
# Per-loan paid-back flag for a 150-day window.
loans_paid_back_in_150_days_df = find_loans_paid_back_in_x_days(df=trx, x_days=150)

In [22]:
# Display the per-loan paid-back table.
loans_paid_back_in_150_days_df

Unnamed: 0,m_ids_owner,new_account_ids,paid_back_in_150_days
0,65134441430300000,5.61348043746e+16,1.0
1,60134547419200000,6.01374307109e+16,0.0
2,63134425702500000,6.31367405497e+16,0.0
3,64134429266300000,8.91364591805e+16,1.0
4,65134432186900000,1.051365401401e+17,1.0
5,65134442822400000,5.61347619278e+16,0.0
6,63134364423800000,6.31371825239e+16,0.0
7,59134388500400000,5.9134985308e+16,1.0
8,61134406652800000,1.121372833516e+17,0.0
9,57134295592300000,5.71346675055e+16,0.0


Step 5: Create a function that takes the full transaction dataset, the per-loan table saying whether each loan was paid back within x_days, the x_days window itself, and a threshold (in days) for the "regularity" of payments

In [23]:
def find_regular_loans(df, loans_paid_back_in_x_days_df, thresh, x_days):
    """Flag, for every loan account, whether all gaps between payments are <= ``thresh`` days.

    A transaction is a payment when its ``trx_prx_purpose`` is a string
    containing ``'Payment'``. The gaps are the differences between consecutive
    payment days (``trx_stdtime_days_acc``). For a loan that was NOT paid back
    within ``x_days`` (flag ``0`` in ``loans_paid_back_in_x_days_df``) one more
    gap is added: the time from the loan's last transaction to the end of the
    window, ``x_days + first_day - last_day``.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan transactions. Must contain ``new_account_ids``, ``m_ids_owner``,
        ``trx_stdtime_days_acc`` and ``trx_prx_purpose``.
    loans_paid_back_in_x_days_df : pandas.DataFrame
        Result of ``find_loans_paid_back_in_x_days`` for the same ``x_days``:
        columns ``new_account_ids`` and ``paid_back_in_<x_days>_days``.
    thresh : int or float
        Largest gap, in days, that still counts as regular.
    x_days : int
        Length, in days, of the observation window.

    Returns
    -------
    pandas.DataFrame
        One row per account, in order of first appearance in ``df``, with
        columns ``m_ids_owner``, ``new_account_ids`` (as ``str``) and
        ``every_payment_under_<thresh>_days`` (``1`` when no gap exceeds
        ``thresh``, else ``0``).

    Raises
    ------
    KeyError
        If an account in ``df`` has no row in ``loans_paid_back_in_x_days_df``,
        or if the expected columns are missing.

    Notes
    -----
    Rows whose ``new_account_ids`` is missing are skipped (``groupby`` drops
    NaN keys by default).
    """
    key1 = 'every_payment_under_' + str(thresh) + '_days'
    key_lookup = 'paid_back_in_' + str(x_days) + '_days'

    # Account id (as str, the form stored by find_loans_paid_back_in_x_days)
    # -> paid-back flag, built once instead of filtering the frame per account.
    paid_back_by_account = dict(zip(
        loans_paid_back_in_x_days_df['new_account_ids'].astype(str),
        loans_paid_back_in_x_days_df[key_lookup],
    ))

    records = []
    # sort=False keeps the accounts in order of first appearance.
    for acct_id, loan_df in df.groupby('new_account_ids', sort=False):
        loan_df = loan_df.sort_values('trx_stdtime_days_acc', kind='mergesort')
        owner_id = loan_df['m_ids_owner'].iloc[0]
        days = np.asarray(loan_df['trx_stdtime_days_acc'])

        # Keep payments only. Non-string purposes (e.g. NaN) are not payments;
        # a bare `'Payment' in purpose` would raise TypeError on them.
        is_payment = np.array(
            [isinstance(purpose, str) and 'Payment' in purpose
             for purpose in loan_df['trx_prx_purpose']],
            dtype=bool,
        )
        time_btw_payments = np.diff(days[is_payment])

        lookup_id = str(acct_id)
        if lookup_id not in paid_back_by_account:
            raise KeyError(
                'account ' + lookup_id + ' not found in loans_paid_back_in_x_days_df'
            )

        # Loan still open at the end of the window: the stretch from the last
        # transaction to the end of the window counts as a gap as well.
        # np.append returns a NEW array, so the result must be assigned back
        # (it used to be discarded, so this gap was never checked).
        if paid_back_by_account[lookup_id] == 0:
            starting_time = days[0]
            ending_time = days[-1]
            time_btw_payments = np.append(
                time_btw_payments, x_days + starting_time - ending_time
            )

        is_regular = 0 if np.any(time_btw_payments > thresh) else 1
        records.append({
            'm_ids_owner': owner_id,
            'new_account_ids': lookup_id,
            key1: is_regular,
        })

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=['m_ids_owner', 'new_account_ids', key1])

In [25]:
# Per-loan regularity flag: 33-day gap threshold, 150-day window.
regular_loanpayers = find_regular_loans(
    df=trx,
    loans_paid_back_in_x_days_df=loans_paid_back_in_150_days_df,
    thresh=33,
    x_days=150,
)

Step 6: Test the merge 

In [28]:
# Outer-join the two per-loan tables on account id and owner id.
merged_outcomes = loans_paid_back_in_150_days_df.merge(
    regular_loanpayers,
    how='outer',
    on=['new_account_ids', 'm_ids_owner'],
)

In [29]:
# (rows, columns) of the merged table.
merged_outcomes.shape

(271, 4)

In [30]:
def create_outcomes(trx_df, thresh, x_days):
    """Build the per-loan outcome table for one threshold/window pair.

    Runs ``find_loans_paid_back_in_x_days`` on ``trx_df``, feeds its result to
    ``find_regular_loans``, and outer-joins the two tables on
    ``new_account_ids`` and ``m_ids_owner``.

    Parameters
    ----------
    trx_df : pandas.DataFrame
        Loan transactions, passed unchanged to both helper functions.
    thresh : int or float
        Gap threshold in days, passed to ``find_regular_loans``.
    x_days : int
        Window length in days, passed to both helper functions.

    Returns
    -------
    pandas.DataFrame
        The merged per-loan table.
    """
    paid_back_flags = find_loans_paid_back_in_x_days(trx_df, x_days)
    regularity_flags = find_regular_loans(trx_df, paid_back_flags, thresh, x_days)
    return paid_back_flags.merge(
        regularity_flags,
        how='outer',
        on=['new_account_ids', 'm_ids_owner'],
    )

In [31]:
# Final per-loan outcomes: 33-day gap threshold, 150-day window.
loan_outcomes = create_outcomes(trx_df=trx, thresh=33, x_days=150)

In [32]:
# Preview the first 20 rows of the outcomes table.
loan_outcomes.head(20)

Unnamed: 0,m_ids_owner,new_account_ids,paid_back_in_150_days,every_payment_under_33_days
0,65134441430300000,5.61348043746e+16,1.0,1.0
1,60134547419200000,6.01374307109e+16,0.0,0.0
2,63134425702500000,6.31367405497e+16,0.0,1.0
3,64134429266300000,8.91364591805e+16,1.0,1.0
4,65134432186900000,1.051365401401e+17,1.0,1.0
5,65134442822400000,5.61347619278e+16,0.0,0.0
6,63134364423800000,6.31371825239e+16,0.0,1.0
7,59134388500400000,5.9134985308e+16,1.0,1.0
8,61134406652800000,1.121372833516e+17,0.0,0.0
9,57134295592300000,5.71346675055e+16,0.0,0.0


In [33]:
# (rows, columns) of the outcomes table.
loan_outcomes.shape

(271, 4)

In [34]:
# Write the outcomes table to CSV, leaving out the row index.
loan_outcomes.to_csv('loan_outcomes_33_150.csv', index = False)