In [1]:
import os
import pandas as pd 
import glob
import random 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import math

In [2]:
pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',680)

## Read in data: 

In [3]:
os.listdir()

['categorize_loanpayer_per_person.ipynb',
 'numPeople_MultipleLoans.ipynb',
 '.DS_Store',
 'everyPaymentUnder35_paidWithin150.csv',
 'graph_loans_bus_ag_julia.ipynb',
 'different_loan_types_julia.ipynb',
 'Demographics_Education_Julia.ipynb',
 'truncating_loans_bus_ag_julia.ipynb',
 'Start_Balance_Ella-Julia.ipynb',
 'diaries_trx_trunc_loans.csv',
 'everyPaymentUnder33_paidWithin150.csv',
 'one_time_payments_julia.ipynb',
 'metrics_oneTimePayments.ipynb',
 'Truncation_Ella-Julia.ipynb',
 'consumption_julia.ipynb',
 'everyPaymentUnder33_paidWithin150_paid4times.csv',
 'steady_payments_or_not.ipynb',
 'diaries_trx_trunc_loans_start_bal.csv',
 '.ipynb_checkpoints',
 'graphs',
 'steady_payments_or_not-pureTruncElla.ipynb',
 'Pure_Truncation_Ella-Julia.ipynb',
 'initial_exploration_julia.ipynb']

In [4]:
trx = pd.read_csv('diaries_trx_trunc_loans_start_bal.csv', low_memory = False, dtype = {'new_account_ids': 'object'})

##  Formal Loans (FRMLN)

Step 1: Subset to loans w/ FRMLN designation: 

In [5]:
frmln_trx = trx.loc[trx['trx_family_code']=="FRMLN"]
frmln_trx.shape

(717, 64)

Step 2: What are the different types of formal loans and how many are there? 

In [6]:
frmln_trx.loc[frmln_trx['unique_accnts']==1].trx_type_desc.value_counts()

Individual Business or Agriculture Loan      30
Consumer/ personal loan (not payday loan)    28
Joint liability loan                         17
Name: trx_type_desc, dtype: int64

In [7]:
frmln_trx.loc[frmln_trx['unique_accnts']==1].trx_type_code.value_counts()

2760    30
2762    28
3247    17
Name: trx_type_code, dtype: int64

Step 2: Let's look at standard deviations of the payment amounts: 

In [8]:
for idx, acct_id in enumerate(list(frmln_trx.account_ids.unique())):
    print(idx, acct_id)
    loan_df = frmln_trx.loc[frmln_trx['account_ids']==acct_id]    
    loan_df = loan_df.copy()
    loan_df.sort_values("trx_stdtime_days_acc", inplace=True)
    
    #subset down only to payments
    payment_idxs = [idx for idx, purpose in enumerate(list(loan_df['trx_prx_purpose'])) if 'Payment' in purpose]
    payment_df= loan_df.iloc[payment_idxs]
    
    #figure out a vector of time between payments
    time_btw_payments = np.diff(np.array(payment_df.trx_stdtime_days_acc))
    
    print('time btw payments', time_btw_payments)
    print('std of time btw payments', np.std(time_btw_payments))
    print('mean of payment amount', np.mean(payment_df.trx_value_kes))
    print('-------------------------')
    
    

0 60137430710900000
time btw payments [31 34]
std of time btw payments 1.5
mean of payment amount 11233.333333333334
-------------------------
1 63136740549700000
time btw payments [30 30 31 31 30]
std of time btw payments 0.4898979485566356
mean of payment amount 2800.0
-------------------------
2 105136540140100000
time btw payments [74 30 31 31 30]
std of time btw payments 17.405746177627663
mean of payment amount 7166.666666666667
-------------------------
3 56134761927800000
time btw payments [29 62 32 28 31]
std of time btw payments 12.877888025604198
mean of payment amount 3575.0
-------------------------
4 112137283351600000
time btw payments [42]
std of time btw payments 0.0
mean of payment amount 5000.0
-------------------------
5 57134649143400000
time btw payments [28 36 27 33 30 28 28]
std of time btw payments 3.070597894314954
mean of payment amount 2050.0
-------------------------
6 105137049319900000
time btw payments [52 24 18]
std of time btw payments 14.8174071805952

  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [9]:
acct_id = list(frmln_trx.new_account_ids.unique())[0]

In [10]:
loan_df = frmln_trx.loc[frmln_trx['new_account_ids']==acct_id]    
loan_df = loan_df.copy()
loan_df.sort_values("trx_stdtime_days_acc", inplace=True)

In [11]:
loan_df.trx_stdtime_days_acc.max()

129

In [12]:
loan_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,hh_ids,unique_hhs,first_trx_date_hh,last_trx_date_hh,tot_hh_daysofobs,tot_hh_monthsofobs,interview_designation,int_date,int_month,int_year,int_yr_mo,first_int_date,account_ids,new_account_ids,unique_accnts,m_ids_owner,unique_hm_owner,account_bsheet_desig,account_startclose_balance,account_formal,account_liquid,first_trx_date_acc,last_trx_date_acc,tot_acc_daysofobs,tot_acc_monthsofobs,trx_id,m_ids_trx,trx_date,trx_month,trx_year,trx_yr_mo,trx_dq_round,trx_stdtime_days_hh,trx_stdtime_mnths_hh,trx_stdtime_days_acc,trx_stdtime_mnths_acc,trx_class_code,trx_class_desc,trx_family_code,trx_family_desc,trx_type_code,trx_type_desc,trx_prx_purpose,trx_prx_purpose_fd,trx_fee,trx_bsheet_direction,trx_mode_code,trx_mode_desc,trx_place_incommunity,trx_distance_km,trx_outlet,trx_direction,trx_value_kes,trx_value_usd,ddd_gift,trx_inkind_units,trx_inkind_value_usd,trx_inkind_value_kes,trx_stdtime_mnths_hh_nr,acc_unexplained_trx,bal_evol,start_bal
436,455,7277,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,04jun2013,6,2013,2013_06,04sep2012,60137430710900000,6.01374307109e+16,1.0,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60137430758700000,60134547419200000,27may2013,5,2013,2013_05,14.0,259,8,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,5. Interest accruing,,0.0,Increase,1.0,Cash,,,,,35000.0,411.7647,0,,,,8.931034,0.0,135000.0,100000.0
3138,3200,398018,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,04jun2013,6,2013,2013_06,04sep2012,60137430710900000,6.01374307109e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60137430753900000,60134547419200000,27may2013,5,2013,2013_05,14.0,259,8,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,2. New borrowing,New borrowing on financial liability,0.0,Increase,1.0,Cash,1.0,0.0,"01=Bank, MFI, SACCO, insurance branch",Inflow,100000.0,1176.471,0,,,,8.931034,0.0,100000.0,100000.0
1727,1784,124091,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,23jul2013,7,2013,2013_07,04sep2012,60137430710900000,6.01374307109e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60137589607700000,60134547419200000,06jul2013,7,2013,2013_07,17.0,299,10,40,1,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,09=At home or work (self or collector/agent),Outflow,11250.0,132.3529,0,,,,10.31034,0.0,123750.0,100000.0
1956,2014,176101,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,20aug2013,8,2013,2013_08,04sep2012,60137430710900000,6.01374307109e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60138373017000000,60134547419200000,06aug2013,8,2013,2013_08,19.0,330,11,71,2,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,11250.0,132.3529,0,,,,11.37931,0.0,112500.0,100000.0
2571,2632,295205,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,18sep2013,9,2013,2013_09,04sep2012,60137430710900000,6.01374307109e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60138557814100000,60134547419200000,09sep2013,9,2013,2013_09,21.0,364,12,105,3,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,09=At home or work (self or collector/agent),Outflow,11200.0,131.7647,0,,,,12.55172,0.0,101300.0,100000.0
2,2,27,KVIHK40,,10sep2012,03oct2013,388,13,06=Cleaning interview,04oct2013,10,2013,2013_10,04sep2012,60137430710900000,6.01374307109e+16,,60134547419200000,,Liability,Close,Formal,,27may2013,03oct2013,129,4,60138614254500000,60134547419200000,03oct2013,10,2013,2013_10,23.0,388,13,129,4,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,6. Closing Balance--End of last DQ,,0.0,,19.0,CLOSING BALANCE,,,,,101300.0,1191.765,0,,,,13.37931,0.0,101300.0,100000.0


Step 3: Investigate whether someone has a time period over "thresh" days, classify that loan into one category or another. 

In [13]:
# def find_borrowers_num_days_threshold(df, thresh, x_days):
#     # df is df of all trx of loans
#     # thresh is the cutoff for num days between loans 
#     # x_days is the number of days we want to look at a loan to evaluate regularity... e.g. 180. Should probably be the same 
#     # as the x_days in the next function as well. For example, if we look at 180 days, we should look at the num 
#     # times w/i that 180 days that someone paid back 180/x_days.floor() times....
    
#     loan_categories = pd.DataFrame()
#     for idx, acct_id in enumerate(list(df.new_account_ids.unique())):
#         print(idx, acct_id)
        
#         loan_df = frmln_trx.loc[frmln_trx['new_account_ids']==acct_id]    
#         loan_df = loan_df.copy()
#         loan_df.sort_values("trx_stdtime_days_acc", inplace=True)
#         owner_id = list(loan_df.m_ids_owner)[0]
#         key1 = 'every_payment_under_'+str(thresh)+'_days'
        
#         ## Create the balance evolution: 
#         loan_df.insert(58,"trx_val_dir",np.zeros(loan_df.shape[0]))

#         for index, row in loan_df.iterrows():
#             if loan_df.loc[index,"trx_bsheet_direction"]=="Increase":
#                 loan_df.at[index,"trx_val_dir"] = loan_df.loc[index,"trx_value_kes"]
#             elif loan_df.loc[index,"trx_bsheet_direction"]=="Decrease":
#                 loan_df.at[index,"trx_val_dir"] = -loan_df.loc[index,"trx_value_kes"]
#             else:
#                 print("Error neither increase nor decrease at index" + str(index))
#         loan_df.insert(59,"bal_evol",np.zeros(loan_df.shape[0]))
        
#         s=0
#         for index,row in loan_df.iterrows():
#             s+=loan_df.loc[index,"trx_val_dir"]
#             loan_df.at[index,"bal_evol"]=s    
        
        
#         #subset down only to payments
#         payment_idxs = [idx for idx, purpose in enumerate(list(loan_df['trx_prx_purpose'])) if 'Payment' in purpose]
#         payment_df= loan_df.iloc[payment_idxs]
        
        
#         # num times they should have paid in x_days
#         num_payments = len(payment_idxs)
#         num_times_should_have_paid = math.floor(x_days/thresh)
#         key2 = 'paid_'+str(num_times_should_have_paid)+"_times_in_"+str(x_days)
        
#         # calculate the array of time between payments: 
#         time_btw_payments = np.diff(np.array(payment_df.trx_stdtime_days_acc))
        
#         print('time btw payments', time_btw_payments)
#         print('std of time btw payments', np.std(time_btw_payments))
#         print('mean of payment amount', np.mean(payment_df.trx_value_kes))
#         print('-------------------------')

#         ints_over_thresh = [x for x in  time_btw_payments if x > thresh]
        
#         if len(ints_over_thresh)>0: 
#             if num_payments >= num_times_should_have_paid: 
#                 loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key1: 0, key2: 1}
#             else: 
#                 loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key1: 0, key2: 0}
#         else: 
#             if num_payments >= num_times_should_have_paid:
#                 loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key1: 1, key2: 1}
#             else: 
#                 loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key1: 0, key2: 0}
#         loan_categories = loan_categories.append(loan_categories_line, ignore_index = True)        
#     return loan_categories

In [33]:
def find_borrowers_paid_back_in_x_days(df, x_days):
    loan_categories = pd.DataFrame()
    for idx, acct_id in enumerate(list(df.new_account_ids.unique())):
        print(idx, acct_id)
        
        loan_df = frmln_trx.loc[frmln_trx['new_account_ids']==acct_id]    
        loan_df = loan_df.copy()
        loan_df.sort_values("trx_stdtime_days_acc", inplace=True)
        owner_id = list(loan_df.m_ids_owner)[0]

        days_observed = loan_df.trx_stdtime_days_acc.max()
        key = str('paid_back_in_'+ str(x_days)+'_days')
        
        if days_observed >= x_days: 
            # if the num days observed loan is greater than the num days we are looking at (e.g. 150 for 5 mos)
            # then we want to see if they paid back at the last trx before the x_days are up, 
            loan_x_days = loan_df.loc[loan_df['trx_stdtime_days_acc'] <= x_days]
        else:
            loan_x_days = loan_df
        
        if len(loan_x_days.bal_evol) > 0:
            balance_at_x_days = list(loan_x_days.bal_evol)[-1]
#             print(balance_at_x_days)

            if balance_at_x_days <= 0: 
#                 print('less than 0!')
                loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key: 1}
            else: 
#                 print('gt than 0!')
                loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key: 0}
        else:
            loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key: 'too_few_borrowings'}
        
        loan_categories = loan_categories.append(loan_categories_line, ignore_index = True)        
    return loan_categories

In [38]:
borrowers_paid_back_in_150_days_df = find_borrowers_paid_back_in_x_days(frmln_trx, 150)

0 60137430710900000.1
1 63136740549700000.1
2 105136540140100000.1
3 56134761927800000.1
4 112137283351600000.1
5 57134649143400000.1
6 105137049319900000.1
7 57136557858500000.2
8 57134702786200000.1
9 61134770599600000.1
10 57136539616900000.1
11 57136557858500000.1
12 61135352095000000.1
13 59134726342000000.2
14 59134753176900000.1
15 59134691680100000.2
16 84136886562100000.1
17 105137769890900000.1
18 59134745004500000.1
19 57134726357400000.1
20 59135021173400000.2
21 57135920565300000.1
22 60136531885600000.2
23 59136998025000000.1
24 59136376033300000.1
25 61134830133700000.3
26 62135858405800000.0
27 61134797983800000.1
28 50136756070500000.1
29 60134752045800000.1
30 105137414874400000.1
31 58134763288300000.1
32 61134804533700000.1
33 61134798041100000.1
34 61135010791300000.1
35 59136685821800000.1
36 61134770599600000.2
37 61136428866300000.1
38 61135409324300000.3
39 63136075164500000.1
40 63134804220400000.1
41 63134734786300000.2
42 63134734844700000.1
43 5913494257660

In [39]:
borrowers_paid_back_in_150_days_df

Unnamed: 0,m_ids_owner,new_account_ids,paid_back_in_150_days
0,60134547419200000,6.01374307109e+16,0
1,63134425702500000,6.31367405497e+16,0
2,65134432186900000,1.051365401401e+17,1
3,65134442822400000,5.61347619278e+16,0
4,61134406652800000,1.121372833516e+17,0
5,57134407687800000,5.71346491434e+16,0
6,56134397318700000,1.051370493199e+17,0
7,57134320447800000,5.71365578585e+16,0
8,57134339147100000,5.71347027862e+16,0
9,61134406243400000,6.11347705996e+16,0


In [40]:
def find_borrowers_num_days_threshold(df, borrowers_paid_back_in_x_days_df, thresh, x_days):
    
    # df is df of all trx of loans
    # thresh is the cutoff for num days between loans 
    # x_days = days of observation of the loan 

    loan_categories = pd.DataFrame()
    
    for idx, acct_id in enumerate(list(df.new_account_ids.unique())):
        
        print(idx, acct_id)
        
        loan_df = frmln_trx.loc[frmln_trx['new_account_ids']==acct_id]    
        loan_df = loan_df.copy()
        loan_df.sort_values("trx_stdtime_days_acc", inplace=True)
        owner_id = list(loan_df.m_ids_owner)[0]
        key1 = 'every_payment_under_'+str(thresh)+'_days'
        

        #subset down only to payments
        payment_idxs = [idx for idx, purpose in enumerate(list(loan_df['trx_prx_purpose'])) if 'Payment' in purpose]
        payment_df= loan_df.iloc[payment_idxs]
        
        # calculate the array of time between payments: 
        time_btw_payments = np.diff(np.array(payment_df.trx_stdtime_days_acc))
        
        # starting time 
        starting_time = list(loan_df.trx_stdtime_days_acc)[0]
        ending_time = list(loan_df.trx_stdtime_days_acc)[-1]
        num_to_add_to_time_btw_payments = x_days+starting_time - ending_time
        
        key_lookup = 'paid_back_in_'+str(x_days)+'_days'
        
        paid_back_x_days = borrowers_paid_back_in_150_days_df.loc[borrowers_paid_back_in_150_days_df['account_ids']==acct_id][key_lookup]
        
        if paid_back_x_days == 0: 
            np.append(time_btw_payments, num_to_add_to_time_btw_payments)
        else: 
            time_btw_payments = time_btw_payments
        
        print('time btw payments', time_btw_payments)
        print('std of time btw payments', np.std(time_btw_payments))
        print('mean of payment amount', np.mean(payment_df.trx_value_kes))
        print('-------------------------')

        ints_over_thresh = [x for x in  time_btw_payments if x > thresh]
        
        if len(ints_over_thresh)>0: 
            loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key1: 0}
        else: 
            loan_categories_line = {'new_account_ids': str(acct_id), 'm_ids_owner': owner_id, key1: 1}
        loan_categories = loan_categories.append(loan_categories_line, ignore_index = True)        
    return loan_categories

In [41]:
z = x.merge(y, 'outer', ['new_account_ids', 'm_ids_owner'])

In [42]:
z.shape

(106, 4)

In [44]:
def create_outcomes(df, thresh, x_days): 
    
    x = find_borrowers_paid_back_in_x_days(df, x_days)
    y = find_borrowers_num_days_threshold(df, x, thresh, x_days)
    
    
    z = x.merge(y, 'outer', ['new_account_ids', 'm_ids_owner'])
    return z 

In [45]:
outcomes = create_outcomes(frmln_trx, 33, 150)

0 60137430710900000.1
1 63136740549700000.1
2 105136540140100000.1
3 56134761927800000.1
4 112137283351600000.1
5 57134649143400000.1
6 105137049319900000.1
7 57136557858500000.2
8 57134702786200000.1
9 61134770599600000.1
10 57136539616900000.1
11 57136557858500000.1
12 61135352095000000.1
13 59134726342000000.2
14 59134753176900000.1
15 59134691680100000.2
16 84136886562100000.1
17 105137769890900000.1
18 59134745004500000.1
19 57134726357400000.1
20 59135021173400000.2
21 57135920565300000.1
22 60136531885600000.2
23 59136998025000000.1
24 59136376033300000.1
25 61134830133700000.3
26 62135858405800000.0
27 61134797983800000.1
28 50136756070500000.1
29 60134752045800000.1
30 105137414874400000.1
31 58134763288300000.1
32 61134804533700000.1
33 61134798041100000.1
34 61135010791300000.1
35 59136685821800000.1
36 61134770599600000.2
37 61136428866300000.1
38 61135409324300000.3
39 63136075164500000.1
40 63134804220400000.1
41 63134734786300000.2
42 63134734844700000.1
43 5913494257660

KeyError: 'account_ids'

In [46]:
outcomes.head(20)

Unnamed: 0,every_payment_under_33_days,m_ids_owner,new_account_ids,paid_back_in_150_days
0,0.0,60134547419200000,6.01374307109e+16,0
1,1.0,63134425702500000,6.31367405497e+16,0
2,1.0,65134432186900000,1.051365401401e+17,1
3,0.0,65134442822400000,5.61347619278e+16,0
4,0.0,61134406652800000,1.121372833516e+17,0
5,0.0,57134407687800000,5.71346491434e+16,0
6,0.0,56134397318700000,1.051370493199e+17,0
7,1.0,57134320447800000,5.71365578585e+16,0
8,0.0,57134339147100000,5.71347027862e+16,0
9,1.0,61134406243400000,6.11347705996e+16,0


In [28]:
outcomes.to_csv('everyPaymentUnder33_paidWithin150.csv', index = False)