In [1]:
import os
import pandas as pd 
import glob
import random 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 


In [2]:
# Raise pandas' display limits so wide and long frames are shown without truncation.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 680

## Read in data: 

In [5]:
# List the entries of the current working directory to locate the input CSV.
os.listdir('.')

['numPeople_MultipleLoans.ipynb',
 '.DS_Store',
 'graph_loans_bus_ag_julia.ipynb',
 'different_loan_types_julia.ipynb',
 'Demographics_Education_Julia.ipynb',
 'truncating_loans_bus_ag_julia.ipynb',
 'diaries_trx_trunc_loans.csv',
 'one_time_payments_julia.ipynb',
 'metrics_oneTimePayments.ipynb',
 'Truncation_Ella-Julia.ipynb',
 'consumption_julia.ipynb',
 'steady_payments_or_not.ipynb',
 '.ipynb_checkpoints',
 'graphs',
 'steady_payments_or_not-pureTruncElla.ipynb',
 'Pure_Truncation_Ella-Julia.ipynb',
 'initial_exploration_julia.ipynb']

In [6]:
# Read the transactions CSV that sits next to this notebook.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
TRX_CSV_PATH = 'diaries_trx_trunc_loans.csv'
trx = pd.read_csv(TRX_CSV_PATH, low_memory=False)

##  Formal Loans (FRMLN)

Step 1: Subset to loans w/ FRMLN designation: 

In [7]:
# Keep only the rows whose transaction family code is "FRMLN", then show the subset's shape.
is_frmln = trx['trx_family_code'] == "FRMLN"
frmln_trx = trx.loc[is_frmln]
frmln_trx.shape

(733, 61)

Step 2: What are the different types of formal loans and how many are there? 

In [8]:
# Count loan type descriptions over the rows flagged unique_accnts == 1
# (presumably one flagged row per account -- TODO confirm).
flagged_rows = frmln_trx.loc[frmln_trx['unique_accnts'] == 1]
flagged_rows['trx_type_desc'].value_counts()

Individual Business or Agriculture Loan      36
Consumer/ personal loan (not payday loan)    28
Joint liability loan                         17
Name: trx_type_desc, dtype: int64

In [9]:
# Same count as above, keyed by the numeric loan type code instead of its description.
flagged_rows = frmln_trx.loc[frmln_trx['unique_accnts'] == 1]
flagged_rows['trx_type_code'].value_counts()

2760    36
2762    28
3247    17
Name: trx_type_code, dtype: int64

Step 3: Observe the graphs of each kind of loan. 

## Notes on the graphs: 
1. Bus-ag (>250), cons-pers (>100) are good loans to use with a few exceptions.
2. If we remove 3 trx hier_pur, these are pretty good loans
3. Same as (2) with Joint liability
4. We probably shouldn't use M-SHWARI loans: they have short durations and very erratic payments, and many seem to have only 2 data points (to be confirmed).
5. School fees are useful
6. Group enterprise is fine to use; only one though

Step 2: Let's look at the standard deviation of the time between payments, and the mean payment amount, for each loan:

In [13]:
# For every loan account, print the gaps (in trx_stdtime_days_acc units) between
# consecutive payment rows, the standard deviation of those gaps, and the mean
# payment amount.
for idx, acct_id in enumerate(frmln_trx['account_ids'].unique()):
    print(idx, acct_id)
    # sort_values returns a new frame, so the slice of frmln_trx is never mutated.
    loan_df = frmln_trx.loc[frmln_trx['account_ids'] == acct_id].sort_values("trx_stdtime_days_acc")

    # Subset down to payments only. The isinstance guard skips missing (NaN)
    # purposes, which would otherwise raise TypeError on the `in` test.
    is_payment = np.array(
        [isinstance(purpose, str) and 'Payment' in purpose for purpose in loan_df['trx_prx_purpose']],
        dtype=bool,
    )
    payment_df = loan_df.loc[is_payment]

    # Vector of time between consecutive payments (empty when fewer than two payments).
    time_btw_payments = np.diff(payment_df['trx_stdtime_days_acc'].to_numpy())
    # np.std on an empty array emits RuntimeWarnings; report NaN directly instead.
    std_btw_payments = np.std(time_btw_payments) if time_btw_payments.size else np.nan

    print('time btw payments', time_btw_payments)
    print('std of time btw payments', std_btw_payments)
    print('mean of payment amount', payment_df['trx_value_kes'].mean())
    print('-------------------------')
    
    

0 60137430710900000
time btw payments [31 34]
std of time btw payments 1.5
mean of payment amount 11233.333333333334
-------------------------
1 63136740549700000
time btw payments [30 30 31 31 30]
std of time btw payments 0.4898979485566356
mean of payment amount 2800.0
-------------------------
2 105136540140100000
time btw payments [74 30 31 31 30]
std of time btw payments 17.405746177627663
mean of payment amount 7166.666666666667
-------------------------
3 56134761927800000
time btw payments [29 62 32 28 31]
std of time btw payments 12.877888025604198
mean of payment amount 3575.0
-------------------------
4 112137283351600000
time btw payments [42]
std of time btw payments 0.0
mean of payment amount 5000.0
-------------------------
5 57134649143400000
time btw payments [28 36 27 33 30 28 28]
std of time btw payments 3.070597894314954
mean of payment amount 2050.0
-------------------------
6 105137049319900000
time btw payments [52 24 18]
std of time btw payments 14.8174071805952

time btw payments [33  0]
std of time btw payments 16.5
mean of payment amount 3333.3333333333335
-------------------------
80 60136500338400000
time btw payments []
std of time btw payments nan
mean of payment amount 2000.0
-------------------------


  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


In [30]:
# Pick the first distinct account id as a worked example.
acct_id = frmln_trx['account_ids'].unique()[0]

In [32]:
# All rows of the example account, ordered by trx_stdtime_days_acc.
# sort_values returns a new frame, so frmln_trx itself is left untouched.
rows_for_account = frmln_trx['account_ids'] == acct_id
loan_df = frmln_trx.loc[rows_for_account].sort_values("trx_stdtime_days_acc")

In [34]:
# Largest trx_stdtime_days_acc value recorded for the example account.
loan_df['trx_stdtime_days_acc'].max()

129

In [57]:
# Display every row of the example account's (sorted) transactions for inspection.
loan_df

Unnamed: 0.1,Unnamed: 0,hh_ids,unique_hhs,first_trx_date_hh,last_trx_date_hh,tot_hh_daysofobs,tot_hh_monthsofobs,interview_designation,int_date,int_month,int_year,int_yr_mo,first_int_date,account_ids,new_account_ids,unique_accnts,m_ids_owner,unique_hm_owner,account_bsheet_desig,account_startclose_balance,account_formal,account_liquid,first_trx_date_acc,last_trx_date_acc,tot_acc_daysofobs,tot_acc_monthsofobs,trx_id,m_ids_trx,trx_date,trx_month,trx_year,trx_yr_mo,trx_dq_round,trx_stdtime_days_hh,trx_stdtime_mnths_hh,trx_stdtime_days_acc,trx_stdtime_mnths_acc,trx_class_code,trx_class_desc,trx_family_code,trx_family_desc,trx_type_code,trx_type_desc,trx_prx_purpose,trx_prx_purpose_fd,trx_fee,trx_bsheet_direction,trx_mode_code,trx_mode_desc,trx_place_incommunity,trx_distance_km,trx_outlet,trx_direction,trx_value_kes,trx_value_usd,ddd_gift,trx_inkind_units,trx_inkind_value_usd,trx_inkind_value_kes,trx_stdtime_mnths_hh_nr,acc_unexplained_trx
185,7277,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,04jun2013,6,2013,2013_06,04sep2012,60137430710900000,6.013743e+16,1.0,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60137430758700000,60134547419200000,27may2013,5,2013,2013_05,14.0,259,8,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,5. Interest accruing,,0.0,Increase,1.0,Cash,,,,,35000.0,411.7647,0,,,,8.931034,0.0
1435,398018,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,04jun2013,6,2013,2013_06,04sep2012,60137430710900000,6.013743e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60137430753900000,60134547419200000,27may2013,5,2013,2013_05,14.0,259,8,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,2. New borrowing,New borrowing on financial liability,0.0,Increase,1.0,Cash,1.0,0.0,"01=Bank, MFI, SACCO, insurance branch",Inflow,100000.0,1176.471,0,,,,8.931034,0.0
744,124091,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,23jul2013,7,2013,2013_07,04sep2012,60137430710900000,6.013743e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60137589607700000,60134547419200000,06jul2013,7,2013,2013_07,17.0,299,10,40,1,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,09=At home or work (self or collector/agent),Outflow,11250.0,132.3529,0,,,,10.31034,0.0
861,176101,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,20aug2013,8,2013,2013_08,04sep2012,60137430710900000,6.013743e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60138373017000000,60134547419200000,06aug2013,8,2013,2013_08,19.0,330,11,71,2,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,11250.0,132.3529,0,,,,11.37931,0.0
1169,295205,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,18sep2013,9,2013,2013_09,04sep2012,60137430710900000,6.013743e+16,,60134547419200000,,Liability,,Formal,,27may2013,03oct2013,129,4,60138557814100000,60134547419200000,09sep2013,9,2013,2013_09,21.0,364,12,105,3,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,09=At home or work (self or collector/agent),Outflow,11200.0,131.7647,0,,,,12.55172,0.0
1,27,KVIHK40,,10sep2012,03oct2013,388,13,06=Cleaning interview,04oct2013,10,2013,2013_10,04sep2012,60137430710900000,6.013743e+16,,60134547419200000,,Liability,Close,Formal,,27may2013,03oct2013,129,4,60138614254500000,60134547419200000,03oct2013,10,2013,2013_10,23.0,388,13,129,4,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,6. Closing Balance--End of last DQ,,0.0,,19.0,CLOSING BALANCE,,,,,101300.0,1191.765,0,,,,13.37931,0.0


Step 3: For each loan, check whether any gap between consecutive payments is longer than a threshold number of days (`day_nums`), and classify the loan into one of two categories accordingly.

In [38]:
def find_borrowers_num_days_threshold(df, day_nums, verbose=True):
    """Classify each loan account by whether any gap between payments exceeds ``day_nums``.

    For every distinct, non-missing ``account_ids`` value in ``df``, the account's
    rows are ordered by ``trx_stdtime_days_acc``, the rows whose
    ``trx_prx_purpose`` contains the text ``'Payment'`` are kept, and the
    differences between consecutive ``trx_stdtime_days_acc`` values of those
    rows are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions to classify. Must contain the columns ``account_ids``,
        ``m_ids_owner``, ``trx_stdtime_days_acc`` and ``trx_prx_purpose``
        (plus ``trx_value_kes`` when ``verbose`` is true).
    day_nums : number
        Threshold; a gap counts as too long only when it is strictly greater.
    verbose : bool, default True
        Print the per-account diagnostics (gaps, their standard deviation and
        the mean payment amount).

    Returns
    -------
    pandas.DataFrame
        One row per account with columns ``account_ids``, ``m_ids_owner`` and
        ``borrower_under_num_days``. The flag is 0 when at least one gap is
        greater than ``day_nums`` and 1 otherwise. Accounts with fewer than
        two payment rows have no gaps and are therefore flagged 1.
    """
    records = []
    for idx, acct_id in enumerate(df['account_ids'].dropna().unique()):
        # Filter the frame that was passed in, not a notebook-level global.
        loan_df = df.loc[df['account_ids'] == acct_id].sort_values(
            "trx_stdtime_days_acc", kind="stable"
        )
        owner_id = loan_df['m_ids_owner'].iloc[0]

        # Subset down to payments only; a missing (NaN) purpose is not a payment
        # and must not reach the substring test, which would raise TypeError.
        is_payment = np.array(
            [isinstance(purpose, str) and 'Payment' in purpose
             for purpose in loan_df['trx_prx_purpose']],
            dtype=bool,
        )
        payment_df = loan_df.loc[is_payment]

        # Vector of time between consecutive payments (empty for < 2 payments).
        time_btw_payments = np.diff(payment_df['trx_stdtime_days_acc'].to_numpy())

        if verbose:
            print(idx, acct_id)
            # np.std warns on an empty array; report NaN directly instead.
            std_btw_payments = np.std(time_btw_payments) if time_btw_payments.size else np.nan
            print('time btw payments', time_btw_payments)
            print('std of time btw payments', std_btw_payments)
            print('mean of payment amount', payment_df['trx_value_kes'].mean())
            print('-------------------------')

        has_long_gap = bool((time_btw_payments > day_nums).any())
        records.append({
            'account_ids': acct_id,
            'm_ids_owner': owner_id,
            'borrower_under_num_days': 0 if has_long_gap else 1,
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row is quadratic.
    return pd.DataFrame(
        records, columns=['account_ids', 'm_ids_owner', 'borrower_under_num_days']
    )

In [59]:
# Number of distinct account ids among the formal-loan transactions.
frmln_trx['account_ids'].unique().size

81

In [60]:
# Classify every formal-loan account using a 33-day threshold on gaps between payments.
x = find_borrowers_num_days_threshold(df=frmln_trx, day_nums=33)

0 60137430710900000
time btw payments [31 34]
std of time btw payments 1.5
mean of payment amount 11233.333333333334
-------------------------
1 63136740549700000
time btw payments [30 30 31 31 30]
std of time btw payments 0.4898979485566356
mean of payment amount 2800.0
-------------------------
2 105136540140100000
time btw payments [74 30 31 31 30]
std of time btw payments 17.405746177627663
mean of payment amount 7166.666666666667
-------------------------
3 56134761927800000
time btw payments [29 62 32 28 31]
std of time btw payments 12.877888025604198
mean of payment amount 3575.0
-------------------------
4 112137283351600000
time btw payments [42]
std of time btw payments 0.0
mean of payment amount 5000.0
-------------------------
5 57134649143400000
time btw payments [28 36 27 33 30 28 28]
std of time btw payments 3.070597894314954
mean of payment amount 2050.0
-------------------------
6 105137049319900000
time btw payments [52 24 18]
std of time btw payments 14.8174071805952

  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


 [35 28 35 35 28 28 28 35 28  2 26 35]
std of time btw payments 8.760311384623012
mean of payment amount 1881.5384615384614
-------------------------
44 60138558075900000
time btw payments []
std of time btw payments nan
mean of payment amount nan
-------------------------
45 63134734844700000
time btw payments [35 21 42 28]
std of time btw payments 7.826237921249264
mean of payment amount 1720.0
-------------------------
46 59134942576600000
time btw payments [ 37  36  51 117  13  32  24]
std of time btw payments 31.61244911068281
mean of payment amount 5200.0
-------------------------
47 59134942516400000
time btw payments [ 37  29  61  61 135]
std of time btw payments 37.446495163099044
mean of payment amount 2333.3333333333335
-------------------------
48 59135900709000000
time btw payments [118  61  68]
std of time btw payments 25.381533094401966
mean of payment amount 1050.0
-------------------------
49 61134788847800000
time btw payments []
std of time btw payments nan
mean of p

In [95]:
def find_borrowers_paid_back_in_x_days(df, x_days, verbose=True):
    """Flag, per loan account, whether the running balance is <= 0 by ``x_days``.

    For every distinct, non-missing ``account_ids`` value in ``df``, the account's
    rows are ordered by ``trx_stdtime_days_acc`` and a running balance is built:
    ``trx_value_kes`` is added for rows whose ``trx_bsheet_direction`` is
    ``"Increase"``, subtracted for ``"Decrease"``, and rows with any other
    direction (including missing) contribute nothing.

    If the account's largest ``trx_stdtime_days_acc`` is at least ``x_days``,
    the balance is read at the last row with ``trx_stdtime_days_acc <= x_days``;
    otherwise it is read at the account's last row.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions to classify. Must contain the columns ``account_ids``,
        ``m_ids_owner``, ``trx_stdtime_days_acc``, ``trx_bsheet_direction``
        and ``trx_value_kes``.
    x_days : number
        Length of the observation window, in the units of
        ``trx_stdtime_days_acc``.
    verbose : bool, default True
        Print per-account diagnostics and the final table.

    Returns
    -------
    pandas.DataFrame
        One row per account with columns ``account_ids`` (as ``str``),
        ``m_ids_owner`` and ``paid_back_in_<x_days>_days`` (1 when the balance
        read as described above is <= 0, else 0).
    """
    key = f'paid_back_in_{x_days}_days'
    direction_sign = {"Increase": 1.0, "Decrease": -1.0}
    records = []

    for idx, acct_id in enumerate(df['account_ids'].dropna().unique()):
        if verbose:
            print(idx, acct_id)

        # Filter the frame that was passed in, not a notebook-level global.
        loan_df = df.loc[df['account_ids'] == acct_id].sort_values(
            "trx_stdtime_days_acc", kind="stable"
        )
        owner_id = loan_df['m_ids_owner'].iloc[0]
        days_observed = loan_df['trx_stdtime_days_acc'].max()

        # +1 / -1 per row; NaN where the direction is neither Increase nor Decrease.
        sign = loan_df['trx_bsheet_direction'].map(direction_sign)
        no_direction = sign.isna().to_numpy()
        if verbose:
            for label in loan_df.index[no_direction]:
                print("Error neither increase nor decrease at index" + str(label))

        # Signed transaction values; rows without a direction count as 0.
        # Computed as arrays rather than inserted at fixed column positions,
        # which failed on any frame with fewer columns than the position used.
        signed_value = np.where(
            no_direction, 0.0, (loan_df['trx_value_kes'] * sign).to_numpy(dtype=float)
        )
        bal_evol = np.cumsum(signed_value)

        if days_observed >= x_days:
            # The loan was observed for the whole window: read the balance at
            # the last transaction on or before x_days.
            in_window = (loan_df['trx_stdtime_days_acc'] <= x_days).to_numpy()
            balance_at_x_days = bal_evol[in_window][-1]
        else:
            # Observation ended before x_days: use the last recorded balance.
            balance_at_x_days = bal_evol[-1]

        paid_back = int(balance_at_x_days <= 0)
        if verbose:
            print(balance_at_x_days)
            print('less than 0!' if paid_back else 'gt than 0!')

        records.append({'account_ids': str(acct_id), 'm_ids_owner': owner_id, key: paid_back})

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # appending row by row is quadratic.
    loan_categories = pd.DataFrame(records, columns=['account_ids', 'm_ids_owner', key])
    if verbose:
        print(loan_categories)
    return loan_categories

In [96]:
# Flag every formal-loan account by whether its balance is <= 0 within a 150-day window.
y = find_borrowers_paid_back_in_x_days(df=frmln_trx, x_days=150)

0 60137430710900000
Error neither increase nor decrease at index1
101300.0
gt than 0!
1 63136740549700000
Error neither increase nor decrease at index44
2800.0
gt than 0!
2 105136540140100000
Error neither increase nor decrease at index413
45000.0
gt than 0!
3 56134761927800000
Error neither increase nor decrease at index27
2720.0
gt than 0!
4 112137283351600000
Error neither increase nor decrease at index8
51200.0
gt than 0!
5 57134649143400000
Error neither increase nor decrease at index436
6150.0
gt than 0!
6 105137049319900000
Error neither increase nor decrease at index69
253608.0
gt than 0!
7 57136557858500000
Error neither increase nor decrease at index341
3900.0
gt than 0!
8 57134702786200000
Error neither increase nor decrease at index509
32500.0
gt than 0!
9 61134770599600000
Error neither increase nor decrease at index423
40000.0
gt than 0!
10 57136539616900000
Error neither increase nor decrease at index33
40000.0
gt than 0!
11 61135352095000000
Error neither increase nor d

In [97]:
# Display the per-account paid-back flags returned by find_borrowers_paid_back_in_x_days.
y

Unnamed: 0,account_ids,m_ids_owner,paid_back_in_150_days
0,60137430710900000,60134547419200000,0.0
1,63136740549700000,63134425702500000,0.0
2,105136540140100000,65134432186900000,0.0
3,56134761927800000,65134442822400000,0.0
4,112137283351600000,61134406652800000,0.0
5,57134649143400000,57134407687800000,0.0
6,105137049319900000,56134397318700000,0.0
7,57136557858500000,57134320447800000,0.0
8,57134702786200000,57134339147100000,0.0
9,61134770599600000,61134406243400000,0.0
