In [1]:
import pandas as pd
import numpy as np
import math as math
import matplotlib.pyplot as plt
import seaborn as sns
import random
%matplotlib inline 

In [2]:
# Show up to 60 columns so that head() and describe() do not truncate wide frames.
pd.options.display.max_columns = 60

In [3]:
# Load all diary transactions. Account ids are 17-digit identifiers that are
# later extended with a ".1", ".2", ... suffix and compared against string
# literals, so they are read as text explicitly rather than left to dtype
# inference (a float could not hold them exactly anyway).
df = pd.read_csv(
    "kenya_data/diaries_transactions_all.csv",
    dtype={"account_startclose_balance": str, "account_ids": str},
)

In [5]:
# Keep only formal-loan transactions of type 2760 or 3247.
# .copy() makes dfb an independent frame: columns are inserted and cells are
# overwritten on it below, which must not act on (or warn about) a slice of df.
dfb = df[(df["trx_family_code"] == "FRMLN") & df["trx_type_code"].isin([2760, 3247])].copy()

# Get rid of loans with unexplained balance adjustments

In [6]:
# List the distinct transaction purposes that occur in the loan subset.
dfb["trx_prx_purpose"].unique()

array(['6. Closing Balance--End of last DQ',
       '1. Starting balance (today)', '5. Interest accruing',
       '3. Payments', 'INCREASE--Unexplained balance adjustment',
       '2. New borrowinng (individual portion)', '4. Any known fees',
       'DECREASE--Unexplained balance adjustment',
       '7. Refund from lender', '2. New borrowing'], dtype=object)

We insert a new column that is 0 if there is no unexplained balance adjustment for this account and 1 if there is one. The column has the same value (1 or 0) for all entries that belong to the same account.

In [7]:
# Append the flag column as the last column, initialised to 0.0 for every row.
dfb.insert(len(dfb.columns), "acc_unexplained_trx", np.zeros(len(dfb)))

This program goes through all unique accounts and checks whether the account has an unexplained balance adjustment; if so, it sets the variable u to 1 (otherwise u stays 0). A second for-loop then writes u into the new column for every entry of that account, so the column is 1 for all accounts that have an unexplained balance adjustment and 0 for all others.

In [9]:
# Flag every transaction of an account that has at least one unexplained
# balance adjustment (1.0), and leave all other accounts at 0.0.
# Done with vectorized lookups instead of nested iterrows() loops, which
# rescanned the whole frame once per account.
UNEXPLAINED_PURPOSES = [
    "DECREASE--Unexplained balance adjustment",
    "INCREASE--Unexplained balance adjustment",
]
is_unexplained = dfb["trx_prx_purpose"].isin(UNEXPLAINED_PURPOSES)
# dropna(): rows without an account id cannot be attributed to an account,
# so they never cause other rows to be flagged.
flagged_accounts = dfb.loc[is_unexplained, "account_ids"].dropna().unique()
dfb["acc_unexplained_trx"] = dfb["account_ids"].isin(flagged_accounts).astype(float)

### Let us check if it worked.

In [19]:
# Collect every purpose that occurs in accounts flagged as clean (flag == 0);
# no "Unexplained balance adjustment" label should show up in the printed set.
s = set()
clean_accounts = dfb.loc[dfb["acc_unexplained_trx"] == 0, "account_ids"].unique()
for acc in clean_accounts:
    s.update(dfb.loc[dfb["account_ids"] == acc, "trx_prx_purpose"].unique())
print(s)

{'4. Any known fees', '2. New borrowing', '3. Payments', '5. Interest accruing', '7. Refund from lender', '6. Closing Balance--End of last DQ', '1. Starting balance (today)', '2. New borrowinng (individual portion)'}


### Let's now only use the loans that have no unexplained balance adjustments.

In [75]:
# Keep only accounts without unexplained balance adjustments.
# .copy() detaches the result from the previous frame, because columns are
# inserted into it and cells are overwritten in the cells below.
dfb = dfb[dfb["acc_unexplained_trx"] == 0].copy()

# Truncation into chunks with no new borrowing

We first insert a column for the new account numbers. These are the old account numbers with a suffix (.1, .2, ...) appended; the suffix increases with each new borrowing, so that every chunk of an account gets its own id.

In [125]:
# Remove a previously created "new_account_ids" column so the insert below can
# be re-run. Guarded, because on a fresh kernel the column does not exist yet
# and an unconditional del would raise KeyError.
if "new_account_ids" in dfb.columns:
    del dfb["new_account_ids"]

In [126]:
# Insert the placeholder column directly to the right of "account_ids".
# The position is looked up in dfb itself (not in df), so it stays correct
# even if the two frames' column layouts differ.
dfb.insert(dfb.columns.get_loc("account_ids") + 1, "new_account_ids", ["-"] * len(dfb))

In [127]:
# Peek at one randomly drawn row to confirm the new column is in place.
dfb.sample(n=1)

Unnamed: 0,hh_ids,unique_hhs,first_trx_date_hh,last_trx_date_hh,tot_hh_daysofobs,tot_hh_monthsofobs,interview_designation,int_date,int_month,int_year,int_yr_mo,first_int_date,account_ids,new_account_ids,unique_accnts,m_ids_owner,unique_hm_owner,account_bsheet_desig,account_startclose_balance,account_formal,account_liquid,first_trx_date_acc,last_trx_date_acc,tot_acc_daysofobs,tot_acc_monthsofobs,trx_id,m_ids_trx,trx_date,trx_month,trx_year,trx_yr_mo,trx_dq_round,trx_stdtime_days_hh,trx_stdtime_mnths_hh,trx_stdtime_days_acc,trx_stdtime_mnths_acc,trx_class_code,trx_class_desc,trx_family_code,trx_family_desc,trx_type_code,trx_type_desc,trx_prx_purpose,trx_prx_purpose_fd,trx_fee,trx_bsheet_direction,trx_mode_code,trx_mode_desc,trx_place_incommunity,trx_distance_km,trx_outlet,trx_direction,trx_value_kes,trx_value_usd,ddd_gift,trx_inkind_units,trx_inkind_value_usd,trx_inkind_value_kes,trx_stdtime_mnths_hh_nr,acc_unexplained_trx
398120,KVIHK40,,10sep2012,03oct2013,388,13,04=Diaries Interview,19mar2013,3,2013,2013_03,04sep2012,60134978362300000,-,,60134547419200000,,Liability,,Formal,,02oct2012,03oct2013,366,12,60136369687100000,60134547419200000,08mar2013,3,2013,2013_03,9.0,179,6,157,5,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,07=Group meeting,Outflow,5500.0,64.70588,0,,,,6.172414,0.0


In [157]:
# Pick one account at random among those with a "2. New borrowing" transaction.
borrowing_rows = dfb[dfb["trx_prx_purpose"] == "2. New borrowing"]
acc = random.choice(borrowing_rows["account_ids"].unique())

In [158]:
# Show which account id was drawn.
print(acc)

60135893778700000


acc = 59135900709000000

In [214]:
# All transactions of the selected account.
lna = dfb.loc[dfb["account_ids"] == acc]

In [215]:
# Order the account's transactions by days since the account's first transaction.
lna = lna.sort_values(by="trx_stdtime_days_acc", ascending=True)

In [216]:
# Display the sorted transactions of the selected account.
lna

Unnamed: 0,hh_ids,unique_hhs,first_trx_date_hh,last_trx_date_hh,tot_hh_daysofobs,tot_hh_monthsofobs,interview_designation,int_date,int_month,int_year,int_yr_mo,first_int_date,account_ids,new_account_ids,unique_accnts,m_ids_owner,unique_hm_owner,account_bsheet_desig,account_startclose_balance,account_formal,account_liquid,first_trx_date_acc,last_trx_date_acc,tot_acc_daysofobs,tot_acc_monthsofobs,trx_id,m_ids_trx,trx_date,trx_month,trx_year,trx_yr_mo,trx_dq_round,trx_stdtime_days_hh,trx_stdtime_mnths_hh,trx_stdtime_days_acc,trx_stdtime_mnths_acc,trx_class_code,trx_class_desc,trx_family_code,trx_family_desc,trx_type_code,trx_type_desc,trx_prx_purpose,trx_prx_purpose_fd,trx_fee,trx_bsheet_direction,trx_mode_code,trx_mode_desc,trx_place_incommunity,trx_distance_km,trx_outlet,trx_direction,trx_value_kes,trx_value_usd,ddd_gift,trx_inkind_units,trx_inkind_value_usd,trx_inkind_value_kes,trx_stdtime_mnths_hh_nr,acc_unexplained_trx
260932,KVIHK14,,25sep2012,25sep2013,365,12,04=Diaries Interview,23jan2013,1,2013,2013_01,27aug2012,60135893778700000,-,1.0,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60135893829500000,60134365798400000,08jan2013,1,2013,2013_01,7.0,105,3,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,500.0,5.882353,0,Farm inputs,74.11765,6300.0,3.62069,0.0
281777,KVIHK14,,25sep2012,25sep2013,365,12,04=Diaries Interview,23jan2013,1,2013,2013_01,27aug2012,60135893778700000,-,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60135893834700000,HH,08jan2013,1,2013,2013_01,7.0,105,3,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,2. New borrowing,New borrowing on financial liability,0.0,Increase,10.0,"In-kind (trade, good, service--NOT MONEY)",1.0,0.0,"12=Other institution (school, clinic, church)",Inflow,6500.0,76.47059,0,Farm inputs,76.47059,6500.0,3.62069,0.0
400489,KVIHK14,,25sep2012,25sep2013,365,12,06=Cleaning interview,26sep2013,9,2013,2013_09,27aug2012,60135893778700000,-,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60138354053600000,60134365798400000,05mar2013,3,2013,2013_03,22.0,161,5,56,1,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,2000.0,23.52941,0,,,,5.551724,0.0
194824,KVIHK14,,25sep2012,25sep2013,365,12,06=Cleaning interview,26sep2013,9,2013,2013_09,27aug2012,60135893778700000,-,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60138354057900000,60134365798400000,07may2013,5,2013,2013_05,22.0,224,7,119,4,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,2000.0,23.52941,0,,,,7.724138,0.0
275798,KVIHK14,,25sep2012,25sep2013,365,12,06=Cleaning interview,26sep2013,9,2013,2013_09,27aug2012,60135893778700000,-,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60138354057900000,60134365798400000,25jun2013,6,2013,2013_06,22.0,273,9,168,5,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,2000.0,23.52941,0,,,,9.413794,0.0
15825,KVIHK14,,25sep2012,25sep2013,365,12,04=Diaries Interview,25sep2013,9,2013,2013_09,27aug2012,60135893778700000,-,,HH,,Liability,Close,Formal,,08jan2013,25sep2013,260,8,60138045552400000,60134365798400000,25sep2013,9,2013,2013_09,21.0,365,12,260,8,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,6. Closing Balance--End of last DQ,,0.0,,19.0,CLOSING BALANCE,,,,,0.0,0.0,0,,,,12.58621,0.0


## Now that I see this account I realize that we actually need to consider that there might be transactions on the same day. So we need to check whether there is a new borrowing on any given day, not any given row.

In [217]:
# Append a ".1" suffix to the account id and show what happens when the
# result is converted to a float and printed with 20 decimals.
a = lna["account_ids"].iloc[0]
print(type(a))
a += ".1"
print(a)
as_float = float(a)
print('{:.20f}'.format(as_float))

<class 'str'>
60135893778700000.1
60135893778700000.00000000000000000000


The float does not keep the .1: a 64-bit float has a 53-bit mantissa, i.e. only about 15-17 significant decimal digits, so a 17-digit account number already uses up all the available precision and the fractional part is lost. We therefore keep the account numbers as strings.

In [218]:
# List the distinct transaction purposes that occur in dfb at this point.
dfb["trx_prx_purpose"].unique()

array(['6. Closing Balance--End of last DQ',
       '1. Starting balance (today)', '5. Interest accruing',
       '2. New borrowinng (individual portion)', '3. Payments',
       '4. Any known fees', '7. Refund from lender', '2. New borrowing'],
      dtype=object)

In [219]:
# Rows before any new borrowing get "<account id>.1"; each new borrowing row
# opens the next chunk ("<account id>.2", ".3", ...) and all following rows
# inherit that id until the next new borrowing.
i = 1
acc = lna["account_ids"].unique()[0]
a = acc + ".1"
new_borrowing_labels = ("2. New borrowinng (individual portion)", "2. New borrowing")
for index, purpose in lna["trx_prx_purpose"].items():
    if purpose in new_borrowing_labels:
        i += 1
        a = acc + "." + str(i)
    lna.at[index, "new_account_ids"] = a
        

In [220]:
# Show only the columns needed to verify the chunk labelling.
lna.loc[:, ["account_ids", "new_account_ids", "trx_stdtime_days_acc", "trx_prx_purpose", "acc_unexplained_trx"]]

Unnamed: 0,account_ids,new_account_ids,trx_stdtime_days_acc,trx_prx_purpose,acc_unexplained_trx
260932,60135893778700000,6.01358937787e+16,0,3. Payments,0.0
281777,60135893778700000,6.01358937787e+16,0,2. New borrowing,0.0
400489,60135893778700000,6.01358937787e+16,56,3. Payments,0.0
194824,60135893778700000,6.01358937787e+16,119,3. Payments,0.0
275798,60135893778700000,6.01358937787e+16,168,3. Payments,0.0
15825,60135893778700000,6.01358937787e+16,260,6. Closing Balance--End of last DQ,0.0


In [221]:
# List the distinct chunk ids assigned to this account.
lna["new_account_ids"].unique()

array(['60135893778700000.1', '60135893778700000.2'], dtype=object)

### Now let's adjust this such that new borrowings are always the first thing on one day.

In [240]:
# Purpose labels exactly as they occur in the data. The individual-portion
# label is misspelled in the data ("borrowinng") and must be matched verbatim.
NEW_BORROWING_PURPOSES = ["2. New borrowing", "2. New borrowinng (individual portion)"]
# Rows that must come first within a day: new borrowings and starting balances.
DAY_OPENER_PURPOSES = NEW_BORROWING_PURPOSES + ["1. Starting balance (today)"]


def _chunk_labels(acc, acc_trx):
    """Return the chunk id ("<acc>.<n>") for every transaction of one account.

    Parameters
    ----------
    acc : account id of the group (converted with str() before use).
    acc_trx : DataFrame holding that account's rows; must contain the columns
        "trx_stdtime_days_acc" and "trx_prx_purpose".

    Returns
    -------
    pandas.Series indexed like ``acc_trx`` (in day order) whose values are the
    new account ids.

    Rows are ordered by day; within a day, opener rows (new borrowing or
    starting balance) are placed before all other rows, otherwise the incoming
    row order is kept. Every new-borrowing row starts a new chunk. Rows that
    precede the first new borrowing share chunk 1 with it.
    NOTE(review): this keeps the numbering of the loop it replaces (counter
    starting at 0, default id ".1"); confirm that merging a leading
    starting-balance chunk with the first borrowing chunk is intended.
    """
    is_opener = acc_trx["trx_prx_purpose"].isin(DAY_OPENER_PURPOSES)
    days = acc_trx["trx_stdtime_days_acc"]

    # Several openers on one day cannot be ordered automatically: report them,
    # but still label the rows instead of silently leaving them unlabelled.
    openers_per_day = days[is_opener].value_counts()
    for day in openers_per_day[openers_per_day > 1].index:
        print("Check manually: several new borrowings on the same day. "
              "(account {}, day {})".format(acc, day))

    # np.lexsort is stable and uses the LAST key as primary key:
    # sort by day, then openers (False sorts first) before the other rows.
    order = np.lexsort(((~is_opener).values, days.values))
    ordered = acc_trx.iloc[order]

    # Running count of new borrowings = chunk number; rows before the first
    # new borrowing would be 0 and are lifted to chunk 1.
    chunk_no = ordered["trx_prx_purpose"].isin(NEW_BORROWING_PURPOSES).cumsum().clip(lower=1)
    return str(acc) + "." + chunk_no.astype(str)


# One pass per unique account (groupby skips rows without an account id).
# Labels are collected first and written in one step, so dfb is not modified
# while it is being iterated.
label_parts = [
    _chunk_labels(acc, acc_trx)
    for acc, acc_trx in dfb.groupby("account_ids", sort=False)
]
if label_parts:
    new_ids = pd.concat(label_parts)
    dfb.loc[new_ids.index, "new_account_ids"] = new_ids

In [243]:
# Inspect the labelled transactions of the example account in day order.
dfb.loc[dfb["account_ids"] == "60135893778700000"].sort_values(by="trx_stdtime_days_acc")

Unnamed: 0,hh_ids,unique_hhs,first_trx_date_hh,last_trx_date_hh,tot_hh_daysofobs,tot_hh_monthsofobs,interview_designation,int_date,int_month,int_year,int_yr_mo,first_int_date,account_ids,new_account_ids,unique_accnts,m_ids_owner,unique_hm_owner,account_bsheet_desig,account_startclose_balance,account_formal,account_liquid,first_trx_date_acc,last_trx_date_acc,tot_acc_daysofobs,tot_acc_monthsofobs,trx_id,m_ids_trx,trx_date,trx_month,trx_year,trx_yr_mo,trx_dq_round,trx_stdtime_days_hh,trx_stdtime_mnths_hh,trx_stdtime_days_acc,trx_stdtime_mnths_acc,trx_class_code,trx_class_desc,trx_family_code,trx_family_desc,trx_type_code,trx_type_desc,trx_prx_purpose,trx_prx_purpose_fd,trx_fee,trx_bsheet_direction,trx_mode_code,trx_mode_desc,trx_place_incommunity,trx_distance_km,trx_outlet,trx_direction,trx_value_kes,trx_value_usd,ddd_gift,trx_inkind_units,trx_inkind_value_usd,trx_inkind_value_kes,trx_stdtime_mnths_hh_nr,acc_unexplained_trx
260932,KVIHK14,,25sep2012,25sep2013,365,12,04=Diaries Interview,23jan2013,1,2013,2013_01,27aug2012,60135893778700000,6.01358937787e+16,1.0,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60135893829500000,60134365798400000,08jan2013,1,2013,2013_01,7.0,105,3,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,500.0,5.882353,0,Farm inputs,74.11765,6300.0,3.62069,0.0
281777,KVIHK14,,25sep2012,25sep2013,365,12,04=Diaries Interview,23jan2013,1,2013,2013_01,27aug2012,60135893778700000,6.01358937787e+16,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60135893834700000,HH,08jan2013,1,2013,2013_01,7.0,105,3,0,0,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,2. New borrowing,New borrowing on financial liability,0.0,Increase,10.0,"In-kind (trade, good, service--NOT MONEY)",1.0,0.0,"12=Other institution (school, clinic, church)",Inflow,6500.0,76.47059,0,Farm inputs,76.47059,6500.0,3.62069,0.0
400489,KVIHK14,,25sep2012,25sep2013,365,12,06=Cleaning interview,26sep2013,9,2013,2013_09,27aug2012,60135893778700000,6.01358937787e+16,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60138354053600000,60134365798400000,05mar2013,3,2013,2013_03,22.0,161,5,56,1,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,2000.0,23.52941,0,,,,5.551724,0.0
194824,KVIHK14,,25sep2012,25sep2013,365,12,06=Cleaning interview,26sep2013,9,2013,2013_09,27aug2012,60135893778700000,6.01358937787e+16,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60138354057900000,60134365798400000,07may2013,5,2013,2013_05,22.0,224,7,119,4,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,2000.0,23.52941,0,,,,7.724138,0.0
275798,KVIHK14,,25sep2012,25sep2013,365,12,06=Cleaning interview,26sep2013,9,2013,2013_09,27aug2012,60135893778700000,6.01358937787e+16,,HH,,Liability,,Formal,,08jan2013,25sep2013,260,8,60138354057900000,60134365798400000,25jun2013,6,2013,2013_06,22.0,273,9,168,5,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,3. Payments,Repayment on financial liability,0.0,Decrease,1.0,Cash,1.0,0.0,"12=Other institution (school, clinic, church)",Outflow,2000.0,23.52941,0,,,,9.413794,0.0
15825,KVIHK14,,25sep2012,25sep2013,365,12,04=Diaries Interview,25sep2013,9,2013,2013_09,27aug2012,60135893778700000,6.01358937787e+16,,HH,,Liability,Close,Formal,,08jan2013,25sep2013,260,8,60138045552400000,60134365798400000,25sep2013,9,2013,2013_09,21.0,365,12,260,8,findev,"Borrowing, lending, savings or insurance media...",FRMLN,Formal loan,2760,Individual Business or Agriculture Loan,6. Closing Balance--End of last DQ,,0.0,,19.0,CLOSING BALANCE,,,,,0.0,0.0,0,,,,12.58621,0.0
