In [2]:
import pandas as pd
import numpy as np
import math as math
import matplotlib.pyplot as plt
import seaborn as sns
import random
%matplotlib inline 

In [3]:
# Raise the column display limit so head() / describe() show every column
pd.options.display.max_columns = 60

In [4]:
# Load the raw transactions; "account_startclose_balance" is forced to str dtype
df = pd.read_csv(
    "../diaries_transactions_all.csv",
    dtype={"account_startclose_balance": str},
)

## For now I keep only "business and agriculture loans", "joint liability loans", "personal loans", "borrowing from an informal group"

In [5]:
# Keep only the loan products analysed in this notebook:
#   * family "FRMLN" with transaction type code 2760, 3247 or 2762
#   * family "INFGRP" with transaction type code 3395
is_formal_loan = (df["trx_family_code"] == "FRMLN") & df["trx_type_code"].isin([2760, 3247, 2762])
is_group_loan = (df["trx_family_code"] == "INFGRP") & (df["trx_type_code"] == 3395)

# .copy() makes dfb an independent frame: later cells add columns to it and
# write into it value by value, which should never touch (or warn about) df.
dfb = df[is_formal_loan | is_group_loan].copy()

# 1. Get rid of loans with unexplained balance adjustments

In [6]:
# List every distinct transaction purpose that occurs in the selected loans
pd.unique(dfb["trx_prx_purpose"])

array(['7. Closing Balance--End of last DQ',
       '6. Closing Balance--End of last DQ',
       '1. Starting balance (today)', '5. Interest accruing',
       'DECREASE--Unexplained balance adjustment',
       'INCREASE--Unexplained balance adjustment', '3. Payments',
       '4. Any known fees', '2. New borrowinng (individual portion)',
       '2. New borrowing', '7. Refund from lender'], dtype=object)

We insert a new column that is 0 if there is no unexplained balance adjustment for this account and 1 if there is one. The column is 1 or 0 for all entries that correspond to that account.

In [7]:
# Account-level flag column, initialised to 0.0 for every row and appended as
# the last column. assign() returns a new frame and simply overwrites the
# column if it already exists, so this cell can be re-run safely
# (DataFrame.insert raises a ValueError when the column is already present).
dfb = dfb.assign(acc_unexplained_trx=0.0)

For every unique account we check whether any of its transactions is an unexplained balance adjustment (increase or decrease). If so, the column `acc_unexplained_trx` is set to 1 for all entries of that account; otherwise it stays 0.

In [8]:
# Flag every row of an account that has at least one unexplained balance
# adjustment. Vectorised replacement for a per-account iterrows() scan: the
# resulting column is the same (1.0 for flagged accounts, 0.0 otherwise), but
# the frame is only scanned once instead of once per account.
unexplained_purposes = [
    "DECREASE--Unexplained balance adjustment",
    "INCREASE--Unexplained balance adjustment",
]
has_unexplained_trx = dfb["trx_prx_purpose"].isin(unexplained_purposes)

# Accounts with at least one such row. Missing account ids are dropped so that
# rows without an account id are never flagged.
flagged_accounts = dfb.loc[has_unexplained_trx, "account_ids"].dropna().unique()

# assign() returns a new frame, so the result does not depend on whether dfb
# was created as a filtered selection of another frame.
dfb = dfb.assign(
    acc_unexplained_trx=dfb["account_ids"].isin(flagged_accounts).astype(float)
)

### Let us check if it worked.

In [9]:
# Sanity check: collect every transaction purpose that occurs in accounts
# flagged 0. None of them may be an unexplained balance adjustment.
unexplained_purposes = {
    "DECREASE--Unexplained balance adjustment",
    "INCREASE--Unexplained balance adjustment",
}

# Accounts that have at least one row flagged 0 (missing account ids ignored)
kept_accounts = dfb.loc[dfb["acc_unexplained_trx"] == 0, "account_ids"].dropna().unique()

# All purposes found in those accounts, gathered in one pass over the frame
s = set(dfb.loc[dfb["account_ids"].isin(kept_accounts), "trx_prx_purpose"].unique())

# Fail loudly instead of relying on reading the printed set by eye
assert s.isdisjoint(unexplained_purposes), "accounts flagged 0 still contain unexplained balance adjustments"
print(s)

{'6. Closing Balance--End of last DQ', '3. Payments', '1. Starting balance (today)', '2. New borrowing', '5. Interest accruing', '7. Closing Balance--End of last DQ', '4. Any known fees', '7. Refund from lender', '2. New borrowinng (individual portion)'}


# Let's now only use the loans that have no unexplained balance adjustments.

In [10]:
# Keep only the accounts without unexplained balance adjustments.
# .copy() gives an independent frame, because the following cells insert a
# column into dfb and write into it row by row.
dfb = dfb.loc[dfb["acc_unexplained_trx"] == 0].copy()

# 2. Truncation into chunks with no new borrowing

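We first insert a column for the new account numbers. These are the old account numbers with a numeric suffix (".1", ".2", ...) that identifies the chunk. The suffix starts at 1 and increases by one at every new borrowing, so an account whose very first transaction is a new borrowing starts at ".2".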

In [11]:
# Uncomment the line below to drop the "new_account_ids" column, e.g. when the
# cells that create and fill it have to be re-run from scratch.
# del dfb["new_account_ids"]

In [12]:
# Add the "new_account_ids" column directly after "account_ids", filled with
# the placeholder "-" until the chunk labels are computed.
# Drop a left-over column from an earlier run first, so the cell is re-runnable
# (DataFrame.insert raises a ValueError if the column already exists).
if "new_account_ids" in dfb.columns:
    dfb = dfb.drop(columns="new_account_ids")

# Look the position up on dfb itself, the frame being modified, rather than on
# the unfiltered df.
dfb.insert(dfb.columns.get_loc("account_ids") + 1, "new_account_ids", "-")

### Now let's adjust this such that new borrowings are always the first thing on one day, and then run the same program as before.

In [13]:
# Purposes that start a new chunk. "borrowinng" (sic) is how the
# individual-portion label is spelt in the data (see the list of unique
# purposes above); the correctly spelt variant is accepted too, in case the
# data is ever cleaned.
NEW_BORROWING_PURPOSES = (
    "2. New borrowing",
    "2. New borrowinng (individual portion)",
    "2. New borrowing (individual portion)",
)
# Purposes that have to come first among the transactions of one day.
DAY_FIRST_PURPOSES = NEW_BORROWING_PURPOSES + ("1. Starting balance (today)",)

for acc in dfb["account_ids"].dropna().unique():
    # Rows of this account in chronological order. The stable sort keeps
    # same-day rows in their original relative order, so results are
    # reproducible from run to run.
    lna = dfb[dfb["account_ids"] == acc].sort_values("trx_stdtime_days_acc", kind="mergesort")

    # "indices" collects the row labels of this account in processing order:
    # day by day, with a new borrowing / starting balance first within its day.
    indices = []
    for d, day_rows in lna.groupby("trx_stdtime_days_acc", sort=False):
        is_day_first = day_rows["trx_prx_purpose"].isin(DAY_FIRST_PURPOSES)
        if is_day_first.sum() > 1:
            # Several new borrowings / starting balances on one day cannot be
            # ordered automatically. The rows of that day are left out, so they
            # keep the placeholder in "new_account_ids" and must be checked by hand.
            print("Several new borrowings on day " + str(d) +", check manually account " + acc + " in category "+ str(lna["trx_type_code"].unique()))
            continue
        # At most one row qualifies: put it first, then all other rows of the day
        indices.extend(day_rows.index[is_day_first])
        indices.extend(day_rows.index[~is_day_first])

    # v is the number of the chunk we are currently in, a the matching new
    # account number. The counter starts at 1 and every new borrowing opens
    # the next chunk, so the borrowing row itself already belongs to it.
    v = 1
    a = acc + ".1"
    for index, purpose in lna.loc[indices, "trx_prx_purpose"].items():
        if purpose in NEW_BORROWING_PURPOSES:
            v += 1
            a = acc + "." + str(v)
        # Write the chunk label back into the full dataset
        dfb.at[index, "new_account_ids"] = a

Several new borrowings on day 244, check manually account 61135946960100000 in category [3395]


The account 61135946960100000 seems to have several new borrowings on day 244.

In [14]:
# Show day, purpose, amount and balance-sheet direction of the flagged account
dfb.loc[
    dfb["account_ids"].eq("61135946960100000"),
    ["trx_stdtime_days_acc", "trx_prx_purpose", "trx_value_kes", "trx_bsheet_direction"],
]

Unnamed: 0,trx_stdtime_days_acc,trx_prx_purpose,trx_value_kes,trx_bsheet_direction
5052,285,7. Closing Balance--End of last DQ,3800.0,
43429,141,2. New borrowing,2000.0,Increase
118905,0,2. New borrowing,500.0,Increase
195059,5,3. Payments,500.0,Decrease
212855,244,2. New borrowing,1300.0,Increase
295268,244,2. New borrowing,500.0,Increase


### Should we just get rid of this account?

In [15]:
# Remove the account with several new borrowings on the same day
is_ambiguous_account = dfb["account_ids"].eq("61135946960100000")
dfb = dfb.loc[~is_ambiguous_account]

In [16]:
# Save the truncated loans; the row index is written as the first column
dfb.to_csv("diaries_trx_trunc_loans.csv", index=True)

### Careful: the column "unique_accnts" is not accurate anymore.

In [20]:
# List the new account ids that carry a chunk suffix (i.e. contain a ".")
list(filter(lambda label: "." in label, dfb["new_account_ids"]))

['56134804374600000.3',
 '60137430710900000.2',
 '63136740549700000.1',
 '89136459180500000.3',
 '105136540140100000.1',
 '56134761927800000.1',
 '63137182523900000.2',
 '59134985308000000.1',
 '112137283351600000.2',
 '57134667505500000.2',
 '59135902089500000.3',
 '57137812349800000.2',
 '108136904842400000.3',
 '61136145330900000.3',
 '57134649143400000.1',
 '105137049319900000.1',
 '59134760440200000.1',
 '50135447415100000.1',
 '65134752879600000.1',
 '50136912726500000.2',
 '59134692207100000.1',
 '58134814286100000.2',
 '57136557858500000.3',
 '57134702786200000.1',
 '57134702786200000.1',
 '57134702786200000.1',
 '57134702786200000.1',
 '56134761927800000.1',
 '63134753070600000.1',
 '105136612141800000.8',
 '61134770599600000.1',
 '59134666385100000.1',
 '105137637688600000.2',
 '57136539616900000.2',
 '57136557858500000.2',
 '62134908783500000.1',
 '105137101304000000.4',
 '61135352095000000.2',
 '56134691927000000.1',
 '59134726342000000.2',
 '63136740416600000.3',
 '8413655