In [273]:
# Imports
import numpy as np
import datetime,os
import pandas as pd
from fuzzywuzzy import fuzz
import pyodbc
import re
from itertools import product
from ipywidgets import IntProgress
from IPython.display import display

# Identify matches in df and STR
def find_matches(df0,df1,THRESHOLD=0.05):
    # Iterate over all possible pairs
    l_duplicated = []
    for k in product(df0.index.tolist(),df1.index.tolist()):
        if k[0]==k[1]: continue
        # Single out corresponding data rows
        x0 = df0.loc[k[0]].tolist()
        x1 = df1.loc[k[1]].tolist()
        # Accumulate divergence
        c_div = 0
        for kk in range(len(x0)):
            # Distinguish time stamps, strings and floats
            if type(x0[kk]) is pd._libs.tslibs.timestamps.Timestamp:
                c_div += np.abs((x0[kk]-x1[kk]).days)
            elif type(x0[kk]) is str:
                c_div += float((100-fuzz.ratio(x0[kk],x1[kk]))/100)
            elif type(x0[kk]) is np.float64:
                #c_div += np.abs(float(x0[kk])-float(x1[kk]))/(x0[kk]+0.01)
                c_div += np.abs(np.linalg.norm(float(x0[kk])-float(x1[kk]))/max(np.linalg.norm(x1[kk]),0.01))
            else:
                raise Exception('Column name does not match expected format.')
        # Compare to threshold
        if c_div<THRESHOLD: l_duplicated.append(k)
    # Build into a pandas data frame
    matches = pd.DataFrame(l_duplicated).rename(columns={0:'Index 0',1:'Index 1'})
    return matches

# Read in STR table
def read_STR(dbpath):
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    # Read Single Transactions table
    STR = pd.read_sql_query(
    '''
    SELECT * FROM STR
    '''
    ,conn).reset_index(drop=True)
    # Close connection
    conn.close()
    # Transform STR
    STR['Date_booked'] = pd.to_datetime(STR['Date_booked'])
    STR['Date_ordered'] = pd.to_datetime(STR['Date_ordered'])
    STR['Value_transaction'] = STR['Value_transaction'].astype('float')
    return STR

# Read in TRI table
def read_TRI(dbpath):
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    # Read Transaction Information table
    TRI = pd.read_sql_query(
    '''
    SELECT * FROM TRINFORMATION
    '''
    ,conn).reset_index(drop=True)
    # Close connection
    conn.close()
    return TRI

# Compare frames and determine similarities
def find_sims(t0,t1):
    # Make sure columns are the same in both input frames
    assert set(t0.columns.tolist())==set(t1.columns.tolist()), 'Columns in input frames are not the same'
    # Create renaming doctionary
    c_map = dict()
    for k in range(2,t0.columns.shape[0]+2):
        c_map[k] = t0.columns.tolist()[k-2]
    # Iterate over all indices
    sim = []
    for k in product(t0.index.tolist(),t1.index.tolist()):
        if k[0]==k[1]: continue
        x0 = t0.loc[k[0]].tolist()
        x1 = t1.loc[k[1]].tolist()
        c_div = [k[0],k[1]]
        for kk in range(len(x0)):
            if type(x0[kk]) is pd._libs.tslibs.timestamps.Timestamp: c_div.append(np.abs((x0[kk]-x1[kk]).days))
            elif type(x0[kk]) is str: c_div.append(float((100-fuzz.ratio(x0[kk],x1[kk]))/100))
            elif type(x0[kk]) is np.float64: c_div.append(np.abs(float(x0[kk])-float(x1[kk])))
            else: raise Exception('Column name does not match expected format.')
        sim.append(c_div)
    # Build output frame
    rf = pd.DataFrame(sim).rename(columns={
        0:'T0 index',
        1:'T1 index'
    }).rename(columns=c_map)
    rf['Total Divergence'] = rf.drop(columns=['T0 index','T1 index']).sum(axis=1)
    return rf

In [183]:
# Load data

dbpath = '..\\..\\..\\..\\banking.accdb'

STR = read_STR(dbpath)
TRI = read_TRI(dbpath)

bf = pd.merge(STR,TRI,on='KEYID',how='left')

In [184]:
# Identify single transactions by known keywords 

bf.loc[bf[(bf['Text_transaction']=='BARGELDAUSZAHLUNG')].index,'CTYPE']= 'SINGLE'
bf.loc[bf[(bf['Text_transaction']=='BARGELDAUSZAHLUNG')].index,'EPARTYID']= '0'
bf.loc[bf[(bf['Text_transaction']=='BARGELDAUSZAHLUNG')].index,'EPARTYNAME']= 'CASHOUT'

In [185]:
df = bf[bf['CTYPE'].isna()]

In [212]:
df0 = df[['Text_transaction','Use','Contact','AccNum','Value_transaction']]
df1 = df[['Text_transaction','Use','Contact','AccNum','Value_transaction']]

In [290]:
tt = pd.DataFrame(list(product(df0.index.tolist(),df1.index.tolist()))).rename(columns={0:'Index 0',1:'Index 1'})
cols = list(set(df0.columns.tolist()).intersection(set(df1.columns.tolist())))
assert all([df0[c_col].dtype==df1[c_col].dtype for c_col in cols]),'Data Frames Error: Shared columns must have the same type'
for c_col in cols: tt['DIV '+c_col]=np.nan
tt = tt.drop(index=tt[tt['Index 0']==tt['Index 1']].index)

In [291]:
tt = tt.head(1000)

In [292]:
f = IntProgress(min=0, max=tt.shape[0]) 
display(f)

for k,v in tt.iterrows():
    for c_col in cols:
        x0 = df0.loc[v[0],c_col]
        x1 = df1.loc[v[1],c_col]
        if type(x0) is pd._libs.tslibs.timestamps.Timestamp:
            c_div = np.abs((x0-x1).days)
        elif type(x0) is str:
            c_div = float((100-fuzz.ratio(x0,x1))/100)
        elif type(x0) is np.float64:
            c_div = np.abs(np.linalg.norm(float(x0)-float(x1)))
        else: raise Exception('Column name does not match expected format.')
        tt.loc[k,'DIV '+c_col] = c_div
    f.value += 1
tt['DIV TOTAL'] = tt.apply(lambda x: sum(x[2:]),axis=1)

IntProgress(value=0, max=1000)

In [297]:
tt['Index 0 Use'] = tt.apply(lambda x: df0.loc[x[0],'Use'],axis=1)
tt['Index 1 Use'] = tt.apply(lambda x: df0.loc[x[1],'Use'],axis=1)

In [302]:
tt[tt['DIV Use']<0.1]

Unnamed: 0,Index 0,Index 1,DIV Use,DIV Text_transaction,DIV AccNum,DIV Contact,DIV Value_transaction,DIV TOTAL,Index 0 Use,Index 1 Use
81,1,93,0.09,0.0,0.0,0.0,0.0,0.09,Abrechnung 30 09 2019 siehe Anlage,Abrechnung 28 06 2019 siehe Anlage
202,1,236,0.09,0.0,0.0,0.0,0.0,0.09,Abrechnung 30 09 2019 siehe Anlage,Abrechnung 29 03 2019 siehe Anlage
327,2,31,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage
359,2,67,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage
382,2,94,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage
417,2,138,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage
443,2,170,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage
482,2,212,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage
503,2,237,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage
522,2,258,0.0,0.0,0.0,0.0,0.0,0.0,Entgeltabrechnung siehe Anlage,Entgeltabrechnung siehe Anlage


In [159]:
possible_matches = find_matches(
    df[['Text_transaction','Use','Contact','AccNum','Value_transaction']],
    df[['Text_transaction','Use','Contact','AccNum','Value_transaction']],
    THRESHOLD=0.05
)

In [160]:
L = []

for k in possible_matches['Index 0'].unique():
    s_t = set(possible_matches[possible_matches['Index 0']==k]['Index 1'].tolist()+[k])
    if not s_t in L: L.append(s_t)

L = [list(k) for k in L]
for k in L: k.sort(reverse=False)

In [181]:
possible_matches

Unnamed: 0,Index 0,Index 1
0,2,31
1,2,67
2,2,94
3,2,138
4,2,170
5,2,212
6,2,237
7,2,258
8,2,280
9,2,307


In [180]:
np.linalg.norm(2539.35 - 2485.77)/np.linalg.norm(2439.35)

0.021964867690163335

In [177]:
bf.loc[L[4]]

Unnamed: 0,Date_booked,Date_ordered,Text_transaction,Use,Contact,AccNum,Value_transaction,KEYID,SCA_id,OrderAccount,...,CustomerReference,CollectorReference,Amount0,Amount1,BIC,Information,Currency,CTYPE,EPARTYID,EPARTYNAME
9,2019-09-27,2019-09-27,LOHN GEHALT,Lohn Gehalt 05119720 201909 HR,Siemens AG,DE90700500000002055382,2539.35,4704501838598879565,,DE98773501100020557849,...,19091705119720144701G584K00,,,,BYLADEMMXXX,EUR,Umsatz gebucht,,,
36,2019-08-29,2019-08-29,LOHN GEHALT,Lohn Gehalt 05119720 201908 HR,Siemens AG,DE90700500000002055382,2539.35,1551398112489727100,,DE98773501100020557849,...,19081905119720144701G584K00,,,,BYLADEMMXXX,EUR,Umsatz gebucht,,,
142,2019-10-30,2019-10-30,LOHN GEHALT,Lohn Gehalt 05119720 201910 HR,Siemens AG,DE90700500000002055382,2539.35,-3418363818552286398,,DE98773501100020557849,...,19101805119720144701G584K00,,,,BYLADEMMXXX,EUR,Umsatz gebucht,,,


In [172]:
bf.loc[L[10]]

Unnamed: 0,Date_booked,Date_ordered,Text_transaction,Use,Contact,AccNum,Value_transaction,KEYID,SCA_id,OrderAccount,...,CustomerReference,CollectorReference,Amount0,Amount1,BIC,Information,Currency,CTYPE,EPARTYID,EPARTYNAME
70,2019-07-30,2019-07-30,LOHN GEHALT,Lohn Gehalt 05119720 201907 HR,Siemens AG,DE90700500000002055382,2485.77,5430006917667314882,,DE98773501100020557849,...,19071805119720144701G584K00,,,,BYLADEMMXXX,EUR,Umsatz gebucht,,,
175,2019-06-27,2019-06-27,LOHN GEHALT,Lohn Gehalt 05119720 201906 HR,Siemens AG,DE90700500000002055382,2485.77,1852275127733353555,,DE98773501100020557849,...,19061405119720144701G584K00,,,,BYLADEMMXXX,EUR,Umsatz gebucht,,,


In [59]:
bf.loc[L[8]]

Unnamed: 0,Date_booked,Date_ordered,Text_transaction,Use,Contact,AccNum,Value_transaction,KEYID,SCA_id,OrderAccount,...,CustomerReference,CollectorReference,Amount0,Amount1,BIC,Information,Currency,CTYPE,EPARTYID,EPARTYNAME
70,2019-07-30,2019-07-30,LOHN GEHALT,Lohn Gehalt 05119720 201907 HR,Siemens AG,DE90700500000002055382,2485.77,5430006917667314882,,DE98773501100020557849,...,19071805119720144701G584K00,,,,BYLADEMMXXX,EUR,Umsatz gebucht,,,
175,2019-06-27,2019-06-27,LOHN GEHALT,Lohn Gehalt 05119720 201906 HR,Siemens AG,DE90700500000002055382,2485.77,1852275127733353555,,DE98773501100020557849,...,19061405119720144701G584K00,,,,BYLADEMMXXX,EUR,Umsatz gebucht,,,


In [258]:
c_idx = 0

df.loc[L[c_idx]]

Unnamed: 0,Date_booked,Date_ordered,Text_transaction,Use,Contact,AccNum,Value_transaction,KEYID,OrderAccount,LenderID,MandateReference,CustomerReference,CollectorReference,Amount0,Amount1,BIC,Information,Currency,ETYPE
1,2019-09-30,2019-10-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,2679922454523302611,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
27,2019-08-30,2019-09-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,5148337244820961257,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
59,2019-07-31,2019-08-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,9058912538237109567,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
82,2019-06-28,2019-06-29,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,635424859654057062,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
120,2019-05-31,2019-06-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,-6001276598425130899,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
154,2019-10-31,2019-11-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,-8413364354163012794,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
180,2019-04-30,2019-05-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,-2150225761108379227,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
203,2019-03-29,2019-03-30,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,-739388074522152766,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
222,2019-02-28,2019-02-28,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,9060357475921488125,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,
241,2019-01-31,2019-02-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0,-4.5,-3450933643258495989,DE98773501100020557849,,,,,,,77350110,EUR,Umsatz gebucht,


In [117]:
# Determine discretionary monthly spending by all single transactions where oney was withdrawn

stra = bf[(bf['ETYPE']=='SINGLE')].reset_index(drop=True)

ds = stra[['Date_booked','Value_transaction']].copy()
ds['YYYY-MM'] = ds.apply(lambda x: x['Date_booked'].strftime('%Y-%m'),axis=1)

ds = ds.drop(columns=['Date_booked']).groupby(['YYYY-MM']).sum().rename(columns={
    'YYYY-MM':'Month',
    'Value_transaction':'WithdrawlsValue'
}).rename_axis(index={'YYYY-MM':'Month'})