In [131]:
# Imports
import numpy as np
import datetime,os
import pandas as pd
from fuzzywuzzy import fuzz
import pyodbc
import re
from itertools import product

# Identify matches in df and STR
def find_matches(df0,df1,THRESHOLD=0.05):
    # Iterate over all possible pairs
    l_duplicated = []
    for k in product(df0.index.tolist(),df1.index.tolist()):
        if k[0]==k[1]: continue
        # Single out corresponding data rows
        x0 = df0.iloc[k[0]].tolist()
        x1 = df1.iloc[k[1]].tolist()
        # Accumulate divergence
        c_div = 0
        for kk in range(len(x0)):
            # Distinguish time stamps, strings and floats
            if type(x0[kk]) is pd._libs.tslibs.timestamps.Timestamp:
                c_div += np.abs((x0[kk]-x1[kk]).days)
            elif type(x0[kk]) is str:
                c_div += float((100-fuzz.ratio(x0[kk],x1[kk]))/100)
            elif type(x0[kk]) is np.float64:
                c_div += np.abs(float(x0[kk])-float(x1[kk]))
            else:
                raise Exception('Column name does not match expected format.')
        # Compare to threshold
        if c_div<THRESHOLD: l_duplicated.append(k)
    # Build into a pandas data frame
    matches = pd.DataFrame(l_duplicated).rename(columns={0:'DF index',1:'STR index'})
    return matches

# Read in STR table
def read_STR(dbpath):
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    # Read Single Transactions table
    STR = pd.read_sql_query(
    '''
    SELECT * FROM STR
    '''
    ,conn).reset_index(drop=True)
    # Close connection
    conn.close()
    # Transform STR
    STR['Date_booked'] = pd.to_datetime(STR['Date_booked'])
    STR['Date_ordered'] = pd.to_datetime(STR['Date_ordered'])
    STR['Value_transaction'] = STR['Value_transaction'].astype('float')
    return STR

# Read in TRI table
def read_TRI(dbpath):
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    # Read Transaction Information table
    TRI = pd.read_sql_query(
    '''
    SELECT * FROM TRINFORMATION
    '''
    ,conn).reset_index(drop=True)
    # Close connection
    conn.close()
    return TRI

# Compare frames and determine similarities
def find_sims(t0,t1):
    # Make sure columns are the same in both input frames
    assert set(t0.columns.tolist())==set(t1.columns.tolist()), 'Columns in input frames are not the same'
    # Create renaming doctionary
    c_map = dict()
    for k in range(2,t0.columns.shape[0]+2):
        c_map[k] = t0.columns.tolist()[k-2]
    # Iterate over all indices
    sim = []
    for k in product(t0.index.tolist(),t1.index.tolist()):
        if k[0]==k[1]: continue
        x0 = t0.iloc[k[0]].tolist()
        x1 = t1.iloc[k[1]].tolist()
        c_div = [k[0],k[1]]
        for kk in range(len(x0)):
            if type(x0[kk]) is pd._libs.tslibs.timestamps.Timestamp: c_div.append(np.abs((x0[kk]-x1[kk]).days))
            elif type(x0[kk]) is str: c_div.append(float((100-fuzz.ratio(x0[kk],x1[kk]))/100))
            elif type(x0[kk]) is np.float64: c_div.append(np.abs(float(x0[kk])-float(x1[kk])))
            else: raise Exception('Column name does not match expected format.')
        sim.append(c_div)
    # Build output frame
    rf = pd.DataFrame(sim).rename(columns={
        0:'T0 index',
        1:'T1 index'
    }).rename(columns=c_map)
    rf['Total Divergence'] = rf.drop(columns=['T0 index','T1 index']).sum(axis=1)
    return rf

In [132]:
dbpath = '..\\..\\..\\..\\banking.accdb'

STR = read_STR(dbpath)
TRI = read_TRI(dbpath)

bf = pd.merge(STR,TRI,on='KEYID',how='left')

In [133]:
df0 = bf[bf['Text_transaction']=='BARGELDAUSZAHLUNG'].reset_index(drop=True)
df1 = bf[~(bf['Text_transaction']=='BARGELDAUSZAHLUNG')].reset_index(drop=True)

In [134]:
possible_matches = find_matches(
    df1[['Text_transaction','Use','Contact','AccNum','Value_transaction']],
    df1[['Text_transaction','Use','Contact','AccNum','Value_transaction']]
)

In [135]:
tt = df1.iloc[possible_matches[possible_matches['DF index']==2]['STR index'].tolist()].drop(columns=['Date_booked','Date_ordered','KEYID'])

In [138]:
find_sims(tt,tt)

IndexError: single positional indexer is out-of-bounds

In [140]:
for k in product(tt.index.tolist(),tt.index.tolist()):
    print(k)

(33, 33)
(33, 64)
(33, 78)
(33, 125)
(33, 160)
(33, 187)
(33, 204)
(33, 223)
(33, 247)
(33, 265)
(33, 289)
(64, 33)
(64, 64)
(64, 78)
(64, 125)
(64, 160)
(64, 187)
(64, 204)
(64, 223)
(64, 247)
(64, 265)
(64, 289)
(78, 33)
(78, 64)
(78, 78)
(78, 125)
(78, 160)
(78, 187)
(78, 204)
(78, 223)
(78, 247)
(78, 265)
(78, 289)
(125, 33)
(125, 64)
(125, 78)
(125, 125)
(125, 160)
(125, 187)
(125, 204)
(125, 223)
(125, 247)
(125, 265)
(125, 289)
(160, 33)
(160, 64)
(160, 78)
(160, 125)
(160, 160)
(160, 187)
(160, 204)
(160, 223)
(160, 247)
(160, 265)
(160, 289)
(187, 33)
(187, 64)
(187, 78)
(187, 125)
(187, 160)
(187, 187)
(187, 204)
(187, 223)
(187, 247)
(187, 265)
(187, 289)
(204, 33)
(204, 64)
(204, 78)
(204, 125)
(204, 160)
(204, 187)
(204, 204)
(204, 223)
(204, 247)
(204, 265)
(204, 289)
(223, 33)
(223, 64)
(223, 78)
(223, 125)
(223, 160)
(223, 187)
(223, 204)
(223, 223)
(223, 247)
(223, 265)
(223, 289)
(247, 33)
(247, 64)
(247, 78)
(247, 125)
(247, 160)
(247, 187)
(247, 204)
(247, 223)
(247