# In [1]:
# Imports
import numpy as np
import datetime,os
import pandas as pd
from fuzzywuzzy import fuzz
import pyodbc
import re
from itertools import product

# Identify matches in df and STR
def find_matches(df, STR, THRESHOLD=0.01):
    """Find rows of ``df`` that (nearly) duplicate a row of ``STR``.

    Every row of ``df`` is compared with every row of ``STR``. Values are
    paired by column *position* (column names are not consulted) and the
    type of the ``df`` value selects how the divergence is measured:

    * ``pd.Timestamp`` -- absolute difference in whole days,
    * ``str``          -- ``(100 - fuzz.ratio(a, b)) / 100``,
    * float            -- absolute difference.

    A pair counts as a match when the summed divergence over all columns is
    strictly below ``THRESHOLD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate rows.
    STR : pandas.DataFrame
        Reference rows; needs at least as many columns as ``df``.
    THRESHOLD : float, optional
        Exclusive upper bound on the accumulated divergence.

    Returns
    -------
    pandas.DataFrame
        Columns ``'DF index'`` and ``'STR index'`` holding the index labels
        of each matching pair. Both columns exist even when nothing matches.

    Raises
    ------
    Exception
        If a value of ``df`` is neither a Timestamp, a string nor a float.
    IndexError
        If a row of ``STR`` has fewer values than a row of ``df``.
    """
    # Extract every row once, by position, and remember its index label.
    # Looking rows up positionally keeps this correct for frames whose index
    # is not 0..n-1, while the labels are what callers use to drop rows.
    df_rows = [(label, df.iloc[pos].tolist()) for pos, label in enumerate(df.index)]
    str_rows = [(label, STR.iloc[pos].tolist()) for pos, label in enumerate(STR.index)]

    # Iterate over all possible pairs
    l_duplicated = []
    for (df_label, x0), (str_label, x1) in product(df_rows, str_rows):
        # Accumulate divergence
        c_div = 0
        for kk, v0 in enumerate(x0):
            v1 = x1[kk]
            # Distinguish time stamps, strings and floats
            if isinstance(v0, pd.Timestamp):
                c_div += np.abs((v0 - v1).days)
            elif isinstance(v0, str):
                c_div += float((100 - fuzz.ratio(v0, v1)) / 100)
            elif isinstance(v0, (float, np.floating)):
                # Covers np.float64 as well as plain Python floats, which is
                # what object-typed rows hand back
                c_div += np.abs(float(v0) - float(v1))
            else:
                raise Exception('Column name does not match expected format.')
        # Compare to threshold
        if c_div < THRESHOLD:
            l_duplicated.append((df_label, str_label))
    # Build into a pandas data frame; naming the columns here keeps them
    # present for an empty result too
    matches = pd.DataFrame(l_duplicated, columns=['DF index', 'STR index'])
    return matches

# Read in new data
def read_newdata(dirpath):
    """Load every spreadsheet in ``dirpath`` and split it into DB-shaped tables.

    Each entry returned by ``os.listdir(dirpath)`` is read with
    ``pd.read_excel`` and the sheets are stacked. The two date columns are
    parsed with format ``%d.%m.%y``, runs of non-word characters in three
    text columns are collapsed to a single space, and a ``KEYID`` is derived
    per row and used to drop repeated rows.

    Parameters
    ----------
    dirpath : str
        Directory holding the export files. A trailing path separator is
        optional.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(bf, df, af)``: ``bf`` is the full cleaned table, ``df`` the core
        transaction columns and ``af`` the additional-information columns,
        both renamed to English. ``df`` and ``af`` share row order and
        ``KEYID``.
    """
    # Read new data; collect first and concatenate once instead of growing
    # a frame inside the loop
    frames = []
    for file in os.listdir(dirpath):
        tf = pd.read_excel(os.path.join(dirpath, file))
        if pd.api.types.is_integer_dtype(tf['Betrag']):
            # Integer amounts are taken to be cents.
            # NOTE(review): presumably the export lost its decimal separator
            # -- confirm. Dividing (rather than slicing the digit string)
            # also handles amounts below 10 cents and their negatives.
            tf['Betrag'] = tf['Betrag'] / 100
        frames.append(tf)
    bf = pd.concat(frames, sort=False) if frames else pd.DataFrame()
    # Transform new data
    bf['Buchungstag'] = pd.to_datetime(bf['Buchungstag'], format='%d.%m.%y')
    bf['Valutadatum'] = pd.to_datetime(bf['Valutadatum'], format='%d.%m.%y')
    # Clean string columns
    for colname in ['Buchungstext', 'Verwendungszweck', 'Beguenstigter/Zahlungspflichtiger']:
        bf[colname] = bf[colname].map(lambda x: re.sub(r'\W+', ' ', str(x)))
    # Key data and remove duplicates.
    # NOTE(review): hash() of a str is salted per interpreter process unless
    # PYTHONHASHSEED is fixed, so KEYID is only comparable within one run.
    # A hashlib digest would be stable, but changes the stored key format.
    key_columns = ['Auftragskonto', 'Buchungstag', 'Buchungstext', 'Verwendungszweck',
                   'Kontonummer/IBAN', 'BIC (SWIFT-Code)', 'Betrag']
    bf['KEYID'] = bf[key_columns].apply(
        lambda x: str(hash(str(x.values).replace('\'', '').replace('\n', '').replace('\t', '').replace(' ', ''))),
        axis=1)
    bf = bf[~bf['KEYID'].duplicated()].reset_index(drop=True)

    # Transform base data
    df = bf[['Buchungstag', 'Valutadatum', 'Buchungstext', 'Verwendungszweck',
             'Beguenstigter/Zahlungspflichtiger', 'Kontonummer/IBAN', 'Betrag', 'KEYID']].rename(columns={
        'Valutadatum': 'Date_ordered',
        'Buchungstag': 'Date_booked',
        'Buchungstext': 'Text_transaction',
        'Verwendungszweck': 'Use',
        'Beguenstigter/Zahlungspflichtiger': 'Contact',
        'Kontonummer/IBAN': 'AccNum',
        'Betrag': 'Value_transaction'
    }).reset_index(drop=True)
    # Transform added information
    af = bf[['Auftragskonto', 'Glaeubiger ID', 'Mandatsreferenz', 'Kundenreferenz (End-to-End)',
             'Sammlerreferenz', 'Lastschrift Ursprungsbetrag', 'Auslagenersatz Ruecklastschrift',
             'BIC (SWIFT-Code)', 'Waehrung', 'Info', 'KEYID']].rename(columns={
        'Auftragskonto': 'OrderAccount',
        'Glaeubiger ID': 'LenderID',
        'Mandatsreferenz': 'MandateReference',
        'Kundenreferenz (End-to-End)': 'CustomerReference',
        'Sammlerreferenz': 'CollectorReference',
        'Lastschrift Ursprungsbetrag': 'Amount0',
        'Auslagenersatz Ruecklastschrift': 'Amount1',
        'BIC (SWIFT-Code)': 'BIC',
        'Waehrung': 'Currency',
        'Info': 'Information'
    }).reset_index(drop=True)
    return bf, df, af

# Read in STR table
def read_STR(dbpath):
    """Read the whole ``STR`` table from the Access database at ``dbpath``.

    Parameters
    ----------
    dbpath : str
        Path to the ``.mdb``/``.accdb`` file.

    Returns
    -------
    pandas.DataFrame
        All rows of ``STR`` with ``Date_booked`` and ``Date_ordered``
        converted by ``pd.to_datetime`` and ``Value_transaction`` cast to
        float.
    """
    query = '''
    SELECT * FROM STR
    '''
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    try:
        # Read Single Transactions table
        STR = pd.read_sql_query(query, conn).reset_index(drop=True)
    finally:
        # Close connection even when the query fails
        conn.close()
    # Transform STR
    STR['Date_booked'] = pd.to_datetime(STR['Date_booked'])
    STR['Date_ordered'] = pd.to_datetime(STR['Date_ordered'])
    STR['Value_transaction'] = STR['Value_transaction'].astype('float')
    return STR

# Read in TRI table
def read_TRI(dbpath):
    """Read the whole ``TRINFORMATION`` table from the Access database.

    Parameters
    ----------
    dbpath : str
        Path to the ``.mdb``/``.accdb`` file.

    Returns
    -------
    pandas.DataFrame
        All rows of ``TRINFORMATION`` with a fresh 0..n-1 index.
    """
    query = '''
    SELECT * FROM TRINFORMATION
    '''
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    try:
        # Read Transaction Information table
        TRI = pd.read_sql_query(query, conn).reset_index(drop=True)
    finally:
        # Close connection even when the query fails
        conn.close()
    return TRI

# Main function
def __main__(newdatapath='..\\..\\..\\TF_data\\ADMIN\\01\\',
             dbpath='..\\..\\..\\..\\banking.accdb'):
    """Import new bank-export rows into the Access database.

    Reads the spreadsheets under ``newdatapath``, removes every row that
    ``find_matches`` reports as already present in the ``STR`` table, and
    appends what is left to ``STR`` and ``TRINFORMATION``.

    Parameters
    ----------
    newdatapath : str, optional
        Directory with the new export files.
    dbpath : str, optional
        Path to the Access database file.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(new_data, new_added_information)``: the rows written to ``STR``
        and to ``TRINFORMATION``, with missing values replaced by the string
        ``'nan'``.
    """
    # Read new input
    bf, df, af = read_newdata(newdatapath)
    # Read existing databases
    STR = read_STR(dbpath)
    TRI = read_TRI(dbpath)  # NOTE: not used further below
    # Find duplicates
    matches = find_matches(df, STR)
    # Isolate new data entries
    if matches.shape[0] == 0:
        new_data = df.copy()
        new_added_information = af.copy()
    else:
        known = matches['DF index'].tolist()
        new_data = df.drop(index=known).reset_index(drop=True)
        new_added_information = af.drop(index=known).reset_index(drop=True)
    # Fill nulls with something that makes sense in a DB
    new_data = new_data.fillna('nan')
    new_added_information = new_added_information.fillna('nan')
    # Write to STR, then to TRINFORMATION
    _insert_rows(dbpath, 'STR', new_data)
    _insert_rows(dbpath, 'TRINFORMATION', new_added_information)
    # Return
    return new_data, new_added_information


def _to_db_param(value):
    """Convert one cell into a plain Python object that pyodbc can bind."""
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    if isinstance(value, np.generic):
        # numpy scalars (e.g. np.int64) are not accepted as parameters
        return value.item()
    return value


def _insert_rows(dbpath, table, frame):
    """Append every row of ``frame``, by column position, to ``table``."""
    # One '?' per column: values travel as bound parameters, so quotes,
    # brackets or parentheses inside the data reach the DB unchanged instead
    # of being edited out of a hand-built SQL string.
    placeholders = ', '.join('?' * frame.shape[1])
    sql_query = 'INSERT INTO ' + table + ' VALUES (' + placeholders + ');'
    # Establish DB connection
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    try:
        cursor = conn.cursor()
        for _, row in frame.iterrows():
            cursor.execute(sql_query, [_to_db_param(v) for v in row.tolist()])
            # Commit row by row, so rows written before a failure persist
            conn.commit()
    finally:
        # Close connection even when an insert fails
        conn.close()

# Call main: the import runs as soon as this cell/module is executed; the two
# results are the rows that were newly written to STR and TRINFORMATION
new_entries_STR,new_entries_TRINFORMATION = __main__()