In [1]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
from numpy import random as rnd
import warnings,datetime,os,calendar,csv,time

import pickle,h5py,json

import tensorflow as tf
import pandas as pd
import seaborn as sns

from keras.models import Model,Sequential
from keras.layers import Dense,LSTM,Conv2D,Dropout,BatchNormalization,Input,Concatenate,Add,Activation,MaxPooling2D,AveragePooling2D
import keras.backend as K

from sklearn import preprocessing as pp
from sklearn.cluster import KMeans,DBSCAN,MeanShift,AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from statsmodels.tsa.seasonal import seasonal_decompose

from fuzzywuzzy import fuzz
from itertools import product

import ipywidgets as widgets
import pyodbc

# Apply seaborn's default plot theme to all following matplotlib figures.
sns.set()
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the imported libraries — confirm this is intended.
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def assemble_sql_write_query(table_name,row):
    """Build an ``INSERT INTO`` statement for a single row.

    Parameters
    ----------
    table_name : str
        Name of the target table. It is placed into the statement verbatim
        (no quoting or escaping is applied).
    row : pandas.Series
        Row to write. Each index label becomes a column name wrapped in
        square brackets; each value is rendered with ``str()`` and wrapped
        in single quotes.

    Returns
    -------
    str
        Statement of the form ``INSERT INTO t([a],[b]) VALUES('1','2')``.

    Notes
    -----
    Single quotes inside a value are doubled (``'`` -> ``''``) so that a
    value such as ``O'Brien`` cannot terminate the string literal early.
    Table and column names are still concatenated as-is and must come from
    trusted code.
    """
    labels = list(row.index)
    # Column list: [col1],[col2],...
    column_sql = ','.join('['+str(colname)+']' for colname in labels)
    # Value list: every value as a quoted string with embedded quotes doubled.
    value_sql = ','.join(
        "'"+str(row[colname]).replace("'","''")+"'" for colname in labels
    )
    return 'INSERT INTO '+table_name+'('+column_sql+') VALUES('+value_sql+')'

In [3]:
def pandas_to_access_clean_write(conn,df,table_name,verbose=False):
    """Replace the contents of a database table with the rows of a data frame.

    All existing rows of ``table_name`` are deleted and every row of ``df``
    is inserted using the statement built by ``assemble_sql_write_query``.

    Parameters
    ----------
    conn : DB-API connection
        Open connection providing ``cursor()``, ``commit()`` and ``rollback()``.
    df : pandas.DataFrame
        Rows to write; the column labels are used as table column names.
    table_name : str
        Name of the table to overwrite.
    verbose : bool, optional
        If True, print each INSERT statement before it is executed.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Any error raised by the driver is re-raised after ``conn.rollback()``
        has been called.
    """
    cursor = conn.cursor()
    try:
        # Delete and inserts share one transaction: a failing INSERT must not
        # leave the table emptied or only partially refilled.
        cursor.execute('DELETE FROM '+table_name)
        for _,row in df.iterrows():
            sql_write_query = assemble_sql_write_query(table_name,row)
            if verbose: print(sql_write_query+'\n\n')
            cursor.execute(sql_write_query)
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        # Release the cursor on success and on failure alike.
        cursor.close()

    return

In [4]:
def access_clear_table(conn,table_name):
    """Delete every row of a database table and commit the change.

    Parameters
    ----------
    conn : DB-API connection
        Open connection providing ``cursor()`` and ``commit()``.
    table_name : str
        Name of the table to empty; concatenated into the statement verbatim.

    Returns
    -------
    None
    """
    cursor = conn.cursor()
    try:
        cursor.execute('DELETE FROM '+table_name)
        conn.commit()
    finally:
        # Close the cursor even when the DELETE or the commit raises.
        cursor.close()

    return

In [5]:
def find_row_in_STR(df,of,df_idx,THRESHOLD=0.05):
    """Find the rows of ``of`` that match one row of ``df`` within a tolerance.

    The row ``df.loc[df_idx]`` is compared column by column with every row of
    ``of`` (restricted to the columns of ``df``). The per-column distance is

    * datetime columns: absolute difference in whole days,
    * text (object/string) columns: ``(100 - fuzz.ratio(a, b)) / 100``,
    * float columns: absolute difference.

    A row of ``of`` is kept only if no column distance exceeds ``THRESHOLD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the reference row.
    of : pandas.DataFrame
        Frame to search; must contain every column of ``df`` with equal dtype.
    df_idx : label
        Index label of the reference row in ``df``.
    THRESHOLD : float, optional
        Largest per-column distance still counted as a match.

    Returns
    -------
    list
        One ``[of_index_label, [distance_per_column, ...]]`` entry per
        matching row of ``of``.

    Raises
    ------
    AssertionError
        If ``df_idx`` is missing, columns are missing in ``of`` or dtypes differ.
    Exception
        If a compared column is neither datetime, text nor float.
    """
    assert df_idx in df.index,'Index not in data frame.'
    assert set(df.columns).issubset(set(of.columns)),'Some required columns were not found in table file.'
    of = of[df.columns.tolist()]
    assert all([df[c_col].dtype==of[c_col].dtype for c_col in of.columns]),'Found non-matching column types between comparison data frames.'

    # Loop-invariant work: classify each column once and fetch the reference
    # values once. `.loc` is the label-based replacement for the removed `.ix`.
    col_kind = {}
    target = {}
    for c_col in of.columns:
        c_dtype = df[c_col].dtype
        if pd.api.types.is_datetime64_any_dtype(c_dtype):
            col_kind[c_col] = 'date'
        elif pd.api.types.is_object_dtype(c_dtype) or pd.api.types.is_string_dtype(c_dtype):
            col_kind[c_col] = 'text'
        elif pd.api.types.is_float_dtype(c_dtype):
            col_kind[c_col] = 'number'
        else:
            # Unsupported dtype: the error is raised only when the column is
            # actually reached during a comparison, as before.
            col_kind[c_col] = None
        target[c_col] = df.loc[df_idx,c_col]

    rl = []
    for c_idx in of.index:
        tl = []
        exceeds_threshold = False
        for c_col in of.columns:
            kind = col_kind[c_col]
            candidate = of.loc[c_idx,c_col]
            if kind=='date':
                c_div = np.abs((target[c_col]-candidate).days)
            elif kind=='text':
                c_div = float((100-fuzz.ratio(str(target[c_col]),str(candidate)))/100)
            elif kind=='number':
                c_div = np.abs(target[c_col]-candidate)
            else:
                raise Exception('Column name does not match expected format.')
            # NOTE(review): a NaN distance (e.g. from a missing float value)
            # compares False here and therefore counts as within tolerance.
            if c_div>THRESHOLD:
                exceeds_threshold = True
                break
            tl.append(c_div)
        if not exceeds_threshold: rl.append([c_idx,tl])
    return rl

In [6]:
def DF_STR_find_matches(df,STR):
    """Map every row of ``df`` to its matching row in ``STR``.

    Each index label of ``df`` is looked up with ``find_row_in_STR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to look up.
    STR : pandas.DataFrame
        Table to search; its index labels must be convertible with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'DF index'`` with one column ``'STR index'`` holding the
        matching ``STR`` index label, or ``-1`` when no match was found.
        An empty ``df`` yields an empty frame with the same layout.

    Raises
    ------
    Exception
        If more than one row of ``STR`` matches a row of ``df``.
    """
    matches = []
    for df_idx in df.index:
        res = find_row_in_STR(df,STR,df_idx)
        if len(res)==0:
            matches.append([df_idx,int(-1)])
        elif len(res)==1:
            matches.append([df_idx,int(res[0][0])])
        else: raise Exception('Multiple instances of row detected, check database.')
    # Name the columns at construction time: with no rows the frame would
    # otherwise have no columns and set_index('DF index') would raise KeyError.
    matches = (
        pd.DataFrame(matches,columns=['DF index','STR index'])
        .astype({'STR index':'int64'})
        .set_index('DF index')
    )
    return matches

In [7]:
def read_newdata(dirpath):
    """Read all Excel files of a directory and split them into three frames.

    Every entry of ``dirpath`` is read with ``pandas.read_excel`` (in sorted
    file-name order) and the results are stacked into one frame with a fresh
    0..n-1 index, so index labels stay unique across files.

    Parameters
    ----------
    dirpath : str
        Directory containing the export files. A trailing path separator is
        accepted but not required.

    Returns
    -------
    bf : pandas.DataFrame
        All columns as read, with 'Buchungstag' and 'Valutadatum' parsed as
        dates using the format ``%d.%m.%y``.
    df : pandas.DataFrame
        Core transaction columns renamed to Date_booked, Date_ordered,
        Text_transaction, Use, Contact, AccNum and Value_transaction;
        index named 'STR_id'.
    af : pandas.DataFrame
        Additional-information columns renamed to OrderAccount, LenderID,
        MandateReference, CustomerReference, CollectorReference, Amount0,
        Amount1, BIC, Currency and Information; index named 'STR_id'.

    Raises
    ------
    KeyError
        If an expected column is missing (this includes an empty directory).
    """

    # Read new data: collect the per-file frames and concatenate once.
    # sorted() makes the row order independent of the order os.listdir returns.
    frames = [
        pd.read_excel(os.path.join(dirpath,file))
        for file in sorted(os.listdir(dirpath))
    ]
    # ignore_index avoids repeated labels 0,1,2,... from each file, which
    # would make label-based row lookups on the result ambiguous.
    bf = pd.concat(frames,ignore_index=True) if frames else pd.DataFrame()

    # Transform new data
    bf['Buchungstag'] = pd.to_datetime(bf['Buchungstag'],format='%d.%m.%y')
    bf['Valutadatum'] = pd.to_datetime(bf['Valutadatum'],format='%d.%m.%y')

    # Transform base data
    df = bf[['Buchungstag','Valutadatum','Buchungstext','Verwendungszweck','Beguenstigter/Zahlungspflichtiger','Kontonummer/IBAN','Betrag']].rename(columns={
        'Valutadatum':'Date_ordered', 
        'Buchungstag':'Date_booked', 
        'Buchungstext':'Text_transaction', 
        'Verwendungszweck':'Use',
        'Beguenstigter/Zahlungspflichtiger':'Contact', 
        'Kontonummer/IBAN':'AccNum', 
        'Betrag':'Value_transaction'
    })
    df.index = df.index.rename('STR_id')

    # Transform added information
    af = bf[['Auftragskonto','Glaeubiger ID','Mandatsreferenz','Kundenreferenz (End-to-End)','Sammlerreferenz','Lastschrift Ursprungsbetrag','Auslagenersatz Ruecklastschrift','BIC (SWIFT-Code)','Waehrung','Info']].rename(columns={
        'Auftragskonto':'OrderAccount', 
        'Glaeubiger ID':'LenderID', 
        'Mandatsreferenz':'MandateReference',
        'Kundenreferenz (End-to-End)':'CustomerReference', 
        'Sammlerreferenz':'CollectorReference',
        'Lastschrift Ursprungsbetrag':'Amount0', 
        'Auslagenersatz Ruecklastschrift':'Amount1',
        'BIC (SWIFT-Code)':'BIC', 
        'Waehrung':'Currency', 
        'Info':'Information'
    })
    af.index = af.index.rename('STR_id')
    
    return bf,df,af

In [8]:
# Load the bank export files from the data directory (relative path written with Windows separators):
# bf = all columns as read, df = core transaction columns, af = additional-information columns.
bf,df,af = read_newdata('..\\..\\..\\TF_data\\ADMIN\\01\\')

In [10]:
# TODO: wrap this cell in a read_STR() function that returns STR.

# Connect to Access DB
conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=C:\Users\Jan\Documents\banking.accdb;')
try:
    # Read Single Transactions table.
    # set_index('STR_id') followed by reset_index(drop=True) discards the
    # STR_id column and leaves a fresh 0..n-1 index.
    STR = pd.read_sql_query(
'''
SELECT * FROM STR
'''
    ,conn).set_index('STR_id').reset_index(drop=True)
finally:
    # Close connection, also when the query raises
    conn.close()

# Transform STR
STR['Date_booked'] = pd.to_datetime(STR['Date_booked'])
STR['Date_ordered'] = pd.to_datetime(STR['Date_ordered'])
STR['Value_transaction'] = STR['Value_transaction'].astype('float')

In [11]:
# Display the STR table loaded from the database for visual inspection.
STR

Unnamed: 0,Date_booked,Date_ordered,Text_transaction,Use,Contact,AccNum,Value_transaction,SCA_id
0,2019-10-02,2019-10-01,BARGELDAUSZAHLUNG,2019-10-01T22:08 Debitk.5 2021-12,DECHSENDOR//SPARKASSE ERLANGEN/DE,DE63763500009000481424,-50.00,
1,2019-09-30,2019-10-01,ABSCHLUSS,Abrechnung 30.09.2019 siehe Anlage,,0020557849,0.00,
2,2019-09-30,2019-10-01,ENTGELTABSCHLUSS,Entgeltabrechnung siehe Anlage,,0000000000,-4.50,
3,2019-09-30,2019-09-30,DAUERAUFTRAG,"Miete fuer Haesschen, du bist toll :)",Elaine Fernandez,DE46763500000044116613,-235.00,
4,2019-09-30,2019-09-30,FOLGELASTSCHRIFT,V-Nr. 47589 09-19 Beitrag 24.90,SLF Sportland Franken GmbH & Co. KG,DE88763500000000062725,-24.90,
5,2019-09-30,2019-09-30,FOLGELASTSCHRIFT,300919 15194815 610971995391 SIEMENS CASINO CA...,Siemens Aktiengesellschaft,DE53700700100203008800,-66.89,
6,2019-09-30,2019-09-30,FOLGELASTSCHRIFT,Mobilfunk Kundenkonto 0051900308 RG 2922363700...,Telekom Deutschland GmbH ...,DE68700202700667302269,-64.90,
7,2019-09-30,2019-09-30,FOLGELASTSCHRIFT,"VST 458207, 62,00 EUR A19-656666",ESTW-ERLANGERSTADTWERKEAG,DE06763200720004536703,-62.00,
8,2019-09-30,2019-09-30,FOLGELASTSCHRIFT,BEITRAG BIS 09/19 MANDATSREF 4005172401 GLAEUB...,IGM Erlangen,DE28500500000083044008,-2.05,
9,2019-09-27,2019-09-27,LOHN GEHALT,"Lohn/Gehalt,05119720/201909/HR",Siemens AG,DE90700500000002055382,2539.35,


In [126]:
# Detect and remove existing entries
matches = DF_STR_find_matches(df,STR)
new_data = df.ix[matches[matches['STR index']<0].index]
new_data['SCA_id'] = 'None'

# Make new STR file
STR_new = pd.concat([STR,new_data]).reset_index(drop=True)

In [None]:
# NOTE(review): `z` is not assigned in any cell visible here, so running this cell presumably
# raises NameError — confirm whether it is a leftover or a deliberate stop for "Run All".
z

In [119]:
# Connect to Access DB
conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=C:\Users\Jan\Documents\banking.accdb;')

# Read Transaction Information table
TRI = pd.read_sql_query(
'''
SELECT * FROM TRINFORMATION
'''
,conn).set_index('STR_id').reset_index(drop=True)

# Close connection
conn.close()