In [1]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
from numpy import random as rnd
import warnings,datetime,os,calendar,csv,time

import pickle,h5py,json

import tensorflow as tf
import pandas as pd
import seaborn as sns

from keras.models import Model,Sequential
from keras.layers import Dense,LSTM,Conv2D,Dropout,BatchNormalization,Input,Concatenate,Add,Activation,MaxPooling2D,AveragePooling2D
import keras.backend as K

from sklearn import preprocessing as pp
from sklearn.cluster import KMeans,DBSCAN,MeanShift,AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from statsmodels.tsa.seasonal import seasonal_decompose

from fuzzywuzzy import fuzz
from itertools import product

import ipywidgets as widgets
import pyodbc

# Apply seaborn's default theme to the plots drawn later in the notebook.
sns.set()
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/keras — confirm that is intended.
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def assemble_sql_write_query(table_name,row):
    """Build an ``INSERT INTO`` statement for a single row as a plain SQL string.

    Parameters
    ----------
    table_name : str
        Name of the target table; placed into the statement verbatim.
    row : pandas.Series or compatible
        Any object exposing ``row.index`` (iterable of column names) and
        ``row[colname]`` (the value stored for that column).

    Returns
    -------
    str
        ``INSERT INTO <table>([c1],[c2],...) VALUES('v1','v2',...)``.
        Every value is converted with ``str`` and wrapped in single quotes.

    Raises
    ------
    ValueError
        If ``row`` has no columns (no valid statement can be built).

    Notes
    -----
    Single quotes inside a value are doubled so that the value cannot terminate
    the SQL string literal early. Passing values as query parameters to
    ``cursor.execute`` would be safer still, but this function's contract is to
    return the finished statement as text.
    """
    colnames = list(row.index)
    if not colnames:
        raise ValueError('Cannot build an INSERT statement for a row without columns.')

    columns_sql = ','.join('['+str(colname)+']' for colname in colnames)
    # Double embedded single quotes: an unescaped quote would end the literal.
    values_sql = ','.join(
        '\''+str(row[colname]).replace('\'','\'\'')+'\'' for colname in colnames
    )
    return 'INSERT INTO '+table_name+'('+columns_sql+') VALUES('+values_sql+')'

In [3]:
def pandas_to_access_clean_write(conn,df,table_name,verbose=False):
    """Replace the contents of a table with the rows of a data frame.

    All existing rows of ``table_name`` are deleted (and that deletion is
    committed) before the rows of ``df`` are inserted one by one, each insert
    being committed individually.

    Parameters
    ----------
    conn
        Open database connection providing ``cursor()`` and ``commit()``.
    df : pandas.DataFrame
        Rows to write; its column names become the target column names.
    table_name : str
        Name of the table to clear and refill.
    verbose : bool, optional
        When true, print every INSERT statement before executing it.

    Returns
    -------
    None

    Notes
    -----
    The cursor is closed even when a statement fails. The delete is committed
    before the inserts, so a failing insert leaves the table partially filled.
    """
    cursor = conn.cursor()
    try:
        sql_delete_query = 'DELETE FROM '+table_name
        cursor.execute(sql_delete_query)
        conn.commit()

        for _,row in df.iterrows():
            sql_write_query = assemble_sql_write_query(table_name,row)
            if verbose: print(sql_write_query+'\n\n')
            cursor.execute(sql_write_query)
            conn.commit()
    finally:
        # Release the cursor on success and on failure alike.
        cursor.close()

    return

In [4]:
def access_clear_table(conn,table_name):
    """Delete every row of ``table_name`` and commit the deletion.

    Parameters
    ----------
    conn
        Open database connection providing ``cursor()`` and ``commit()``.
    table_name : str
        Name of the table to empty; placed into the statement verbatim.

    Returns
    -------
    None

    Notes
    -----
    The cursor is closed even when the DELETE statement or the commit fails.
    """
    cursor = conn.cursor()
    try:
        sql_delete_query = 'DELETE FROM '+table_name
        cursor.execute(sql_delete_query)
        conn.commit()
    finally:
        # Release the cursor on success and on failure alike.
        cursor.close()

    return

In [26]:
# Connect to Access DB
# NOTE(review): the database path is an absolute, user-specific location; consider
# moving it into a configuration cell so the notebook runs on other machines.
conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ=C:\Users\Jan\Documents\banking.accdb;')

# Each read moves the id column into the index and then discards that index, so
# the resulting frame has a fresh 0..n-1 index and no id column.
try:
    # Read Main Customer Accounts table
    MCA = pd.read_sql_query(
'''
SELECT * FROM MCA
'''
    ,conn).set_index('MCA_id').reset_index(drop=True)
    # Read Single Customer Accounts table
    SCA = pd.read_sql_query(
'''
SELECT * FROM SCA
'''
    ,conn).set_index('SCA_id').reset_index(drop=True)
    # Read Single Transactions table
    STR = pd.read_sql_query(
'''
SELECT * FROM STR
'''
    ,conn).set_index('STR_id').reset_index(drop=True)
    # Read Transaction Information table
    TRI = pd.read_sql_query(
'''
SELECT * FROM TRINFORMATION
'''
    ,conn).set_index('STR_id').reset_index(drop=True)
finally:
    # Close connection, also when one of the reads above fails
    conn.close()

In [27]:
# Parse both date columns as datetimes and store the transaction value as float.
STR = STR.assign(
    Date_booked=pd.to_datetime(STR['Date_booked']),
    Date_ordered=pd.to_datetime(STR['Date_ordered']),
    Value_transaction=STR['Value_transaction'].astype('float'),
)

In [7]:
# Load the spreadsheet export.
# (An .xls variant of the same file was used previously: '..\\..\\TF_data\\ADMIN\\01\\20191003.xls')
bf = pd.read_excel('..\\..\\TF_data\\ADMIN\\01\\20191003.xlsx')

# Both date columns are given as day.month.two-digit-year text.
bf = bf.assign(
    Buchungstag=pd.to_datetime(bf['Buchungstag'],format='%d.%m.%y'),
    Valutadatum=pd.to_datetime(bf['Valutadatum'],format='%d.%m.%y'),
)

In [8]:
# Pick the transaction columns from the export ...
df = bf.loc[:, [
    'Buchungstag',                        # -> Date_booked
    'Valutadatum',                        # -> Date_ordered
    'Buchungstext',                       # -> Text_transaction
    'Verwendungszweck',                   # -> Use
    'Beguenstigter/Zahlungspflichtiger',  # -> Contact
    'Kontonummer/IBAN',                   # -> AccNum
    'Betrag',                             # -> Value_transaction
]]
# ... and relabel them position by position (same order as the selection above).
df.columns = [
    'Date_booked',
    'Date_ordered',
    'Text_transaction',
    'Use',
    'Contact',
    'AccNum',
    'Value_transaction',
]
# Name the index after the transaction id.
df.index = df.index.rename('STR_id')

In [9]:
# Pick the additional-information columns from the export ...
af = bf.loc[:, [
    'Auftragskonto',                    # -> OrderAccount
    'Glaeubiger ID',                    # -> LenderID
    'Mandatsreferenz',                  # -> MandateReference
    'Kundenreferenz (End-to-End)',      # -> CustomerReference
    'Sammlerreferenz',                  # -> CollectorReference
    'Lastschrift Ursprungsbetrag',      # -> Amount0
    'Auslagenersatz Ruecklastschrift',  # -> Amount1
    'BIC (SWIFT-Code)',                 # -> BIC
    'Waehrung',                         # -> Currency
    'Info',                             # -> Information
]]
# ... and relabel them position by position (same order as the selection above).
af.columns = [
    'OrderAccount',
    'LenderID',
    'MandateReference',
    'CustomerReference',
    'CollectorReference',
    'Amount0',
    'Amount1',
    'BIC',
    'Currency',
    'Information',
]
# Name the index after the transaction id.
af.index = af.index.rename('STR_id')

In [None]:
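# Frames read from the Access database: MCA, SCA, STR, TRI

# Frames built from the Excel export: bf (as loaded), af and df (renamed column subsets of bf)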

In [131]:
def find_row_in_STR(df,of,df_idx,THRESHOLD=0.05):
    """Search ``of`` for rows that match one row of ``df``.

    For every row of ``of`` a per-column divergence from the probe row
    ``df.loc[df_idx]`` is computed:

    * datetime columns: absolute difference in whole days,
    * text (object/string) columns: ``(100 - fuzz.ratio(a, b)) / 100``,
    * float columns: absolute difference.

    A row counts as a match when the sum of its divergences is below
    ``THRESHOLD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the probe row; its columns define what is compared.
    of : pandas.DataFrame
        Frame to search; must contain all columns of ``df`` with equal dtypes.
    df_idx
        Index label of the probe row in ``df``.
    THRESHOLD : float, optional
        Upper bound (exclusive) on the summed divergence of a matching row.

    Returns
    -------
    row_found : list or int
        Index labels of the matching rows of ``of``, or ``-1`` if none match.
    divergence_found : list or int
        Summed divergence of each matching row, or ``-1`` if none match.
    rf : pandas.DataFrame
        Divergence table, indexed like ``of`` with one column per column of ``df``.

    Raises
    ------
    AssertionError
        If ``df_idx`` is not in ``df.index``, a column of ``df`` is missing from
        ``of``, or a compared column has different dtypes in the two frames.
    Exception
        If a compared column is neither datetime, text nor float.
    """
    assert df_idx in df.index,'Index not in data frame.'
    assert set(df.columns).issubset(set(of.columns)),'Some required columns were not found in table file.'
    columns = df.columns.tolist()
    of = of[columns]

    # The probe row is the same for every candidate row, so read its cells once.
    wanted = {c_col: df.loc[df_idx,c_col] for c_col in columns}

    rl = []
    for c_idx in of.index:
        tl = []
        for c_col in columns:
            dtype = df[c_col].dtype
            assert dtype==of[c_col].dtype,'Found non-matching column types between comparison data frames.'
            have = of.loc[c_idx,c_col]
            if pd.api.types.is_datetime64_any_dtype(dtype):
                tl.append(np.abs((wanted[c_col]-have).days))
                continue
            if dtype==object or pd.api.types.is_string_dtype(dtype):
                tl.append((100-fuzz.ratio(str(wanted[c_col]),str(have)))/100)
                continue
            if pd.api.types.is_float_dtype(dtype):
                tl.append(np.abs(wanted[c_col]-have))
                continue
            raise Exception('Column name does not match expected format.')
        rl.append(tl)

    # Label the divergence columns from the compared columns themselves, so the
    # function works for any number of columns.
    rf = pd.DataFrame(rl,index=of.index,columns=columns)

    row_found = -1
    divergence_found = -1
    total = rf.sum(axis=1)
    below = total<THRESHOLD
    # Use the caller's THRESHOLD for the gate as well as for the selection.
    if below.any():
        row_found = total.index[below].tolist()
        divergence_found = total.loc[row_found].tolist()
    return row_found,divergence_found,rf

In [137]:
for c_index in df.index:
    # TODO: the loop body is missing from this cell. `pass` only keeps the
    # notebook runnable top to bottom until the per-row step is restored.
    pass

RangeIndex(start=0, stop=161, step=1, name='STR_id')

In [135]:
# Display the divergence table.
# NOTE(review): `rf` is presumably the third value returned by find_row_in_STR;
# the cell that assigns it is not part of this notebook as saved — confirm.
rf

Unnamed: 0,Date_booked,Date_ordered,Text_transaction,Use,Contact,AccNum,Value_transaction
0,0,0,0.00,0.00,0.00,0.00,0.00
1,2,0,0.62,0.77,1.00,0.69,50.00
2,2,0,0.52,0.85,1.00,0.56,45.50
3,2,1,0.59,0.81,0.92,0.32,185.00
4,2,1,0.64,0.64,0.88,0.36,25.10
5,2,1,0.64,0.77,0.90,0.45,16.89
6,2,1,0.64,0.69,0.93,0.50,14.90
7,2,1,0.64,0.73,0.59,0.45,12.00
8,2,1,0.64,0.85,0.91,0.41,47.95
9,5,4,0.72,0.72,0.81,0.41,2589.35
