In [1]:
# Imports
import numpy as np
import datetime,os
import pandas as pd
from fuzzywuzzy import fuzz
import pyodbc
import re
from itertools import product
from ipywidgets import IntProgress
from IPython.display import display

# Read in STR table
def read_STR(dbpath):
    """Load the ``STR`` (single transactions) table from an Access database.

    Parameters
    ----------
    dbpath : str
        Path to the ``.mdb``/``.accdb`` file. It is concatenated verbatim into
        the ODBC connection string as the ``DBQ`` value.

    Returns
    -------
    pandas.DataFrame
        Every row and column of ``STR`` with a fresh positional index.
        ``Date_booked`` and ``Date_ordered`` are passed through
        ``pd.to_datetime`` and ``Value_transaction`` is cast to float.

    Raises
    ------
    Whatever ``pyodbc.connect`` / ``pd.read_sql_query`` raise; the connection
    is closed before the exception propagates.
    """
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    # Query text for the Single Transactions table
    query = '''
    SELECT * FROM STR
    '''
    try:
        STR = pd.read_sql_query(query, conn).reset_index(drop=True)
    finally:
        # Release the ODBC handle even when the query fails, so a bad read
        # does not leave the Access file locked by a dangling connection.
        conn.close()
    # Transform STR: normalise the two date columns and the amount column
    STR['Date_booked'] = pd.to_datetime(STR['Date_booked'])
    STR['Date_ordered'] = pd.to_datetime(STR['Date_ordered'])
    STR['Value_transaction'] = STR['Value_transaction'].astype('float')
    return STR

# Read in TRI table
def read_TRI(dbpath):
    """Load the ``TRINFORMATION`` (transaction information) table from Access.

    Parameters
    ----------
    dbpath : str
        Path to the ``.mdb``/``.accdb`` file. It is concatenated verbatim into
        the ODBC connection string as the ``DBQ`` value.

    Returns
    -------
    pandas.DataFrame
        Every row and column of ``TRINFORMATION`` with a fresh positional
        index; no column is converted.

    Raises
    ------
    Whatever ``pyodbc.connect`` / ``pd.read_sql_query`` raise; the connection
    is closed before the exception propagates.
    """
    # Connect to Access DB
    conn = pyodbc.connect(r'Driver={Microsoft Access Driver (*.mdb, *.accdb)};DBQ='+dbpath+';')
    # Query text for the Transaction Information table
    query = '''
    SELECT * FROM TRINFORMATION
    '''
    try:
        TRI = pd.read_sql_query(query, conn).reset_index(drop=True)
    finally:
        # Release the ODBC handle even when the query fails, so a bad read
        # does not leave the Access file locked by a dangling connection.
        conn.close()
    return TRI

In [2]:
# Load data: read both Access tables, then join them on the shared key.

# Location of the Access file, relative to the notebook (Windows separators).
dbpath = r'..\..\..\..\banking.accdb'

STR = read_STR(dbpath)
TRI = read_TRI(dbpath)

# Left join: every STR row is kept; TRI columns are attached where KEYID matches.
bf = STR.merge(TRI, how='left', on='KEYID')

In [3]:
# Identify single transactions by known keywords

# Rows whose transaction text is exactly the cash-withdrawal keyword.
# The mask is computed once and used directly for row selection: going through
# bf[mask].index instead would select by label and could hit unrelated rows
# if the index ever contained duplicate labels.
is_cashout = bf['Text_transaction'] == 'BARGELDAUSZAHLUNG'

bf.loc[is_cashout, 'CTYPE'] = 'SINGLE'
bf.loc[is_cashout, 'EPARTYID'] = '0'
bf.loc[is_cashout, 'EPARTYNAME'] = 'CASHOUT'

In [4]:
# Keep everything that is not a cash withdrawal and renumber the rows.
df0 = bf[bf['Text_transaction']!='BARGELDAUSZAHLUNG'].reset_index(drop=True)

# Composite key "<Contact>|<AccNum>|<Use>".
# Series.map(str) calls str() on every value (missing values included), i.e.
# the same conversion the former row-wise lambda applied, but column by column:
# no per-row Series is built and an empty df0 still yields a proper column.
df0['CustomerKEY'] = (
    df0['Contact'].map(str) + '|' + df0['AccNum'].map(str) + '|' + df0['Use'].map(str)
)

In [5]:
# Lower-case each distinct customer key once, then enumerate every ordered
# pair of keys (self-pairs and both orientations included) in product() order.
unique_keys = [key.lower() for key in df0['CustomerKEY'].unique()]
L = [[left, right] for left, right in product(unique_keys, unique_keys)]

In [6]:
# One row per key pair. Naming the columns at construction (instead of renaming
# positional columns 0/1 afterwards) keeps 'A0'/'A1' present even when L is empty.
df1 = pd.DataFrame(L, columns=['A0', 'A1'])

In [7]:
# Break each composite key back into its name / account number / use parts.
# n=2 limits the split to the first two separators, so a '|' occurring inside
# the trailing use text stays part of the use field instead of truncating it.
# Each key is split once per side rather than once per derived column.
for side in ('A0', 'A1'):
    parts = df1[side].str.split('|', n=2, expand=True)
    # Guarantee columns 0..2 exist (an empty frame expands to no columns).
    parts = parts.reindex(columns=range(3))
    df1[side + 'NAME'] = parts[0]
    df1[side + 'NUMBER'] = parts[1]
    df1[side + 'USE'] = parts[2]

In [8]:
# Fuzzy similarity between the two sides of every pair, one score per field.
# The fuzz.ratio result is divided by 100 as before. Iterating over the two
# columns with zip avoids building a row Series for each of the n*n pairs
# (three times over) and also works when df1 has no rows.
for sim_col, field in (('SIM_name', 'NAME'), ('SIM_num', 'NUMBER'), ('SIM_use', 'USE')):
    df1[sim_col] = [
        float(fuzz.ratio(left, right))/100
        for left, right in zip(df1['A0' + field], df1['A1' + field])
    ]

In [9]:
# Keep only the pair identifiers and their three similarity scores.
similarity_columns = ['A0', 'A1', 'SIM_name', 'SIM_num', 'SIM_use']
df2 = df1[similarity_columns]

In [11]:
import plotly.express as px

# 3-D scatter of the three similarity scores, one point per (A0, A1) pair.
# The iris sample frame previously loaded here was never used and is dropped.
fig = px.scatter_3d(
    df2,
    x='SIM_name',
    y='SIM_num',
    z='SIM_use',
    title='Pairwise customer-key similarity (name / account number / use)',
)
fig.show()

In [15]:
from sklearn.cluster import DBSCAN

# Unfitted DBSCAN estimator. eps and min_samples are written out at the
# values scikit-learn documents as its defaults, so they are visible for tuning.
dbscan = DBSCAN(eps=0.5, min_samples=5)

Unnamed: 0,A0,A1,SIM_name,SIM_num,SIM_use
0,nan|0020557849|abrechnung 30 09 2019 siehe anl...,nan|0020557849|abrechnung 30 09 2019 siehe anl...,1.00,1.00,1.00
1,nan|0020557849|abrechnung 30 09 2019 siehe anl...,nan|0000000000|entgeltabrechnung siehe anlage,1.00,0.30,0.73
2,nan|0020557849|abrechnung 30 09 2019 siehe anl...,elaine fernandez|de46763500000044116613|miete ...,0.32,0.25,0.32
3,nan|0020557849|abrechnung 30 09 2019 siehe anl...,slf sportland franken gmbh co kg|de88763500000...,0.17,0.25,0.39
4,nan|0020557849|abrechnung 30 09 2019 siehe anl...,siemens aktiengesellschaft|de53700700100203008...,0.21,0.31,0.32
5,nan|0020557849|abrechnung 30 09 2019 siehe anl...,telekom deutschland gmbh landgrabenweg 151|de6...,0.13,0.38,0.35
6,nan|0020557849|abrechnung 30 09 2019 siehe anl...,estw erlangerstadtwerkeag|de067632007200045367...,0.14,0.38,0.30
7,nan|0020557849|abrechnung 30 09 2019 siehe anl...,igm erlangen|de28500500000083044008|beitrag bi...,0.27,0.31,0.26
8,nan|0020557849|abrechnung 30 09 2019 siehe anl...,siemens ag|de90700500000002055382|lohn gehalt ...,0.31,0.44,0.42
9,nan|0020557849|abrechnung 30 09 2019 siehe anl...,amazon instant video germany gmbh|de0730030880...,0.17,0.25,0.29
