In [1]:
# Import banking data
# Note: This data was extracted on 1999

# Install: conda install -c anaconda pandasql
# Use SQL in pandas DataFrame
# Ref: https://pypi.python.org/pypi/pandasql
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())  # Allow sqldf to access global environment

import pandas as pd

# Each record describes characteristics of a client
client = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/client.asc', sep=';')

# Each record describes static characteristics of an account
account = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/account.asc', sep=';')

# Each record describes a credit card issued to an account
card = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/card.asc', sep=';')

# Each record describes demographic characteristics of a district
district = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/district.asc', sep=';')

# Each record relates together a client with an account
# i.e. this relation describes the rights of clients to operate accounts
disp = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/disp.asc', sep=';')

# Each record describes characteristics of a payment order (debits only)
order = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/order.asc', sep=';')

# Each record describes one transaction on an account
trans = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/trans.asc', sep=';', low_memory=False)

# Each record describes a loan granted for a given account
loan = pd.read_csv('C:/Users/ashkurin/Documents/FP/Section2/data/data_berka/loan.asc', sep=';')

In [2]:
#Set independent variables in the timeline

ind_var_trans = trans[trans['date'].astype(str).str[:2].isin(['96'])]

ind_var_acc = account[account['date'].astype(str).str[:2].isin(['96'])]

#Set dependent variables in the timeline

dep_var = trans[trans['date'].astype(str).str[:2].isin(['97'])]

In [3]:
#Transactions per account ID

trans_per_acc_id = ind_var_trans.groupby("account_id")["trans_id"].agg("count")
trans_per_acc_id

account_id
1         74
2         85
4         39
6         63
7          4
        ... 
11333     88
11349     88
11359     95
11362    102
11382     82
Name: trans_id, Length: 3602, dtype: int64

In [4]:
#Amount of withdrawl per account ID

AmountOfWithdrawl = pysqldf("""
SELECT count(*) AS count
FROM ind_var_trans
WHERE type = "VYDAJ"
GROUP BY account_id
""")
AmountOfWithdrawl

Unnamed: 0,count
0,45
1,51
2,22
3,39
4,1
...,...
3294,35
3295,60
3296,69
3297,78


In [5]:
#Amount of credit per account ID

AmountOfCredit = pysqldf("""
SELECT count(*) AS count
FROM ind_var_trans
WHERE type = "PRIJEM"
GROUP BY account_id
""")
AmountOfCredit

Unnamed: 0,count
0,29
1,29
2,17
3,24
4,3
...,...
3596,43
3597,25
3598,24
3599,24


In [None]:
df = pd.merge(ind_var_acc, disp[disp['type'] == 'OWNER'], how='left', on='account_id')
df = pd.merge(df, client, how='left', on='client_id')
df = df.rename(columns={'district_id_x':'bank_district_id',
                        'district_id_y':'client_district_id'})