# Financial Base Table

#### Check for library requirements & install if missing

In [47]:
import sys
import subprocess
import importlib

packages = ['pandas', 'numpy','pathlib','os','glob', 'textblob']
[subprocess.check_call(['pip', 'install', pkg]) 
for pkg in packages if not importlib.util.find_spec(pkg)]

[]

#### Import libraries

In [48]:
import pandas as pd
import numpy as np

from pathlib import Path
import pathlib
import os
import glob

from textblob import TextBlob

#### Set working directory

In [49]:
data_folder = Path(os.getcwd() + "/FP_GroupProject/data/raw/data_berka")

In [50]:
# Change to working dir
os.chdir(data_folder)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/inder/Dropbox/My Mac (Inders-MacBook-Pro.local)/Documents/GitHub/fin_datamart/FP_GroupProject/data/raw/data_berka/FP_GroupProject/data/raw/data_berka'

#### Read data

In [None]:
# Loop through data folder and add all df's in a dict
data_dict={}
for txt_file in glob.glob("*.asc"):
    #list filenames
    filename = txt_file
    #create python tablenames
    df_name = str.replace(txt_file,".asc","_raw") 
    #read files
    df_value = pd.read_csv(filename,  delimiter=";")
    #add to dict
    data_dict[df_name] = df_value


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# Split dict into indivisual dataframes
for var in data_dict.keys():
    exec("{} = data_dict['{}']".format(var, var))

In [None]:
data_dict.keys()

dict_keys(['account_raw', 'card_raw', 'order_raw', 'disp_raw', 'loan_raw', 'client_raw', 'district_raw', 'trans_raw'])

In [None]:
# Check for null values
for keys in data_dict.keys():
    print(keys, ":", data_dict[keys].isna().sum().sum())

account_raw : 0
card_raw : 0
order_raw : 0
disp_raw : 0
loan_raw : 0
client_raw : 0
district_raw : 0
trans_raw : 2208738


### Data exploration

In [None]:
account_raw.head()

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,POPLATEK MESICNE,930101
1,3818,74,POPLATEK MESICNE,930101
2,704,55,POPLATEK MESICNE,930101
3,2378,16,POPLATEK MESICNE,930101
4,2632,24,POPLATEK MESICNE,930102


In [None]:
# Translate frequency column to english
freq_dict = {}
for text in account_raw.frequency.unique():
    freq_dict[text] =  (TextBlob(text).translate(to='en').raw)

In [None]:
freq_dict

{'POPLATEK MESICNE': 'MONTHLY FEE',
 'POPLATEK PO OBRATU': 'TURNOVER FEE',
 'POPLATEK TYDNE': 'FEE OF THE WEEK'}

In [None]:
#add translated values in dataframe
account_raw.frequency.replace({"POPLATEK MESICNE": freq_dict["POPLATEK MESICNE"],
                                "POPLATEK PO OBRATU": freq_dict["POPLATEK PO OBRATU"],
                                "POPLATEK TYDNE": freq_dict["POPLATEK TYDNE"]}, inplace=True)

In [None]:
account_raw.head(15)

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,MONTHLY FEE,930101
1,3818,74,MONTHLY FEE,930101
2,704,55,MONTHLY FEE,930101
3,2378,16,MONTHLY FEE,930101
4,2632,24,MONTHLY FEE,930102
5,1972,77,MONTHLY FEE,930102
6,1539,1,TURNOVER FEE,930103
7,793,47,MONTHLY FEE,930103
8,2484,74,MONTHLY FEE,930103
9,1695,76,MONTHLY FEE,930103


In [None]:
#Convert date format
account_raw["date"] = pd.to_datetime("19" + account_raw["date"].astype(str), format='%Y%m%d')

In [None]:
#Split year, month and day
account_raw['acc_open_year'] = pd.DatetimeIndex(account_raw['date']).year
account_raw['acc_open_month'] = pd.DatetimeIndex(account_raw['date']).month
account_raw['acc_open_day'] = pd.DatetimeIndex(account_raw['date']).day

In [None]:
account = account_raw[account_raw['acc_open_year'] < 1996].copy(deep=True)

In [None]:
account.acc_open_year.unique()

array([1993, 1994, 1995])

In [None]:
#Add LOR
account['LOR'] = 1996 - account['acc_open_year']
account.head()

Unnamed: 0,account_id,district_id,frequency,date,acc_open_year,acc_open_month,acc_open_day,LOR
0,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3
1,3818,74,MONTHLY FEE,1993-01-01,1993,1,1,3
2,704,55,MONTHLY FEE,1993-01-01,1993,1,1,3
3,2378,16,MONTHLY FEE,1993-01-01,1993,1,1,3
4,2632,24,MONTHLY FEE,1993-01-02,1993,1,2,3


In [None]:
# Add information about account owner
account = pd.merge(account, disp_raw[disp_raw['type'] == 'OWNER'], how='left', on='account_id')
account = pd.merge(account, client_raw, how='left', on='client_id')
account = account.rename(columns={'district_id_x':'bank_district_id',
                        'district_id_y':'client_district_id'})
account.head()

Unnamed: 0,account_id,bank_district_id,frequency,date,acc_open_year,acc_open_month,acc_open_day,LOR,disp_id,client_id,type,birth_number,client_district_id
0,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,OWNER,365111,74
1,3818,74,MONTHLY FEE,1993-01-01,1993,1,1,3,4601,4601,OWNER,350402,1
2,704,55,MONTHLY FEE,1993-01-01,1993,1,1,3,844,844,OWNER,450114,22
3,2378,16,MONTHLY FEE,1993-01-01,1993,1,1,3,2873,2873,OWNER,755324,16
4,2632,24,MONTHLY FEE,1993-01-02,1993,1,2,3,3177,3177,OWNER,380812,24


In [None]:
# Transform the birth day into year
account['birth_year'] = '19' + account['birth_number'].astype(str).str[:2]
account['birth_year'] = account['birth_year'].astype(int)

# Transform the birth day to day
account['birth_day'] = account['birth_number'].astype(str).str[-2:].astype(int)

# Extract the birth month
account['birth_month'] = account['birth_number'].astype(str).str[2:4].astype(int)

In [None]:
# Extract and correct the gender
account['gender'] = 'M'
account.loc[account['birth_month'] > 50, 'gender'] = 'F'

# Correct the birth month
account.loc[account['birth_month'] > 50, 'birth_month'] = account.loc[account['birth_month'] > 50, 'birth_month'] - 50
account.head()

Unnamed: 0,account_id,bank_district_id,frequency,date,acc_open_year,acc_open_month,acc_open_day,LOR,disp_id,client_id,type,birth_number,client_district_id,birth_year,birth_day,birth_month,gender
0,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,OWNER,365111,74,1936,11,1,F
1,3818,74,MONTHLY FEE,1993-01-01,1993,1,1,3,4601,4601,OWNER,350402,1,1935,2,4,M
2,704,55,MONTHLY FEE,1993-01-01,1993,1,1,3,844,844,OWNER,450114,22,1945,14,1,M
3,2378,16,MONTHLY FEE,1993-01-01,1993,1,1,3,2873,2873,OWNER,755324,16,1975,24,3,F
4,2632,24,MONTHLY FEE,1993-01-02,1993,1,2,3,3177,3177,OWNER,380812,24,1938,12,8,M


In [None]:
# Age
account['age'] = 1996 - account['birth_year']

# Age group
account['age_group'] = account['age'] // 10 * 10
account.head()

Unnamed: 0,account_id,bank_district_id,frequency,date,acc_open_year,acc_open_month,acc_open_day,LOR,disp_id,client_id,type,birth_number,client_district_id,birth_year,birth_day,birth_month,gender,age,age_group
0,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,OWNER,365111,74,1936,11,1,F,60,60
1,3818,74,MONTHLY FEE,1993-01-01,1993,1,1,3,4601,4601,OWNER,350402,1,1935,2,4,M,61,60
2,704,55,MONTHLY FEE,1993-01-01,1993,1,1,3,844,844,OWNER,450114,22,1945,14,1,M,51,50
3,2378,16,MONTHLY FEE,1993-01-01,1993,1,1,3,2873,2873,OWNER,755324,16,1975,24,3,F,21,20
4,2632,24,MONTHLY FEE,1993-01-02,1993,1,2,3,3177,3177,OWNER,380812,24,1938,12,8,M,58,50


In [None]:
#Drop unwanted columns
account_dropped_cols = ["birth_number","date"]
account.drop(account_dropped_cols, axis=1)

Unnamed: 0,account_id,bank_district_id,frequency,acc_open_year,acc_open_month,acc_open_day,LOR,disp_id,client_id,type,client_district_id,birth_year,birth_day,birth_month,gender,age,age_group
0,576,55,MONTHLY FEE,1993,1,1,3,692,692,OWNER,74,1936,11,1,F,60,60
1,3818,74,MONTHLY FEE,1993,1,1,3,4601,4601,OWNER,1,1935,2,4,M,61,60
2,704,55,MONTHLY FEE,1993,1,1,3,844,844,OWNER,22,1945,14,1,M,51,50
3,2378,16,MONTHLY FEE,1993,1,1,3,2873,2873,OWNER,16,1975,24,3,F,21,20
4,2632,24,MONTHLY FEE,1993,1,2,3,3177,3177,OWNER,24,1938,12,8,M,58,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2234,4462,73,FEE OF THE WEEK,1995,12,27,1,5384,5384,OWNER,73,1935,21,7,M,61,60
2235,3814,74,MONTHLY FEE,1995,12,27,1,4596,4596,OWNER,74,1973,31,8,F,23,20
2236,2780,63,MONTHLY FEE,1995,12,29,1,3357,3357,OWNER,63,1954,21,7,F,42,40
2237,3273,74,MONTHLY FEE,1995,12,29,1,3962,3962,OWNER,74,1952,28,11,M,44,40


In [None]:
#Add age group description column
age_bkt = [
    (account["age"].between(0, 17)),
    (account["age"].between(18, 35)),
    (account["age"].between(36, 55)),
    (account["age"] > 56)
    ]

# create a list of the value for each condition
age_bkt_vals = ["youth", "young adult", "adult", "senior"]

In [None]:
account["age_grp_desc"] = np.select(age_bkt, age_bkt_vals)
account.head()

Unnamed: 0,account_id,bank_district_id,frequency,date,acc_open_year,acc_open_month,acc_open_day,LOR,disp_id,client_id,type,birth_number,client_district_id,birth_year,birth_day,birth_month,gender,age,age_group,age_grp_desc
0,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,OWNER,365111,74,1936,11,1,F,60,60,senior
1,3818,74,MONTHLY FEE,1993-01-01,1993,1,1,3,4601,4601,OWNER,350402,1,1935,2,4,M,61,60,senior
2,704,55,MONTHLY FEE,1993-01-01,1993,1,1,3,844,844,OWNER,450114,22,1945,14,1,M,51,50,adult
3,2378,16,MONTHLY FEE,1993-01-01,1993,1,1,3,2873,2873,OWNER,755324,16,1975,24,3,F,21,20,young adult
4,2632,24,MONTHLY FEE,1993-01-02,1993,1,2,3,3177,3177,OWNER,380812,24,1938,12,8,M,58,50,senior


### Transaction table

In [None]:
trans_raw

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,930101,PRIJEM,VKLAD,700.0,700.0,,,
1,171812,576,930101,PRIJEM,VKLAD,900.0,900.0,,,
2,207264,704,930101,PRIJEM,VKLAD,1000.0,1000.0,,,
3,1117247,3818,930101,PRIJEM,VKLAD,600.0,600.0,,,
4,579373,1972,930102,PRIJEM,VKLAD,400.0,400.0,,,
...,...,...,...,...,...,...,...,...,...,...
1056315,3626622,2906,981231,PRIJEM,,62.3,13729.4,UROK,,
1056316,3627616,2935,981231,PRIJEM,,81.3,19544.9,UROK,,
1056317,3625403,2869,981231,PRIJEM,,60.2,14638.2,UROK,,
1056318,3626683,2907,981231,PRIJEM,,107.5,23453.0,UROK,,


In [None]:
trans_raw.isna().sum()

trans_id           0
account_id         0
date               0
type               0
operation     183114
amount             0
balance            0
k_symbol      481881
bank          782812
account       760931
dtype: int64

In [None]:
#Add date columns
trans_raw['date'] = '19' + trans_raw['date'].astype(str)
trans_raw['date'] = pd.to_datetime(trans_raw['date'], format='%Y%m%d').dt.strftime("%Y-%m-%d")
trans_raw['trans_yr'] = trans_raw['date'].str[:4]
trans_raw['trans_mth'] = trans_raw['date'].str[5:7]
trans_raw['trans_day'] = trans_raw['date'].str[8:]

In [None]:
# create a dictionary to identify credit and withdrawl
type_dict = {'PRIJEM':'credit','VYDAJ':'withdrawl', 'VYBER':'withdrawl' }

In [None]:
#map dictionary to base dataframe for credit and withdrawl
trans_raw['trans_type_en'] = trans_raw['type'].map(type_dict)

In [None]:
#check if any olumn of trans doesn't have values that should be in Operations
print(trans_raw[trans_raw['type'] == 'VYBER']['operation'].unique())
print(trans_raw[trans_raw['type'] == 'PREVOD NA UCET']['operation'].unique())
print(trans_raw[trans_raw['type'] == "PREVOD Z UCTU"]['operation'].unique())
print(trans_raw[trans_raw['type'] == 'VKLAD']['operation'].unique())
print(trans_raw[trans_raw['type'] == "VYBER KARTOU"]['operation'].unique())

['VYBER']
[]
[]
[]
[]


In [None]:
# create a dictionary to identify credit and withdrawl
type_dict = {'PRIJEM':'credit','VYDAJ':'withdrawl', 'VYBER':'withdrawl' }

#map dictionary to base dataframe for credit and withdrawl
trans_raw['trans_type_en'] = trans_raw['type'].map(type_dict)

##Replace values for OPerations columns as well
trans_raw.loc[trans_raw['operation'] == 'VYBER KARTOU','operation'] = 'cc withdrawal'
trans_raw.loc[trans_raw['operation'] == 'VKLAD','operation'] = 'credit in cash'
trans_raw.loc[trans_raw['operation'] == 'PREVOD Z UCTU','operation'] = 'collection from another bank'
trans_raw.loc[trans_raw['operation'] == 'VYDAJ','operation'] = 'Withdrawal in cash'
trans_raw.loc[trans_raw['operation'] == 'PREVOD NA UCET','operation'] = 'remittance to another bank'

In [None]:
#check if there is any missing value in operation by relating it with the type
print(trans_raw[trans_raw['type'] == 'withdrawal']['operation'].isna().sum())
print(trans_raw[trans_raw['type'] == "credit"]['operation'].isna().sum())

0
0


In [None]:
#Replace the values for k_symbol column as well
trans_raw.loc[trans_raw.k_symbol.isna(), 'k_symbol is NA'] = 'Missing'
trans_raw.loc[trans_raw['k_symbol is NA'].isna(), 'k_symbol is NA'] = 'No'

trans_raw.loc[trans_raw['k_symbol'] == "POJISTNE", "k_symbol"]= 'insurance'
trans_raw.loc[trans_raw['k_symbol'] == "SLUZBY", "k_symbol"]= 'payment for statement'
trans_raw.loc[trans_raw['k_symbol'] == "UROK", "k_symbol"]= 'interest credited'
trans_raw.loc[trans_raw['k_symbol'] == "SANKC. UROK", "k_symbol"]= 'sanctions'
trans_raw.loc[trans_raw['k_symbol'] == "SIPO", "k_symbol"]= 'household' 
trans_raw.loc[trans_raw['k_symbol'] == "DUCHOD", "k_symbol"]= 'old age pension'
trans_raw.loc[trans_raw['k_symbol'] == "UVER", "k_symbol"]= 'loan payment'

#Making sure that there are no wrong values
trans_raw.loc[trans_raw.k_symbol.isna(), 'k_symbol'] = None
trans_raw.loc[trans_raw.k_symbol == ' ', 'k_symbol'] = None

print(trans_raw.k_symbol.unique())
print(' ')

print(trans_raw[trans_raw['type'] == 'withdrawal']['k_symbol'].isna().sum())
print(trans_raw[trans_raw['type'] == "credit"]['k_symbol'].isna().sum())

[None 'old age pension' 'interest credited' 'household'
 'payment for statement' 'insurance' 'sanctions' 'loan payment']
 
0
0


In [None]:
trans_raw

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,trans_yr,trans_mth,trans_day,trans_type_en,k_symbol is NA
0,695247,2378,1993-01-01,PRIJEM,credit in cash,700.0,700.0,,,,1993,01,01,credit,Missing
1,171812,576,1993-01-01,PRIJEM,credit in cash,900.0,900.0,,,,1993,01,01,credit,Missing
2,207264,704,1993-01-01,PRIJEM,credit in cash,1000.0,1000.0,,,,1993,01,01,credit,Missing
3,1117247,3818,1993-01-01,PRIJEM,credit in cash,600.0,600.0,,,,1993,01,01,credit,Missing
4,579373,1972,1993-01-02,PRIJEM,credit in cash,400.0,400.0,,,,1993,01,02,credit,Missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,3626622,2906,1998-12-31,PRIJEM,,62.3,13729.4,interest credited,,,1998,12,31,credit,No
1056316,3627616,2935,1998-12-31,PRIJEM,,81.3,19544.9,interest credited,,,1998,12,31,credit,No
1056317,3625403,2869,1998-12-31,PRIJEM,,60.2,14638.2,interest credited,,,1998,12,31,credit,No
1056318,3626683,2907,1998-12-31,PRIJEM,,107.5,23453.0,interest credited,,,1998,12,31,credit,No


In [None]:
#check  outliers in the amount
trans_raw.amount.quantile([0.25,0.5,0.75]) #1st quartile is 135.9 and third is 6800$

quart1 = pd.DataFrame(trans_raw.amount.quantile([0.25,0.5,0.75])).loc[0.25,] / 2
quart3 = 2 * pd.DataFrame(trans_raw.amount.quantile([0.25,0.5,0.75])).loc[0.75,:]

print(quart1)
print(quart3)

amount    67.95
Name: 0.25, dtype: float64
amount    13600.0
Name: 0.75, dtype: float64


In [None]:
#add a column outlier where we specify rows that we want to update
trans_raw.loc[(trans_raw.amount > 13600) | (trans_raw.amount < 67.95),'outlier'] = 'Yes'
trans_raw.loc[trans_raw.outlier.isna(),'outlier'] = 'No'

# if considering outliers as 2 * 3rd Quartiles, then  replace the outliers by 2 * 3rd quartile
trans_raw.loc[trans_raw.amount > 13600,'amount'] = 13600

# if considering outliers as 1st Quartiles / 2, then replace the outliers by 1st quartile/2
trans_raw.loc[trans_raw.amount < 67.95 ,'amount'] = 67.95

In [None]:
b = pd.DataFrame(trans_raw[['bank','account']])
b.loc[b.account.notna(),'bank'].isna().sum()

21881

In [None]:
trans_raw.loc[trans_raw.bank.isna(),'Bank is na'] = 'Missing'
trans_raw.loc[trans_raw['Bank is na'].isna(),'Bank is na'] = 'No'

trans_raw.loc[trans_raw.bank.isna(),'bank'] = 'Other'

trans_raw.loc[trans_raw.account.isna(),'Account is na'] = 'Missing'
trans_raw.loc[trans_raw['Account is na'].isna(),'Account is na'] = 'No'

trans_raw.loc[trans_raw.account.isna(),'account'] = 'Other'
trans_raw.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,trans_yr,trans_mth,trans_day,trans_type_en,k_symbol is NA,outlier,Bank is na,Account is na
0,695247,2378,1993-01-01,PRIJEM,credit in cash,700.0,700.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing
1,171812,576,1993-01-01,PRIJEM,credit in cash,900.0,900.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing
2,207264,704,1993-01-01,PRIJEM,credit in cash,1000.0,1000.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing
3,1117247,3818,1993-01-01,PRIJEM,credit in cash,600.0,600.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing
4,579373,1972,1993-01-02,PRIJEM,credit in cash,400.0,400.0,,Other,Other,1993,1,2,credit,Missing,No,Missing,Missing


In [None]:
# replace null of trans table per customer for the joins

Ntrans = trans_raw.pivot_table(index = 'account_id',
                          columns = 'k_symbol',
                          values = 'amount',
                          aggfunc = sum)
Ntrans = Ntrans.fillna(0)
Ntrans.head()

k_symbol,household,insurance,interest credited,loan payment,old age pension,payment for statement,sanctions
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,98080.0,0.0,3310.25,0.0,0.0,2718.0,0.0
2,472290.0,0.0,10771.95,80944.8,0.0,4416.75,0.0
3,14755.0,46007.0,2125.0,0.0,0.0,883.35,0.0
4,60262.0,0.0,2756.25,0.0,188802.0,1970.55,0.0
5,37352.0,0.0,1626.45,0.0,95323.0,951.3,0.0


In [None]:
# Select on transactions for 1996
txn_96 = trans_raw[trans_raw['date'].astype(str).str[:2].isin(['96'])]

In [None]:
trans_raw

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,trans_yr,trans_mth,trans_day,trans_type_en,k_symbol is NA,outlier,Bank is na,Account is na
0,695247,2378,1993-01-01,PRIJEM,credit in cash,700.00,700.0,,Other,Other,1993,01,01,credit,Missing,No,Missing,Missing
1,171812,576,1993-01-01,PRIJEM,credit in cash,900.00,900.0,,Other,Other,1993,01,01,credit,Missing,No,Missing,Missing
2,207264,704,1993-01-01,PRIJEM,credit in cash,1000.00,1000.0,,Other,Other,1993,01,01,credit,Missing,No,Missing,Missing
3,1117247,3818,1993-01-01,PRIJEM,credit in cash,600.00,600.0,,Other,Other,1993,01,01,credit,Missing,No,Missing,Missing
4,579373,1972,1993-01-02,PRIJEM,credit in cash,400.00,400.0,,Other,Other,1993,01,02,credit,Missing,No,Missing,Missing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1056315,3626622,2906,1998-12-31,PRIJEM,,67.95,13729.4,interest credited,Other,Other,1998,12,31,credit,No,Yes,Missing,Missing
1056316,3627616,2935,1998-12-31,PRIJEM,,81.30,19544.9,interest credited,Other,Other,1998,12,31,credit,No,No,Missing,Missing
1056317,3625403,2869,1998-12-31,PRIJEM,,67.95,14638.2,interest credited,Other,Other,1998,12,31,credit,No,Yes,Missing,Missing
1056318,3626683,2907,1998-12-31,PRIJEM,,107.50,23453.0,interest credited,Other,Other,1998,12,31,credit,No,No,Missing,Missing


In [None]:
# Aggregate the total credit per account
trans_agg_credit = txn_96[txn_96['type'].isin(['PRIJEM'])].groupby('account_id')['amount'].agg('sum')
trans_agg_credit = trans_agg_credit.reset_index()
trans_agg_credit = trans_agg_credit.rename(columns={'amount':'total_credit'})


# Aggregate the total withdrawal per account
trans_agg_withdrawal = txn_96[txn_96['type'].isin(['VYDAJ', 'VYBER'])].groupby('account_id')['amount'].agg('sum')
trans_agg_withdrawal = trans_agg_withdrawal.reset_index()
trans_agg_withdrawal = trans_agg_withdrawal.rename(columns={'amount':'total_withdrawal'})


# Join with previous table
trans = pd.merge(trans_raw, trans_agg_credit, how='left', on='account_id')
trans = pd.merge(trans, trans_agg_withdrawal, how='left', on='account_id')
trans.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,trans_yr,trans_mth,trans_day,trans_type_en,k_symbol is NA,outlier,Bank is na,Account is na,total_credit,total_withdrawal
0,695247,2378,1993-01-01,PRIJEM,credit in cash,700.0,700.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing,,
1,171812,576,1993-01-01,PRIJEM,credit in cash,900.0,900.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing,,
2,207264,704,1993-01-01,PRIJEM,credit in cash,1000.0,1000.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing,,
3,1117247,3818,1993-01-01,PRIJEM,credit in cash,600.0,600.0,,Other,Other,1993,1,1,credit,Missing,No,Missing,Missing,,
4,579373,1972,1993-01-02,PRIJEM,credit in cash,400.0,400.0,,Other,Other,1993,1,2,credit,Missing,No,Missing,Missing,,


In [51]:
#create summarized trannsaction type columns
trans = trans_raw.drop(columns = ['trans_yr', 'trans_mth', 'trans_day', 'type'])

trans['First Transaction'] = trans['date']
trans['Last Transaction'] = trans['date']


trans['credits'] = 0
trans['withdrawals'] = 0

trans.loc[trans['trans_type_en'] == 'credit', 'credits'] = 1
trans.loc[trans['trans_type_en'] == 'withdrawal', 'withdrawals'] = 1

In [None]:
trans["cash_credit"] = 0
trans["bank_transfer_credit"] = 0
trans["withdrawal_cash"] = 0
trans["unknown"] = 0
trans["withdrawal_remittance_bank_transfer"] = 0
trans["withdrawal_credit_card"] = 0

trans.loc[trans['Operation Type'] == 'Credit - Cash', 'Credit - Cash'] = trans.loc[trans['Operation Type'] == 'Credit - Cash','Transaction Amount']
trans.loc[trans['Operation Type'] == 'Credit - Bank Transfer', 'Credit - Bank Transfer'] = trans.loc[trans['Operation Type'] == 'Credit - Bank Transfer','Transaction Amount']
trans.loc[trans['Operation Type'] == 'Withdrawal - Cash', 'Withdrawal - Cash'] = trans.loc[trans['Operation Type'] == 'Withdrawal - Cash','Transaction Amount']
trans.loc[trans['Operation Type'] == 'Unknown', 'Unknown'] = trans.loc[trans['Operation Type'] == 'Unknown','Transaction Amount']
trans.loc[trans['Operation Type'] == 'Withdrawal - Remmitance Bank Transfer', 'Withdrawal - Remittance Bank Transfer'] = trans.loc[trans['Operation Type'] == 'Withdrawal - Remmitance Bank Transfer','Transaction Amount']
trans.loc[trans['Operation Type'] == 'Withdrawal - Credit card', 'Withdrawal - Credit card'] = trans.loc[trans['Operation Type'] == 'Withdrawal - Credit card','Transaction Amount']



trans = trans.drop(columns = ['Date of Transaction','Transaction Type', 'Operation Type'])

In [57]:
pd.merge(account[account["account_id"]==576], trans, how='left', on='account_id')

Unnamed: 0,account_id,bank_district_id,frequency,date_x,acc_open_year,acc_open_month,acc_open_day,LOR,disp_id,client_id,...,k_symbol,bank,account,trans_type_en,k_symbol is NA,outlier,Bank is na,Account is na,total_credit,total_withdrawal
0,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,,Other,Other,credit,Missing,No,Missing,Missing,76097.3,71059.4
1,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,old age pension,YZ,30300313.0,credit,No,No,No,No,76097.3,71059.4
2,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,interest credited,Other,Other,credit,No,Yes,Missing,Missing,76097.3,71059.4
3,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,old age pension,YZ,30300313.0,credit,No,No,No,No,76097.3,71059.4
4,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,interest credited,Other,Other,credit,No,Yes,Missing,Missing,76097.3,71059.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,,Other,Other,withdrawl,Missing,No,Missing,Missing,76097.3,71059.4
354,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,old age pension,YZ,30300313.0,credit,No,No,No,No,76097.3,71059.4
355,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,household,OP,71033382.0,withdrawl,No,No,No,No,76097.3,71059.4
356,576,55,MONTHLY FEE,1993-01-01,1993,1,1,3,692,692,...,,Other,Other,withdrawl,Missing,No,Missing,Missing,76097.3,71059.4


In [58]:
#Drop unwanted columns
account_dropped_cols = ["birth_number","date"]
account.drop(account_dropped_cols, axis=1)

Unnamed: 0,account_id,bank_district_id,frequency,acc_open_year,acc_open_month,acc_open_day,LOR,disp_id,client_id,type,client_district_id,birth_year,birth_day,birth_month,gender,age,age_group,age_grp_desc
0,576,55,MONTHLY FEE,1993,1,1,3,692,692,OWNER,74,1936,11,1,F,60,60,senior
1,3818,74,MONTHLY FEE,1993,1,1,3,4601,4601,OWNER,1,1935,2,4,M,61,60,senior
2,704,55,MONTHLY FEE,1993,1,1,3,844,844,OWNER,22,1945,14,1,M,51,50,adult
3,2378,16,MONTHLY FEE,1993,1,1,3,2873,2873,OWNER,16,1975,24,3,F,21,20,young adult
4,2632,24,MONTHLY FEE,1993,1,2,3,3177,3177,OWNER,24,1938,12,8,M,58,50,senior
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2234,4462,73,FEE OF THE WEEK,1995,12,27,1,5384,5384,OWNER,73,1935,21,7,M,61,60,senior
2235,3814,74,MONTHLY FEE,1995,12,27,1,4596,4596,OWNER,74,1973,31,8,F,23,20,young adult
2236,2780,63,MONTHLY FEE,1995,12,29,1,3357,3357,OWNER,63,1954,21,7,F,42,40,adult
2237,3273,74,MONTHLY FEE,1995,12,29,1,3962,3962,OWNER,74,1952,28,11,M,44,40,adult
