# Financial Base Table

#### Check for library requirements & install if missing

In [1]:
import importlib
import importlib.util
import subprocess
import sys

# Packages this notebook relies on.  pathlib, os and glob ship with Python, so
# they always resolve and are never installed; they are kept for compatibility.
packages = ['pandas', 'numpy','pathlib','os','glob', 'textblob']

# Collect the packages that cannot be located in the current environment.
missing = [pkg for pkg in packages if importlib.util.find_spec(pkg) is None]

for pkg in missing:
    # Run pip through the interpreter executing this notebook so the package
    # lands in the kernel's own environment; a bare `pip` on PATH may belong
    # to a different Python installation.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg])

# Show which packages had to be installed (empty list when none were missing).
missing

[]

#### Import libraries

In [2]:
import pandas as pd
import numpy as np

from pathlib import Path
import pathlib
import os
import glob

from textblob import TextBlob

#### Set working directory

In [3]:
# Folder holding the raw Berka files, located below the directory the notebook
# is currently running in.  Joining with pathlib (instead of concatenating a
# "/..." string onto os.getcwd()) keeps the result well-formed even when the
# current directory is the filesystem root.
data_folder = Path.cwd() / "FP_GroupProject" / "data" / "raw" / "data_berka"

In [4]:
# Make the data folder the working directory so that the relative "*.asc"
# glob used when reading the data resolves inside it.
# NOTE(review): this changes process-wide state; data_folder is derived from
# os.getcwd(), so re-running that cell after this one would nest the path.
# Confirm the cells are meant to run once, in order.
os.chdir(data_folder)

#### Read data

In [5]:
# Read every ';'-delimited *.asc file in the working directory into a dict of
# DataFrames. Each key is the file name with ".asc" replaced by "_raw".
data_dict = {
    asc_path.replace(".asc", "_raw"): pd.read_csv(asc_path, delimiter=";")
    for asc_path in glob.glob("*.asc")
}


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
# Split dict into individual dataframes: bind every entry of data_dict to a
# notebook-level variable named after its key (e.g. account_raw, trans_raw).
# The namespace is updated directly rather than by exec()-ing generated
# source, which would fail - or execute unintended code - for any key that is
# not a valid Python identifier.
globals().update(data_dict)

In [7]:
# List the names of the tables that were loaded.
data_dict.keys()

dict_keys(['account_raw', 'card_raw', 'order_raw', 'disp_raw', 'loan_raw', 'client_raw', 'district_raw', 'trans_raw'])

In [8]:
# Report the total number of missing cells in each loaded table
for table_name, table in data_dict.items():
    missing_cells = table.isna().sum().sum()
    print(f"{table_name} : {missing_cells}")

account_raw : 0
card_raw : 0
order_raw : 0
disp_raw : 0
loan_raw : 0
client_raw : 0
district_raw : 0
trans_raw : 2208738


### Data exploration

In [9]:
# Preview the first rows of the account table.
account_raw.head()

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,POPLATEK MESICNE,930101
1,3818,74,POPLATEK MESICNE,930101
2,704,55,POPLATEK MESICNE,930101
3,2378,16,POPLATEK MESICNE,930101
4,2632,24,POPLATEK MESICNE,930102


In [10]:
# Translate frequency column to english
# Fixed Czech -> English labels for the frequency codes already known in this
# data set.  The English strings are exactly the ones shown in the recorded
# freq_dict output, so downstream values are unchanged.  Using them avoids one
# network request per value and keeps the cell working with newer TextBlob
# releases, where translate() was deprecated and later removed.
known_translations = {
    'POPLATEK MESICNE': 'MONTHLY FEE',
    'POPLATEK PO OBRATU': 'TURNOVER FEE',
    'POPLATEK TYDNE': 'FEE OF THE WEEK',
}

freq_dict = {}
for text in account_raw.frequency.unique():
    if text in known_translations:
        freq_dict[text] = known_translations[text]
    else:
        # Unknown code: fall back to the online translation used originally.
        freq_dict[text] = (TextBlob(text).translate(to='en').raw)

In [11]:
# Show the Czech -> English mapping for the frequency values.
freq_dict

{'POPLATEK MESICNE': 'MONTHLY FEE',
 'POPLATEK PO OBRATU': 'TURNOVER FEE',
 'POPLATEK TYDNE': 'FEE OF THE WEEK'}

In [12]:
#add translated values in dataframe
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `account_raw.frequency`: that chained access
# can operate on a temporary Series, in which case the in-place edit never
# reaches the DataFrame (pandas warns about this pattern, and with
# Copy-on-Write it has no effect at all).
# freq_dict is keyed by the unique frequency values, so it is passed as-is
# rather than being rebuilt key by key.
account_raw['frequency'] = account_raw['frequency'].replace(freq_dict)

In [13]:
# Inspect the first 15 rows to check the translated frequency values.
account_raw.head(15)

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,MONTHLY FEE,930101
1,3818,74,MONTHLY FEE,930101
2,704,55,MONTHLY FEE,930101
3,2378,16,MONTHLY FEE,930101
4,2632,24,MONTHLY FEE,930102
5,1972,77,MONTHLY FEE,930102
6,1539,1,TURNOVER FEE,930103
7,793,47,MONTHLY FEE,930103
8,2484,74,MONTHLY FEE,930103
9,1695,76,MONTHLY FEE,930103


In [14]:
# Create column year
# The first two characters of the date value are the two-digit year;
# adding 1900 turns it into a four-digit year.
year_digits = account_raw['date'].astype(str).str[:2]
account_raw['year'] = year_digits.astype(int) + 1900

# Work on an independent copy restricted to rows whose year is before 1996
before_1996 = account_raw['year'] < 1996
df = account_raw.loc[before_1996].copy()
df.head()

Unnamed: 0,account_id,district_id,frequency,date,year
0,576,55,MONTHLY FEE,930101,1993
1,3818,74,MONTHLY FEE,930101,1993
2,704,55,MONTHLY FEE,930101,1993
3,2378,16,MONTHLY FEE,930101,1993
4,2632,24,MONTHLY FEE,930102,1993


In [15]:
#Add LOR
# lor = 1997 minus the year column
# (presumably "length of relationship" in years - TODO confirm).
lor_reference_year = 1997
df['lor'] = lor_reference_year - df['year']
df.head()

Unnamed: 0,account_id,district_id,frequency,date,year,lor
0,576,55,MONTHLY FEE,930101,1993,4
1,3818,74,MONTHLY FEE,930101,1993,4
2,704,55,MONTHLY FEE,930101,1993,4
3,2378,16,MONTHLY FEE,930101,1993,4
4,2632,24,MONTHLY FEE,930102,1993,4


In [16]:
# Add information about account owner
# Keep only the disposition rows of type OWNER, attach them to each account,
# then attach the matching client record.
owner_disp = disp_raw.loc[disp_raw['type'] == 'OWNER']
df = df.merge(owner_disp, how='left', on='account_id')
df = df.merge(client_raw, how='left', on='client_id')

# Both the account and the client table carry a district_id column; the second
# merge suffixes them _x / _y, so give them descriptive names.
district_names = {
    'district_id_x': 'bank_district_id',
    'district_id_y': 'client_district_id',
}
df = df.rename(columns=district_names)
df.head()

Unnamed: 0,account_id,bank_district_id,frequency,date,year,lor,disp_id,client_id,type,birth_number,client_district_id
0,576,55,MONTHLY FEE,930101,1993,4,692,692,OWNER,365111,74
1,3818,74,MONTHLY FEE,930101,1993,4,4601,4601,OWNER,350402,1
2,704,55,MONTHLY FEE,930101,1993,4,844,844,OWNER,450114,22
3,2378,16,MONTHLY FEE,930101,1993,4,2873,2873,OWNER,755324,16
4,2632,24,MONTHLY FEE,930102,1993,4,3177,3177,OWNER,380812,24


In [17]:
# Decode birth_number, handled here as a YYMMDD integer whose month part is
# raised by 50 for women (month > 50 is mapped to 'F' and reduced by 50 below).
# Integer arithmetic replaces slicing of str(birth_number): an integer loses
# its leading zero, so string positions shift for two-digit years below 10
# (e.g. 50101 would be sliced as year "50", month "10").
birth_number = df['birth_number'].astype(int)

# Transform the birth day into year
df['birth_year'] = 1900 + birth_number // 10000

# Transform the birth day to day
df['birth_day'] = birth_number % 100

# Extract the birth month (still carrying the +50 offset for women)
df['birth_month'] = birth_number // 100 % 100

# Extract and correct the gender
is_female = df['birth_month'] > 50
df['gender'] = 'M'
df.loc[is_female, 'gender'] = 'F'

# Correct the birth month
df.loc[is_female, 'birth_month'] -= 50
df.head()

Unnamed: 0,account_id,bank_district_id,frequency,date,year,lor,disp_id,client_id,type,birth_number,client_district_id,birth_year,birth_day,birth_month,gender
0,576,55,MONTHLY FEE,930101,1993,4,692,692,OWNER,365111,74,1936,11,1,F
1,3818,74,MONTHLY FEE,930101,1993,4,4601,4601,OWNER,350402,1,1935,2,4,M
2,704,55,MONTHLY FEE,930101,1993,4,844,844,OWNER,450114,22,1945,14,1,M
3,2378,16,MONTHLY FEE,930101,1993,4,2873,2873,OWNER,755324,16,1975,24,3,F
4,2632,24,MONTHLY FEE,930102,1993,4,3177,3177,OWNER,380812,24,1938,12,8,M


In [20]:
# Age
# Computed as 1996 minus the birth year.
age_reference_year = 1996
df['age'] = age_reference_year - df['birth_year']

# Age group: the age rounded down to the start of its decade (e.g. 61 -> 60)
df['age_group'] = df['age'] - df['age'] % 10
df.head()

Unnamed: 0,account_id,bank_district_id,frequency,date,year,lor,disp_id,client_id,type,birth_number,client_district_id,birth_year,birth_day,birth_month,gender,age,age_group
0,576,55,MONTHLY FEE,930101,1993,4,692,692,OWNER,365111,74,1936,11,1,F,60,60
1,3818,74,MONTHLY FEE,930101,1993,4,4601,4601,OWNER,350402,1,1935,2,4,M,61,60
2,704,55,MONTHLY FEE,930101,1993,4,844,844,OWNER,450114,22,1945,14,1,M,51,50
3,2378,16,MONTHLY FEE,930101,1993,4,2873,2873,OWNER,755324,16,1975,24,3,F,21,20
4,2632,24,MONTHLY FEE,930102,1993,4,3177,3177,OWNER,380812,24,1938,12,8,M,58,50
