In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
os.getcwd()

'C:\\Users\\iaros\\My_documents\\Education\\projects\\iaros_data_berka_analysis\\notebooks'

In [4]:
os.chdir('C:\\Users\\iaros\\My_documents\\Education\\projects\\iaros_data_berka_analysis\\')

In [5]:
os.listdir('data/raw_data')

['account.asc',
 'card.asc',
 'client.asc',
 'disp.asc',
 'district.asc',
 'loan.asc',
 'order.asc',
 'trans.asc']

## Таблица account

| item        | meaning                             | remark                                                                                                                                                              |
| ----------- | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| account_id  | identification of the account       |                                                                                                                                                                     |
| district_id | location of the branch              |                                                                                                                                                                     |
| date        | date of creating of the account     | in the form YYMMDD                                                                                                                                                  |
| frequency   | frequency of issuance of statements | "POPLATEK MESICNE" stands for monthly issuance<br><br>"POPLATEK TYDNE" stands for weekly issuance<br><br>"POPLATEK PO OBRATU" stands for issuance after transaction |

***frequency of issuance of statements** - частота выдачи выписок по счету. "Ежемесячно", "Еженедельно", "После транзакции"

In [220]:
account_df = pd.read_csv('data/raw_data/account.asc', sep=';', dtype={'date':'str'})
account_df.head(3)

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,POPLATEK MESICNE,930101
1,3818,74,POPLATEK MESICNE,930101
2,704,55,POPLATEK MESICNE,930101


In [221]:
# изначальный размер датафрейма
account_initial_shape = account_df.shape

In [222]:
account_df.dtypes

account_id      int64
district_id     int64
frequency      object
date           object
dtype: object

In [223]:
# меняем тип date на datetime64[ns]
account_df.date = pd.to_datetime(account_df.date, format='%y%m%d')
account_df.date.dt.year.unique()

array([1993, 1994, 1995, 1996, 1997], dtype=int32)

In [224]:
# Функция проверки на NaN в любом месте датафрейма
def check_for_nans(df: pd.DataFrame):
    rows_isna = df.isna().any(axis=1)
    
    if rows_isna.any():
        print("Following rows have missing values!")
        return df[rows_isna]
    else:
        print("No missing values")

In [225]:
# проверяем account_df на наличие NaN
check_for_nans(account_df)

No missing values


In [226]:
# фактические значения в колонке frequency
account_df.frequency.unique()

array(['POPLATEK MESICNE', 'POPLATEK PO OBRATU', 'POPLATEK TYDNE'],
      dtype=object)

In [227]:
# меняем значения в frequency с чесшского на англоязычные. Функция

freq_mapping = {'POPLATEK MESICNE':'monthly issuance', 'POPLATEK TYDNE':'weekly issuance', \
               'POPLATEK PO OBRATU':'issuance per transaction'}
account_df.frequency = account_df.frequency.replace(freq_mapping).fillna('unspecified')

account_df.frequency.unique()

array(['monthly issuance', 'issuance per transaction', 'weekly issuance'],
      dtype=object)

In [228]:
# функция проверки датафрейма после изменений
# df_name название датафрейма

def check_df_integrity(initial_df_shape: tuple[int, int], current_df: pd.DataFrame, df_name: str, axis: int | None = None):

    if not isinstance(df_name, str):
        raise TypeError(f'df_name must be a string (name of a DataFrame), but got {type(df_name)}')
        
    current_shape = current_df.shape
    axis_names = {0:'rows', 1:'columns'}
    
    
    if axis != None:
        initial_ax = initial_df_shape[axis]
        current_ax = current_shape[axis]
        other_axis = 1 - axis
        
    
        if initial_ax != current_ax:
            raise ValueError(f'''Something is wrong with {axis_names[axis]}!
            initial {axis_names[axis]} shape is: {initial_ax}
            current {axis_names[axis]} shape is: {current_ax}''')
            
        else:
            initial_other = initial_df_shape[other_axis]
            current_other = current_shape[other_axis]
            
            if current_other != initial_other:
                print(f'''{axis_names[axis]} count is equal.
But changes were made to {axis_names[other_axis]}
initial {axis_names[other_axis]} shape is: {initial_df_shape[other_axis]}
current {axis_names[other_axis]} shape is: {current_shape[other_axis]}''')
            else:
                print(f"{df_name} is intact!")

    else:
        if initial_df_shape != current_shape:
           raise ValueError(f'''Something is wrong!
           initial shape is: {initial_df_shape}
           current shape is: {current_shape}''')
        else:    
            print(f"{df_name} is intact!")

In [229]:
# проверка размеров датафрейма после изменений
check_df_integrity(account_initial_shape, account_df, 'account_df')

account_df is intact!


In [230]:
account_df.head(3)

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,monthly issuance,1993-01-01
1,3818,74,monthly issuance,1993-01-01
2,704,55,monthly issuance,1993-01-01


In [231]:
# функция сохранения датафрейма в csv

def df_to_csv(df: pd.DataFrame, df_name: str, directory: str = 'data/cleaned_data/', suffix: str = 'clean'):

    # проверка что аргумент df_name это строка
    if not isinstance(df_name, str):
        raise TypeError(f"df_name must be a string (name of the DataFrame), got {type(df_name)}")

    # Создать директорию если она не существует
    os.makedirs(directory, exist_ok=True)

    # название файла
    if suffix:
        filename = f"{df_name}_{suffix}.csv"
    else:
        filename = f"{df_name}.csv"
    # путь к файлу: директория + название файла
    filepath = os.path.join(directory, filename)

    # сохраняем датафрейм в csv, индекс не сохраняем
    df.to_csv(filepath, index=False)
    print(f"Saved cleaned DataFrame to {filepath}")

In [232]:
df_to_csv(account_df, 'account_df')

Saved cleaned DataFrame to data/cleaned_data/account_df_clean.csv


# Таблица client
| item         | meaning                  | remark                                                                                                                                   |
| ------------ | ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
| client_id    | record identifier        |                                                                                                                                          |
| birth number | identification of client | the number is in the form YYMMDD for men,  <br>the number is in the form YYMM+50DD for women,  <br><br>where YYMMDD is the date of birth |
| district_id  | address of the client    |                                                                                                                                          |

***YYMM+50DD for women** значит что для мужчины например 1972-04-01 будет 720401, а для женщины 725401. Нужно отфильтровать где средние два числа больше 12 и на этом условии создать колонку с полом; а также заменить две средние цифры для женщин - вычесть из них 50 или 5000 из всего числа


In [233]:
client_df = pd.read_csv(r'data/raw_data/client.asc', sep=';')

In [234]:
client_df.head(3)

Unnamed: 0,client_id,birth_number,district_id
0,1,706213,18
1,2,450204,1
2,3,406009,1


In [235]:
client_df.dtypes

client_id       int64
birth_number    int64
district_id     int64
dtype: object

In [236]:
# проверка на NaN в любой колонке
check_for_nans(client_df)

No missing values


In [237]:
client_initial_shape = client_df.shape
client_initial_shape

(5369, 3)

In [238]:
# функция определения пола по дате и возврата обычной даты для женщин, но всё еще в виде целого числа

def check_birth_num(birth_num):
    if birth_num // 100 % 100 > 50:
        return birth_num - 5000, 'female'
    elif birth_num // 100 % 100 <= 12:
        return birth_num, 'male'
    else:
        return 'invalid_date', 'unknown'

In [239]:
client_df['birth_date'], client_df['sex'] = zip(*client_df.birth_number.map(check_birth_num))

In [240]:
def parse_int_date(date: int):
    year = date // 10_000
    month = date % 10_000 // 100
    day = date % 10_000 % 100

    full_year = 1900 + year if year > 9 else 2000 + year
    return pd.to_datetime(f'{full_year}-{month}-{day}')
    


In [241]:
client_df['birth_date'] = client_df['birth_date'].map(parse_int_date)

In [242]:
client_df.head()

Unnamed: 0,client_id,birth_number,district_id,birth_date,sex
0,1,706213,18,1970-12-13,female
1,2,450204,1,1945-02-04,male
2,3,406009,1,1940-10-09,female
3,4,561201,5,1956-12-01,male
4,5,605703,5,1960-07-03,female


In [243]:
# Проверка что нет дат с 50+MM и полом "не женщина"
# filtered_df это датафрейм где первые две цифры birth_number отличаются если отнять 5000 и пол "female"
# то есть по ошибке мужчину отметили как женщину

filtered_df =  client_df[((client_df.birth_number - 5000) // 10000 != client_df.birth_number // 10000) \
          & (client_df.sex == 'female')]

if not filtered_df.empty:
    raise ValueError(f'''Wrong rows in client_df!
{filtered_df}''')
else:
    print('client_df is correct.')

client_df is correct.


In [244]:
client_df.dtypes

client_id                int64
birth_number             int64
district_id              int64
birth_date      datetime64[ns]
sex                     object
dtype: object

In [245]:
client_df = client_df.drop(columns='birth_number')
client_df.head(2)

Unnamed: 0,client_id,district_id,birth_date,sex
0,1,18,1970-12-13,female
1,2,1,1945-02-04,male


In [246]:
# Проверка целостности client_df по строкам после изменений
check_df_integrity(client_initial_shape, client_df, 'client_df', 0)

rows count is equal.
But changes were made to columns
initial columns shape is: 3
current columns shape is: 4


In [247]:
df_to_csv(client_df, 'client_df')

Saved cleaned DataFrame to data/cleaned_data/client_df_clean.csv


# Таблица disposition
| item       | meaning                          | remark                                                   |
| ---------- | -------------------------------- | -------------------------------------------------------- |
| disp_id    | record identifier                |                                                          |
| client_id  | identification of a client       |                                                          |
| account_id | identification of an account     |                                                          |
| type       | type of disposition (owner/user) | only owner can issue permanent orders and ask for a loan |

each record relates together a client with an account i.e. this relation describes the rights of clients to operate accounts

In [248]:
disposition_df = pd.read_csv('data/raw_data/disp.asc', sep=';')

In [249]:
disposition_df.head()

Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER
2,3,3,2,DISPONENT
3,4,4,3,OWNER
4,5,5,3,DISPONENT


In [250]:
disposition_df.dtypes

disp_id        int64
client_id      int64
account_id     int64
type          object
dtype: object

In [251]:
disposition_df.type.unique()

array(['OWNER', 'DISPONENT'], dtype=object)

In [252]:
# проверка на NaN-ы
check_for_nans(disposition_df)

No missing values


In [253]:
disposition_initial_shape = disposition_df.shape
disposition_initial_shape

(5369, 4)

In [254]:
# import warnings

# warnings.warn("Что-то подозрительное, но программа идёт дальше", UserWarning)

In [255]:
df_to_csv(disposition_df, 'disposition_df')

Saved cleaned DataFrame to data/cleaned_data/disposition_df_clean.csv


# Таблица permanent order
| item       | meaning                          | remark                                                                                                                                                |
| ---------- | -------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| order_id   | record identifier                |                                                                                                                                                       |
| account_id | account, the order is issued for |                                                                                                                                                       |
| bank_to    | bank of the recipient            | each bank has unique two-letter code                                                                                                                  |
| account_to | account of the recipient         |                                                                                                                                                       |
| amount     | debited amount                   |                                                                                                                                                       |
| K_symbol   | characterization of the payment  | "POJISTNE" stands for insurrance payment<br><br>"SIPO" stands for household<br><br>"LEASING" stands for leasing<br><br>"UVER" stands for loan payment |

- each record describes characteristics of a payment order
- SIPO is a service provided by Česká pošta (Czech Post). It allows you to combine several regular direct debits into a single payment (e.g. rent, telephone, radio)
- Czech Post offers SIPO services as an agent collecting payments from individuals on behalf of legal or other entities under a SIPO agency contract. SIPO can be used to pay for rental, electricity, gas, water, radio and television subscription fees, cable television, newspaper and magazine subscription fees, building savings, life and other insurance, and other payments or fees


In [256]:
order_df = pd.read_csv('data/raw_data/order.asc', sep=';')
order_df.head()

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
0,29401,1,YZ,87144583,2452.0,SIPO
1,29402,2,ST,89597016,3372.7,UVER
2,29403,2,QR,13943797,7266.0,SIPO
3,29404,3,WX,83084338,1135.0,SIPO
4,29405,3,CD,24485939,327.0,


In [257]:
order_initial_shape = order_df.shape
order_initial_shape

(6471, 6)

In [258]:
# Проверка на NaN
check_for_nans(order_df)

No missing values


In [259]:
order_df.k_symbol.unique()

array(['SIPO', 'UVER', ' ', 'POJISTNE', 'LEASING'], dtype=object)

In [260]:
order_df.dtypes

order_id        int64
account_id      int64
bank_to        object
account_to      int64
amount        float64
k_symbol       object
dtype: object

In [261]:
# Функция замены значений в колонке k_symbol

def replace_k_symbol(k_symbol_val):
    if k_symbol_val == 'POJISTNE':
        return 'insurance payment'
    elif k_symbol_val == 'SIPO':
        return 'household'
    elif k_symbol_val == 'LEASING':
        return 'leasing'
    elif k_symbol_val == 'UVER':
        return 'loan payment'
    else:
        return 'unclassified' # для всех остальных значений

In [262]:
order_mapping = {'POJISTNE':'insurance payment', 'SIPO':'household', 'LEASING':'leasing', 'UVER':'loan payment', '':'unclassified'}

order_df.k_symbol = order_df.k_symbol.str.strip().replace(order_mapping)
order_df.k_symbol.unique()

array(['household', 'loan payment', 'unclassified', 'insurance payment',
       'leasing'], dtype=object)

In [263]:
order_df.head()

Unnamed: 0,order_id,account_id,bank_to,account_to,amount,k_symbol
0,29401,1,YZ,87144583,2452.0,household
1,29402,2,ST,89597016,3372.7,loan payment
2,29403,2,QR,13943797,7266.0,household
3,29404,3,WX,83084338,1135.0,household
4,29405,3,CD,24485939,327.0,unclassified


In [264]:
# Проверка целостности после изменений
check_df_integrity(order_initial_shape, order_df, 'order_df')

order_df is intact!


In [265]:
df_to_csv(order_df, 'order_df')

Saved cleaned DataFrame to data/cleaned_data/order_df_clean.csv


# Таблица Transaction
| item       | meaning                             | remark                                                                                                                                                                                                                                                                                                              |
| ---------- | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| trans_id   | record identifier                   |                                                                                                                                                                                                                                                                                                                     |
| account_id | account, the transation deals with  |                                                                                                                                                                                                                                                                                                                     |
| date       | date of transaction                 | in the form YYMMDD                                                                                                                                                                                                                                                                                                  |
| type       | +/- transaction                     | "PRIJEM" stands for credit<br><br>"VYDAJ" stands for withdrawal                                                                                                                                                                                                                                                     |
| operation  | mode of transaction                 | "VYBER KARTOU" credit card withdrawal<br><br>"VKLAD" credit in cash<br><br>"PREVOD Z UCTU" collection from another bank<br><br>"VYBER" withdrawal in cash<br><br>"PREVOD NA UCET" remittance to another bank                                                                                                        |
| amount     | amount of money                     |                                                                                                                                                                                                                                                                                                                     |
| balance    | balance after transaction           |                                                                                                                                                                                                                                                                                                                     |
| k_symbol   | characterization of the transaction | "POJISTNE" stands for insurrance payment<br><br>"SLUZBY" stands for payment for statement<br><br>"UROK" stands for interest credited<br><br>"SANKC. UROK" sanction interest if negative balance<br><br>"SIPO" stands for household<br><br>"DUCHOD" stands for old-age pension<br><br>"UVER" stands for loan payment |
| bank       | bank of the partner                 | each bank has unique two-letter code                                                                                                                                                                                                                                                                                |
| account    | account of the partner              |                                                                                                                                                                                                                                                                                                                     |

In [266]:
transaction_df = pd.read_csv('data/raw_data/trans.asc', sep=';', \
                             dtype={'trans_id':'str','account_id':'str','bank':'str', 'account':'str'})

In [267]:
transaction_df.dtypes

trans_id       object
account_id     object
date            int64
type           object
operation      object
amount        float64
balance       float64
k_symbol       object
bank           object
account        object
dtype: object

In [268]:
transaction_df.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,930101,PRIJEM,VKLAD,700.0,700.0,,,
1,171812,576,930101,PRIJEM,VKLAD,900.0,900.0,,,
2,207264,704,930101,PRIJEM,VKLAD,1000.0,1000.0,,,
3,1117247,3818,930101,PRIJEM,VKLAD,600.0,600.0,,,
4,579373,1972,930102,PRIJEM,VKLAD,400.0,400.0,,,


In [269]:
transaction_df.isna().sum()

trans_id           0
account_id         0
date               0
type               0
operation     183114
amount             0
balance            0
k_symbol      481881
bank          782812
account       760931
dtype: int64

In [270]:
trans_initial_shape = transaction_df.shape

In [271]:
# переводим целые числа в date в формат даты.
transaction_df['date'] = pd.to_datetime(transaction_df['date'], format='%y%m%d')
transaction_df.head(3)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,1993-01-01,PRIJEM,VKLAD,700.0,700.0,,,
1,171812,576,1993-01-01,PRIJEM,VKLAD,900.0,900.0,,,
2,207264,704,1993-01-01,PRIJEM,VKLAD,1000.0,1000.0,,,


In [272]:
# проверка значений в колонке type
transaction_df.type.unique()

array(['PRIJEM', 'VYDAJ', 'VYBER'], dtype=object)

In [273]:
# видно что в type есть значение VYBER, которого там по аннотации к данным быть не должно. Посмотрим количество VYBER в колонке type
transaction_df[transaction_df.type == 'VYBER'].type.count()

np.int64(16666)

In [274]:
# копирую датафрейм для дальнейшего удобства при случае если надо откатить датафрейм к состоянию до изменений
transaction_df_copy = transaction_df.copy()

In [275]:
# В этой ячейке мы создаем датафрейм с транзакциями аккаунтов у которых есть type == 'VYBER'.
# Чтобы потом посмотреть где предыдущий баланс и текущий баланс на момент транзакции с type == 'VYBER' сходятся, а где нет.
# И в связи с этим поменять VYBER на VYDAJ либо на что-то типа uknown или unidentified

# фильтрация аккаунтов где тип тразакции VYBER
vyber_accounts = transaction_df_copy[transaction_df_copy.type == 'VYBER'].account_id.unique()

# Сортируем по account_id и trans_id для дальнейшего создания колонки с балансом на момент предыдущей транзакции
vyber_acc_trans = transaction_df_copy.loc[transaction_df_copy.account_id.isin(vyber_accounts)].sort_values(by=['account_id','trans_id'])

# Создаем колонку с балансом на предыдущую транзакцию
vyber_acc_shift = vyber_acc_trans.groupby('account_id', as_index=False).balance.shift(1)
vyber_acc_trans['prev_balance'] = vyber_acc_shift

# ставим prev_balance рядом с balance
vyber_acc_trans = vyber_acc_trans.loc[:,['trans_id', 'account_id', 'date', 'type', 'operation', 'amount', \
       'balance','prev_balance', 'k_symbol', 'bank', 'account']]

In [276]:
vyber_acc_trans.shape

(301956, 11)

In [277]:
vyber_acc_trans.head(3)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,prev_balance,k_symbol,bank,account
112,292053,1000,1993-01-25,PRIJEM,VKLAD,200.0,200.0,,,,
247,292055,1000,1993-02-07,PRIJEM,PREVOD Z UCTU,31348.0,31548.0,200.0,,ST,87163465.0
682,292056,1000,1993-03-07,PRIJEM,PREVOD Z UCTU,31348.0,61418.3,31548.0,,ST,87163465.0


In [278]:
# Создание серий с trans_id по критерию консистентности баланса для последующей смены либо на VYDAJ либо на unknown
# То есть если type равно VYBER и текущий баланс отличается от баланса предыдущей транзакции на amount в отрицательную сторону,
# то записываем VYDAJ вместо VYBER либо unknown если баланс непоследовательный

# создаем датафрейм где type равняется VYBER
vyber_trans = vyber_acc_trans[vyber_acc_trans.type == 'VYBER']

# создаем булеву маску где текущий баланс некосистентен с балансом предыдущей транзакции
inconsistent_vyber_mask = vyber_trans.prev_balance != vyber_trans.balance + vyber_trans.amount

# применяем булеву маску для неконсистентых записей, которые попадут в новую категорию unknown
trans_id_type_to_unknown = vyber_trans[inconsistent_vyber_mask].trans_id

# применяем булеву маску для консистентных записей, которые попадут в категорию VYDAJ
trans_id_type_to_vydaj = vyber_trans[~inconsistent_vyber_mask].trans_id

# сверяем количество записей после изменений с vyber_trans. при сложении должно быть количество равное vyber_trans
to_vydaj_and_unknown_count = trans_id_type_to_unknown.count() + trans_id_type_to_vydaj.count()
vyber_trans_count = vyber_trans.trans_id.count()

In [279]:
if to_vydaj_and_unknown_count != vyber_trans_count:
    raise ValueError(f'''Rows count is not equal after changes!
vyber_trans count is {vyber_trans_count} while to_vydaj + to_unknown count is {to_vydaj_and_unknown_count}''')
else:
    print('vyber_trans count is equal to to_vydaj + to_unknown count. Everything is OK.')

# Посмотрим сколько значений попадают под смену на VYDAJ
print(f'\nTransactions count whose type to be changed to VYDAJ: {trans_id_type_to_vydaj.count()}')

vyber_trans count is equal to to_vydaj + to_unknown count. Everything is OK.

Transactions count whose type to be changed to VYDAJ: 168


In [280]:
# Меняем VYBER в колонке type соответственно, делая булевы маски по отобранным транзакциям

transaction_df_copy.loc[transaction_df_copy.trans_id.isin(trans_id_type_to_vydaj), 'type'] = 'VYDAJ'
transaction_df_copy.loc[transaction_df_copy.trans_id.isin(trans_id_type_to_unknown), 'type'] = 'unknown'

# смотрим результат: какие теперь значения в колонке type
transaction_df_copy.type.unique()

array(['PRIJEM', 'VYDAJ', 'unknown'], dtype=object)

In [281]:
# продолжаем дальше знакомиться с данными в transactions
# посмотрим какие пары type и operation есть при условии что bank, k_symbol, account пустые

transaction_df_copy.loc[transaction_df_copy[['bank', 'k_symbol', 'account']].isna().all(axis=1), ['type','operation']].value_counts()

type     operation
VYDAJ    VYBER        247166
PRIJEM   VKLAD        156743
unknown  VYBER         16498
Name: count, dtype: int64

## Значения в type

"PRIJEM" stands for credit

"VYDAJ" stands for withdrawal

In [282]:
# Меняем значения в колонке type на англоязычные
type_mapping = {"PRIJEM":"credit", "VYDAJ":"withdrawal"}
transaction_df_copy.type = transaction_df_copy.type.replace(type_mapping)

# смотрим какие значения после замены
transaction_df_copy.type.unique()

array(['credit', 'withdrawal', 'unknown'], dtype=object)

In [283]:
transaction_df_copy.head(1)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,1993-01-01,credit,VKLAD,700.0,700.0,,,


In [284]:
# посмотрим есть ли транзакции где operation это PREVOD NA UCET(перевод в другой банк)
# или PREVOD Z UCTU (перевод из другого банка)
# и при этом пустое значение account(счет на который переводили)

from_to_another_acc_mask = transaction_df_copy.operation.isin(['PREVOD NA UCET', 'PREVOD Z UCTU'])
acc_isna_mask = transaction_df_copy['account'].isna()
transaction_df_copy.loc[(from_to_another_acc_mask) & (acc_isna_mask)]

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
304834,236564,808,1996-04-12,withdrawal,PREVOD NA UCET,4500.0,4140.6,UVER,,


"PREVOD Z UCTU" collection from another bank

"VYBER" withdrawal in cash

"PREVOD NA UCET" remittance to another bank

In [285]:
# посмотрим теперь где operation это PREVOD NA UCET(перевод в другой банк)
# или PREVOD Z UCTU (перевод из другого банка), но при этом пустое значение bank

bank_isna_mask = transaction_df_copy['bank'].isna()
transaction_df_copy.loc[(from_to_another_acc_mask) & (bank_isna_mask)]

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
304834,236564,808,1996-04-12,withdrawal,PREVOD NA UCET,4500.0,4140.6,UVER,,


In [286]:
transaction_df_copy.isna().sum()

trans_id           0
account_id         0
date               0
type               0
operation     183114
amount             0
balance            0
k_symbol      481881
bank          782812
account       760931
dtype: int64

In [287]:
# проверка комбинаций значений: k_symbol, bank, account, где в строке есть operation == NaN

k_sym_bank_or_acc_notna = transaction_df_copy[['k_symbol','bank','account']].notna().any(axis=1)
op_isna_mask = transaction_df_copy['operation'].isna()
transaction_df_copy.loc[(k_sym_bank_or_acc_notna) & (op_isna_mask),['operation','k_symbol','bank','account']].drop_duplicates()

Unnamed: 0,operation,k_symbol,bank,account
137,,UROK,,


## Значения в operation

"VYBER KARTOU" credit card withdrawal

"VKLAD" credit in cash

"PREVOD Z UCTU" collection from another bank

"VYBER" withdrawal in cash

"PREVOD NA UCET" remittance to another bank

In [288]:
# Смена значений колонки operation на англоязычные

operation_mapping = {"VYBER KARTOU":"credit card withdrawal", "VKLAD":"credit in cash", \
                    "PREVOD Z UCTU":"collection from another bank", "VYBER":"withdrawal in cash", \
                    "PREVOD NA UCET":"remittance to another bank"}
transaction_df_copy.operation = transaction_df_copy.operation.replace(operation_mapping).fillna('unspecified')

In [289]:
transaction_df_copy.operation.unique()

array(['credit in cash', 'collection from another bank',
       'withdrawal in cash', 'unspecified', 'remittance to another bank',
       'credit card withdrawal'], dtype=object)

## Значения в k_symbol
"POJISTNE" stands for insurrance payment

"SLUZBY" stands for payment for statement

"UROK" stands for interest credited

"SANKC. UROK" sanction interest if negative balance

"SIPO" stands for household

"DUCHOD" stands for old-age pension

"UVER" stands for loan payment

In [290]:
# смотрим что есть в k_symbol
transaction_df_copy.k_symbol.str.strip().unique()

array([nan, 'DUCHOD', 'UROK', 'SIPO', 'SLUZBY', '', 'POJISTNE',
       'SANKC. UROK', 'UVER'], dtype=object)

In [291]:
# меняем значения в k_symbol на анлийский. Пустые строки обозначаем 'unspecified'. NaN-ы не трогаем

k_sym_mapping = {"POJISTNE":"insurance payment", "SLUZBY":"payment for statement", "UROK":"interest credited", \
                "SANKC. UROK": "negative balance sanction interest", "SIPO":"household", \
                 "DUCHOD":"old-age pension", "UVER":"loan payment", "":"unspecified"}

transaction_df_copy.k_symbol = transaction_df_copy.k_symbol.str.strip().replace(k_sym_mapping)
transaction_df_copy.k_symbol.unique()

array([nan, 'old-age pension', 'interest credited', 'household',
       'payment for statement', 'unspecified', 'insurance payment',
       'negative balance sanction interest', 'loan payment'], dtype=object)

In [292]:
# проверка значений в bank

transaction_df_copy.bank.unique()

array([nan, 'YZ', 'IJ', 'ST', 'UV', 'MN', 'OP', 'AB', 'CD', 'WX', 'GH',
       'EF', 'QR', 'KL'], dtype=object)

In [293]:
# проверяем датафрейм на целостность после изменений. Должны совпадать и строки и колонки

check_df_integrity(trans_initial_shape, transaction_df_copy, 'transaction_df_copy')

transaction_df_copy is intact!


In [294]:
transaction_df_copy.head(3)

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,1993-01-01,credit,credit in cash,700.0,700.0,,,
1,171812,576,1993-01-01,credit,credit in cash,900.0,900.0,,,
2,207264,704,1993-01-01,credit,credit in cash,1000.0,1000.0,,,


In [295]:
# Сохраняем очищенные данные в csv

df_to_csv(transaction_df_copy, 'transaction_df')

Saved cleaned DataFrame to data/cleaned_data/transaction_df_clean.csv


# Таблица Loan
| item       | meaning                        | remark                                                                                                                                                                                                            |
| ---------- | ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| loan_id    | record identifier              |                                                                                                                                                                                                                   |
| account_id | identification of the account  |                                                                                                                                                                                                                   |
| date       | date when the loan was granted | in the form YYMMDD                                                                                                                                                                                                |
| amount     | amount of money                |                                                                                                                                                                                                                   |
| duration   | duration of the loan           |                                                                                                                                                                                                                   |
| payments   | monthly payments               |                                                                                                                                                                                                                   |
| status     | status of paying off the loan  | 'A' stands for contract finished, no problems,<br><br>'B' stands for contract finished, loan not paid,<br><br>'C' stands for running contract, OK so far,<br><br>'D' stands for running contract, client in debt |

In [296]:
loan_df = pd.read_csv('./data/raw_data/loan.asc', sep=';', dtype={'loan_id':'str', 'account_id':'str', 'amount':'float64'})

In [297]:
loan_initial_shape = loan_df.shape
loan_initial_shape

(682, 7)

In [298]:
loan_df.date = pd.to_datetime(loan_df.date, format='%y%m%d')

In [299]:
loan_df.date.dt.year.unique()

array([1993, 1994, 1995, 1996, 1997, 1998], dtype=int32)

In [300]:
loan_df.dtypes

loan_id               object
account_id            object
date          datetime64[ns]
amount               float64
duration               int64
payments             float64
status                object
dtype: object

In [301]:
loan_df.isna().sum()

loan_id       0
account_id    0
date          0
amount        0
duration      0
payments      0
status        0
dtype: int64

In [302]:
loan_df.head()

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,1993-07-05,96396.0,12,8033.0,B
1,5316,1801,1993-07-11,165960.0,36,4610.0,A
2,6863,9188,1993-07-28,127080.0,60,2118.0,A
3,5325,1843,1993-08-03,105804.0,36,2939.0,A
4,7240,11013,1993-09-06,274740.0,60,4579.0,A


'A' stands for contract finished, no problems,

'B' stands for contract finished, loan not paid,

'C' stands for running contract, OK so far,

'D' stands for running contract, client in debt

In [303]:
# меняем буквенные обозначения в status на подробные
status_mapping = {'A':'contract finished, no problems', 'B':'contract finished, loan not paid', \
                 'C': 'running contract, OK so far', 'D': 'running contract, client in debt'}

loan_df.status = loan_df.status.replace(status_mapping)
loan_df.status.unique()

array(['contract finished, loan not paid',
       'contract finished, no problems', 'running contract, OK so far',
       'running contract, client in debt'], dtype=object)

In [304]:
loan_df.describe()

Unnamed: 0,date,amount,duration,payments
count,682,682.0,682.0,682.0
mean,1996-09-29 05:35:43.108504448,151410.175953,36.492669,4190.664223
min,1993-07-05 00:00:00,4980.0,12.0,304.0
25%,1995-07-04 12:00:00,66732.0,24.0,2477.0
50%,1997-02-06 12:00:00,116928.0,36.0,3934.0
75%,1997-12-12 12:00:00,210654.0,48.0,5813.5
max,1998-12-08 00:00:00,590820.0,60.0,9910.0
std,,113372.40631,17.075219,2215.830344


In [305]:
# целостность датафрейма после изменений

check_df_integrity(loan_initial_shape, loan_df, 'loan_df')

loan_df is intact!


In [306]:
# Сохраняем очищенные данные в csv

df_to_csv(loan_df, 'loan_df')

Saved cleaned DataFrame to data/cleaned_data/loan_df_clean.csv


# Таблица Credit Card

| item    | meaning                   | remark                                          |
| ------- | ------------------------- | ----------------------------------------------- |
| card_id | record identifier         |                                                 |
| disp_id | disposition to an account |                                                 |
| type    | type of card              | possible values are "junior", "classic", "gold" |
| issued  | issue date                | in the form YYMMDD                              |

In [307]:
card_df = pd.read_csv('./data/raw_data/card.asc', sep=';', dtype={'card_id':'str', 'disp_id':'str'})

In [308]:
card_initial_shape = card_df.shape
card_initial_shape

(892, 4)

In [309]:
card_df.dtypes

card_id    object
disp_id    object
type       object
issued     object
dtype: object

In [310]:
card_df.isna().sum()

card_id    0
disp_id    0
type       0
issued     0
dtype: int64

In [311]:
card_df.head()

Unnamed: 0,card_id,disp_id,type,issued
0,1005,9285,classic,931107 00:00:00
1,104,588,classic,940119 00:00:00
2,747,4915,classic,940205 00:00:00
3,70,439,classic,940208 00:00:00
4,577,3687,classic,940215 00:00:00


In [312]:
card_df.issued = pd.to_datetime(card_df.issued, format='%y%m%d %H:%M:%S')
card_df.issued.dt.year.unique()

array([1993, 1994, 1995, 1996, 1997, 1998], dtype=int32)

In [313]:
card_df.head(3)

Unnamed: 0,card_id,disp_id,type,issued
0,1005,9285,classic,1993-11-07
1,104,588,classic,1994-01-19
2,747,4915,classic,1994-02-05


In [314]:
# проверка на целостность

check_df_integrity(card_initial_shape, card_df, 'card_df')

card_df is intact!


In [315]:
# Сохраняем очищенные данные в csv

df_to_csv(card_df, 'card_df')

Saved cleaned DataFrame to data/cleaned_data/card_df_clean.csv


# Таблица Demographic data (District)
| item             | meaning                                          | remark |
| ---------------- | ------------------------------------------------ | ------ |
| A1 = district_id | district code                                    |        |
| A2               | district name                                    |        |
| A3               | region                                           |        |
| A4               | no. of inhabitants                               |        |
| A5               | no. of municipalities with inhabitants < 499     |        |
| A6               | no. of municipalities with inhabitants 500-1999  |        |
| A7               | no. of municipalities with inhabitants 2000-9999 |        |
| A8               | no. of municipalities with inhabitants >10000    |        |
| A9               | no. of cities                                    |        |
| A10              | ratio of urban inhabitants                       |        |
| A11              | average salary                                   |        |
| A12              | unemploymant rate '95                            |        |
| A13              | unemploymant rate '96                            |        |
| A14              | no. of enterpreneurs per 1000 inhabitants        |        |
| A15              | no. of commited crimes '95                       |        |
| A16              | no. of commited crimes '96                       |        |

In [316]:
district_df = pd.read_csv("./data/raw_data/district.asc", sep=';')

In [317]:
district_initial_shape = district_df.shape
district_initial_shape

(77, 16)

In [318]:
district_df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040


In [319]:
# строка со значениями колонок по порядку

district_col_raw = ''' district code                                    |
| district name                                    |
| region                                           |
| population                               |
| no_of_mun_below_500     |
| no of mun_between_500_1999  |
| no of mun_between_2000_9999 |
| no of mun_above_10000    |
| no. of cities                                    |
| ratio of urban population                       |
| avg_salary                                   |
| unemployment rate '95                            |
| unemployment rate '96                            |
| enterpreneurs per_1000      |
| crimes num '95                       |
| crimes num '96                       '''

In [320]:
# разбиваем строку по символу |
district_col_split = district_col_raw.split('|')

In [321]:
district_col_split

[' district code                                    ',
 '\n',
 ' district name                                    ',
 '\n',
 ' region                                           ',
 '\n',
 ' population                               ',
 '\n',
 ' no_of_mun_below_500     ',
 '\n',
 ' no of mun_between_500_1999  ',
 '\n',
 ' no of mun_between_2000_9999 ',
 '\n',
 ' no of mun_above_10000    ',
 '\n',
 ' no. of cities                                    ',
 '\n',
 ' ratio of urban population                       ',
 '\n',
 ' avg_salary                                   ',
 '\n',
 " unemployment rate '95                            ",
 '\n',
 " unemployment rate '96                            ",
 '\n',
 ' enterpreneurs per_1000      ',
 '\n',
 " crimes num '95                       ",
 '\n',
 " crimes num '96                       "]

In [322]:
# убираем пробелы с начала и конца каждой строки
# меняем пробелы на _
# удаляем апострофы
# удаляем точки

dist_col_series = pd.Series(district_col_split).str.strip() \
.str.replace(' ','_') \
.str.replace('\'', '') \
.str.replace('.', '')

# Фильтруем серию, чтобы избавиться от пустых строк. Сбрасываем индекс
dist_col_clean = dist_col_series[dist_col_series != ''].reset_index(drop=True)
dist_col_clean

0                   district_code
1                   district_name
2                          region
3                      population
4             no_of_mun_below_500
5      no_of_mun_between_500_1999
6     no_of_mun_between_2000_9999
7           no_of_mun_above_10000
8                    no_of_cities
9       ratio_of_urban_population
10                     avg_salary
11           unemployment_rate_95
12           unemployment_rate_96
13         enterpreneurs_per_1000
14                  crimes_num_95
15                  crimes_num_96
dtype: object

In [323]:
# назначаем district_df очищенные названия колонок

district_df.columns = dist_col_clean
district_df.head()

Unnamed: 0,district_code,district_name,region,population,no_of_mun_below_500,no_of_mun_between_500_1999,no_of_mun_between_2000_9999,no_of_mun_above_10000,no_of_cities,ratio_of_urban_population,avg_salary,unemployment_rate_95,unemployment_rate_96,enterpreneurs_per_1000,crimes_num_95,crimes_num_96
0,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.29,0.43,167,85677,99107
1,2,Benesov,central Bohemia,88884,80,26,6,2,5,46.7,8507,1.67,1.85,132,2159,2674
2,3,Beroun,central Bohemia,75232,55,26,4,1,5,41.7,8980,1.95,2.21,111,2824,2813
3,4,Kladno,central Bohemia,149893,63,29,6,2,6,67.4,9753,4.64,5.05,109,5244,5892
4,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.85,4.43,118,2616,3040


In [324]:
district_df.dtypes

district_code                    int64
district_name                   object
region                          object
population                       int64
no_of_mun_below_500              int64
no_of_mun_between_500_1999       int64
no_of_mun_between_2000_9999      int64
no_of_mun_above_10000            int64
no_of_cities                     int64
ratio_of_urban_population      float64
avg_salary                       int64
unemployment_rate_95            object
unemployment_rate_96           float64
enterpreneurs_per_1000           int64
crimes_num_95                   object
crimes_num_96                    int64
dtype: object

In [325]:
# Замена значений '?' в unemployment_rate_95 на NaN, и приведение к float64

unemp95_mapping = {'?':np.nan}
district_df.unemployment_rate_95 = district_df.unemployment_rate_95.replace(unemp95_mapping).astype('float64')
district_df.unemployment_rate_95.unique()

array([0.29, 1.67, 1.95, 4.64, 3.85, 2.95, 2.26, 1.25, 3.39, 0.56, 0.45,
       3.83, 2.77, 1.42, 3.13, 1.12, 2.38, 2.83, 2.65, 1.51, 1.1 , 1.79,
       1.39, 2.47, 2.64, 0.65, 1.62, 2.82, 3.38, 3.52, 2.8 , 5.75, 6.43,
       1.02, 3.33, 4.46, 7.08, 7.34, 6.49, 3.32, 2.41, 1.72, 2.79, 2.28,
       1.78, 1.89, 4.83, 2.51, 2.52, 2.53, 1.6 , 1.88, 4.69, 3.73, 3.24,
       3.45, 4.76, 1.29, 3.79, 5.74, 3.51, 5.77, 4.09,  nan, 6.63, 5.93,
       3.8 , 4.75, 5.38, 4.73, 4.01])

In [326]:
# Замена значений '?' в crimes_num_95 на NaN, и приведение к float64

crime95_mapping = {'?':pd.NA}
district_df.crimes_num_95 = district_df.crimes_num_95.replace(crime95_mapping).astype('Int64')
district_df.crimes_num_95.unique()

<IntegerArray>
[85677,  2159,  2824,  5244,  2616,  2640,  4289,  5179,  2987,  3810,  3475,
  3804,  1597,  6604,  1845,  1874,  1003,  1740,   999,  1563,  2299,  1089,
  2879,  5198,  1822,  6041,  1029,  1580,   818,  2985,  1328,  4340,  4650,
  5323,  3384,  5796,  4147,  2653,  4947,  6949,  6445,  1658,  4085,  2166,
  2080,  2854,  6079,  1655,  1660,  2123,  3496,  2564,  1850, 18721,  3659,
  3729,  2212,  2595,  1879,  2112,  2719,  1562,  4484,  2157,  2247,  3244,
  5623,  <NA>,  9878,  4980,  9672,  4355, 18782,  4063,  3736,  3460]
Length: 76, dtype: Int64

In [327]:
district_df.isna().sum()

district_code                  0
district_name                  0
region                         0
population                     0
no_of_mun_below_500            0
no_of_mun_between_500_1999     0
no_of_mun_between_2000_9999    0
no_of_mun_above_10000          0
no_of_cities                   0
ratio_of_urban_population      0
avg_salary                     0
unemployment_rate_95           1
unemployment_rate_96           0
enterpreneurs_per_1000         0
crimes_num_95                  1
crimes_num_96                  0
dtype: int64

In [328]:
# проверка целостности

check_df_integrity(district_initial_shape, district_df, 'district_df')

district_df is intact!


In [329]:
# Сохраняем очищенные данные в csv

df_to_csv(district_df, 'district_df')

Saved cleaned DataFrame to data/cleaned_data/district_df_clean.csv


In [333]:
# В конце сверимся с количеством оригинальных файлов с данными и файлов с очищенными данными

raw_count = len(os.listdir('data/raw_data'))
cleaned_count = len(os.listdir('data/cleaned_data'))

if raw_count != cleaned_count:
    raise ValueError(f'''Cleaned files count differs from raw files count!
Raw files: {raw_count}
Cleaned files: {cleaned_count}
''')

else:
    print('Raw and cleaned files count is equal.')

Raw and cleaned files count is equal.
