In [1]:
import re
import json
import logging
import traceback
import numpy as np
import pandas as pd

import orjson

#import dask.dataframe as dd
#from dask.distributed import Client

In [2]:
# Route log records to a file. filemode='w' truncates the file each time this
# cell is executed.
# NOTE(review): logging does not create missing directories, so '../logs'
# presumably has to exist before this cell runs — confirm.
logging.basicConfig(
    filename='../logs/app.log',
    filemode='w',
    format='%(asctime)s - %(levelname)s - %(message)s')

logger = logging.getLogger('data_analysis')
# The helper functions in this notebook only emit DEBUG records. Without an
# explicit level the logger inherits the root default (WARNING) and every one
# of those records is silently discarded, leaving the log file empty.
logger.setLevel(logging.DEBUG)

### Load

In [2]:
# Raw order table: ';'-separated CSV; the two created_at_* columns are parsed
# as datetimes at load time.
path = '../data/order_table_202208081749.csv'
df = pd.read_csv(path, sep=';', parse_dates=['created_at_irpf', 'created_at_loan'])

In [3]:
# Bank dictionary (SPSS .sav file). get_bank_code / get_year_data read the
# 'BankName' and 'Codigo_Banco' columns from this frame.
path = '../data/Dicionario_Grafia_Banco_SRF-v14.sav'
bank_df = pd.read_spss(path)

In [4]:
# Branch dictionary (SPSS .sav file); the uniqueness check further down groups
# it by its 'Bank' and 'Branch' columns.
path = '../data/Dicionario_de_Agencias_e_Postos_Bancarios_com_Apelidos_v28.sav'
branch_df = pd.read_spss(path)

In [5]:
# Preview the raw order table; the 'value' column holds the JSON payload as text.
df.head()

Unnamed: 0,person_id,loan_id,irpf_id,created_at_irpf,created_at_loan,safra_created,product_code,state,rev,value,ordem
0,1792740,12811556.0,32876504,2022-03-22,2022-03-22,202203,PERSONAL,3.0,9,"{""rev"":9,""objType"":""IrpfPersonInfoT"",""personId...",1
1,15057615,,33004278,2022-05-13,NaT,202205,,,10,"{""rev"":10,""objType"":""IrpfPersonInfoT"",""personI...",1
2,14936007,12813121.0,32879551,2022-03-24,2022-03-24,202203,PERSONAL,2.0,10,"{""rev"":10,""objType"":""IrpfPersonInfoT"",""personI...",1
3,2667826,12812575.0,32878530,2022-03-23,2022-03-23,202203,PERSONAL,3.0,6,"{""rev"":6,""objType"":""IrpfPersonInfoT"",""personId...",1
4,15273265,12841289.0,32958660,2022-04-30,2022-04-30,202204,PERSONAL,5.0,8,"{""rev"":8,""objType"":""IrpfPersonInfoT"",""personId...",1


In [7]:
# Count non-null BankName values per (Bank, Branch) pair, largest count first.
# The displayed output tops out at 1, i.e. no pair is repeated in the dictionary.
branch_df.groupby(['Bank', 'Branch'])['BankName'].count().rename('count').reset_index().sort_values(by='count', ascending=False)

Unnamed: 0,Bank,Branch,count
0,001,0000,1
26930,275,0704,1
26923,275,0680,1
26924,275,0681,1
26925,275,0682,1
...,...,...,...
13463,041,0897,1
13464,041,0898,1
13465,041,0900,1
13466,041,0901,1


### Transformation

In [8]:
# Branch: keep only the first 4 characters of the branch string,
# then left-pad it with zeros so it is always 4 characters long.

# Bank: convert the bank name (string) into the bank code (string).

def get_json_value(data, key='riskInfo'):
    '''Parse a JSON document and return the value stored under `key`.

    Parameters
    ----------
    data : str | bytes | bytearray
        Raw JSON text. Anything that cannot be parsed (invalid JSON, None,
        NaN, ...) is treated as an empty document.
    key : str
        Top-level key to read; defaults to 'riskInfo'.

    Returns
    -------
    The value stored under `key`, or an empty dict when the payload cannot
    be parsed, is not a JSON object, or does not contain `key`.

    NOTE(review): a later cell defines another `get_json_value(df, col)`
    with a different signature; whichever cell runs last wins.
    '''
    try:
        parsed = json.loads(data)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (e.g. None or NaN).
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        return {}

    # Valid JSON may still be a list/number/string, which has no `.get`.
    if not isinstance(parsed, dict):
        return {}

    return parsed.get(key, {})


def get_bank_code(bank_name, df_bank: pd.DataFrame):
    '''Translate a bank name into its bank code.

    Parameters
    ----------
    bank_name : str
        Bank name; it is upper-cased before being compared with the
        'BankName' column.
    df_bank : pd.DataFrame
        Lookup table with 'BankName' and 'Codigo_Banco' columns.

    Returns
    -------
    The 'Codigo_Banco' value of the single matching row, or the sentinel
    '###' when the name is not a string, has no match, or matches more
    than one row.
    '''
    # None / NaN / numbers have no .upper(); treat them as "unknown bank".
    if not isinstance(bank_name, str):
        return '###'

    matches = df_bank.loc[
        df_bank['BankName'] == bank_name.upper(),
        'Codigo_Banco']

    try:
        # .item() raises ValueError unless exactly one row matched.
        return matches.item()
    except ValueError:
        return '###'

def tax_to_pay(text):
    '''Classify a status message into one of four labels.

    The text is lower-cased and searched for known phrases, in order;
    the label of the first phrase found is returned:
    'no_balance', 'tax_to_pay' or 'refund'. When no phrase is found the
    result is 'error'.
    '''
    lowered = text.lower()

    # Ordered (phrase, label) pairs — the first hit wins.
    rules = (
        ('resultado encontrado: saldo inexistente', 'no_balance'),
        ('resultado encontrado: imposto a pagar', 'tax_to_pay'),
        ('situação da restituição', 'refund'),
        ('em fila de restituição', 'refund'),
    )

    for phrase, label in rules:
        if re.search(phrase, lowered):
            return label

    return 'error'


def get_full_text(risk_dict):
    '''Collect the full status text of every declared year.

    Parameters
    ----------
    risk_dict : dict
        Mapping of year -> per-year dict.

    Returns
    -------
    dict
        year -> 'full_status_text' (empty string when absent) for each
        entry whose 'is_declaration' flag is truthy. Entries without the
        flag are skipped, the same way get_full_text_series and
        get_year_data treat them.
    '''
    return {
        year: data.get('full_status_text', '')
        for year, data in risk_dict.items()
        # .get avoids a KeyError on entries that lack the flag.
        if data.get('is_declaration', False)
    }

def get_full_text_series(risk_dict):
    '''Flatten the declared years of `risk_dict` into four parallel lists.

    Parameters
    ----------
    risk_dict : dict
        Mapping of year -> per-year dict.

    Returns
    -------
    tuple of four lists, one element per entry whose 'is_declaration'
    flag is truthy:
        person_lst : 'cpf' values ('' when absent)
        year_lst   : the year keys
        text_lst   : 'full_status_text' values ('' when absent)
        state_lst  : tax_to_pay() label for each text
    '''
    person_lst = []
    year_lst = []
    text_lst = []
    state_lst = []

    for year, data in risk_dict.items():
        if not data.get('is_declaration', False):
            continue

        txt = data.get('full_status_text', '')

        # Append (not assign): the list previously got overwritten by a
        # bare string, so it was no longer parallel to the other three.
        person_lst.append(data.get('cpf', ''))
        year_lst.append(year)
        text_lst.append(txt)
        state_lst.append(tax_to_pay(txt.lower()))

    return person_lst, year_lst, text_lst, state_lst

def get_year_data(risk_dict):
    '''Build a per-year summary for every declared year in `risk_dict`.

    For each entry whose 'is_declaration' flag is truthy the result holds:
        'cpf'    : str() of the entry's 'cpf' ('' when absent)
        'bank'   : upper-cased str() of get_bank_code(...) looked up in the
                   notebook-level `bank_df` — only when the entry has a
                   'bank' key
        'branch' : str() of 'branch', cut to its first 4 characters and
                   then left-padded with zeros to 4 — only when the entry
                   has a 'bank' key
        'state'  : tax_to_pay() label of 'full_status_text'

    Returns a dict mapping year -> that summary dict.
    '''
    summary_by_year = {}

    for year, data in risk_dict.items():
        if not data.get('is_declaration', False):
            continue

        # Keys are inserted in the order cpf, [bank, branch,] state.
        entry = {'cpf': str(data.get('cpf', ''))}

        if 'bank' in data:
            bank_name = str(data.get('bank', ''))
            entry['bank'] = str(get_bank_code(bank_name, bank_df)).upper()

            branch_text = str(data.get('branch', ''))
            entry['branch'] = branch_text[:4].zfill(4)

        entry['state'] = tax_to_pay(data.get('full_status_text', ''))

        summary_by_year[year] = entry

    return summary_by_year

def set_irpf_columns(s):
    '''Turn one raw JSON payload into a frame with one row per declared year.

    Parameters
    ----------
    s : str
        JSON text that carries a top-level 'riskInfo' object.

    Returns
    -------
    pd.DataFrame
        Columns 'cpf', 'year', 'text', 'state', filled from
        get_full_text_series(). The frame has no rows when the payload
        cannot be parsed or has no 'riskInfo' object.
    '''
    # The JSON is parsed here rather than through get_json_value: a later
    # cell redefines that name with a (df, col) signature, so the
    # one-argument call would fail once that cell has run.
    try:
        payload = json.loads(s)
    except (TypeError, ValueError):
        payload = {}

    risk_info = payload.get('riskInfo', {}) if isinstance(payload, dict) else {}
    if not isinstance(risk_info, dict):
        risk_info = {}

    cpf, year, text, state = get_full_text_series(risk_info)

    # Build the frame in one step instead of assigning columns one by one
    # to an empty DataFrame.
    return pd.DataFrame(
        {'cpf': cpf, 'year': year, 'text': text, 'state': state},
        columns=['cpf', 'year', 'text', 'state'])

In [5]:
### general json and dict functions

def get_json_value(df: pd.DataFrame, col: str):
    '''Expand a JSON text column into top-level columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied, never modified.
    col : str
        Name of the column holding JSON text.

    Returns
    -------
    pd.DataFrame
        A copy of `df` joined with one new column per top-level JSON key
        (max_level=0, so nested objects stay as dicts). Keys whose name
        already exists as a column in `df` are not added. When a KeyError
        is raised while reading or normalizing `col` (e.g. the column does
        not exist) the unchanged copy is returned.

    NOTE(review): this definition shadows the earlier one-argument
    `get_json_value(data, key)` defined in the transformation cell.
    '''
    try:
        df = df.copy()
    except Exception as e:
        logger.debug(str(e))
        raise

    try:
        data = pd.json_normalize(
            df[col].apply(
                orjson.loads), max_level=0)
    except KeyError as e:
        logger.debug(str(e))
        return df

    # json_normalize may hand back a fresh 0..n-1 index; rows are in the same
    # order as df, so re-use df's index to keep the index-based join aligned
    # when df was filtered or re-indexed beforehand.
    data.index = df.index

    col_lst = data.columns.difference(df.columns)
    return df.join(data[col_lst])

def extract_value_dict(data: dict, key: str, default=np.nan):
    '''Look up `key` in `data`.

    Returns the stored value, or `default` (nan unless overridden) when
    the key is missing. If `data` has no `.get` method the AttributeError
    is logged at debug level and re-raised.'''

    try:
        return data.get(key, default)
    except AttributeError as err:
        logger.debug(str(err))
        raise err

def map_normalize_dict(df: pd.DataFrame, col: str, map:dict):
    '''Pull selected keys out of a column of dicts into their own columns.

    `map` pairs each new column name (key) with the dict key to read
    (value). For every pair, the value is extracted from each dict in
    `col` via extract_value_dict. Works on a copy of `df` and returns that
    copy after a final fillna(np.nan).'''

    result = df.copy()

    for target_column, source_key in map.items():
        # Bind the current key as a default so each pass reads its own key.
        def pick(entry, wanted=source_key):
            return extract_value_dict(data=entry, key=wanted)

        result.loc[:, target_column] = result[col].apply(pick)

    return result.fillna(value=np.nan)

In [6]:
### text functions

def apply_regex_series(series: pd.Series, regex: re.Pattern, handle_nan=True):
    '''Flag which entries of a text Series match a regex.

    Parameters
    ----------
    series : pd.Series
        Series of strings (may contain missing values).
    regex : re.Pattern
        Compiled pattern, searched anywhere in each string.
    handle_nan : bool
        True (default): missing values are reported as 0 (no match).
        False: missing values are reported as 1, which is what the
        previous implicit NaN handling produced for object-dtype input.

    Returns
    -------
    np.ndarray
        Integer array with 1 for a match and 0 otherwise.
    '''
    # Give str.contains an explicit fill value for missing entries instead
    # of relying on NaN truthiness inside np.where; the result is then a
    # plain boolean mask regardless of the string dtype in use.
    matches = series.str.contains(regex, na=not handle_nan)

    return np.where(matches, 1, 0)

In [40]:
### irpf functions

def explode_dict_col(df: pd.DataFrame, dict_col: str = 'riskInfo') -> pd.DataFrame:
    '''Expand a column of dicts into one row per dict entry.

    Works on a copy of `df`. A new column 'tax_report_data' is filled with
    the `.values()` view of each dict in `dict_col`, then every column of
    the frame is passed through `pd.Series.explode` and the index is reset.

    In the notebook's saved output the resulting `dict_col` column holds
    the dict keys (the tax report years) and 'tax_report_data' holds the
    matching per-year dicts.

    NOTE(review): every entry of `dict_col` must expose `.values()`; a
    missing value (NaN/None) would raise AttributeError here — confirm the
    column is always populated.
    '''

    df = df.copy()

    # Store the dict values alongside the dict itself so both columns can be
    # exploded together below.
    df.loc[:, 'tax_report_data'] = (
        df[dict_col].apply(
            lambda x: x.values()))

    # Explode is applied to *all* columns, not only the two dict-derived ones.
    df = df.apply(pd.Series.explode).reset_index(drop=True).copy()

    return df

def get_irpf_status(df: pd.DataFrame, text_col: str):
    '''Derive IRPF status flags from a status-text column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied, never modified.
    text_col : str
        Name of the column holding the status text.

    Returns
    -------
    pd.DataFrame
        Copy of `df` with four integer (0/1) columns added:
        'irpf_extraction_error', 'irpf_not_declared' and 'irpf_tax_refund'
        come from one regex each (case-insensitive), evaluated through
        apply_regex_series; for 'irpf_extraction_error' it is called with
        handle_nan=False. 'irpf_tax_to_pay' is 1 exactly when none of the
        other three flags is 1.
    '''

    df = df.copy()

    regex_not_consulted = re.compile(
        r'(?:^\s*$|\bdata\sde\snascimento\sinformada\b'
        r'.*\bestá\sdive|\bnão\scoletado'
        r'|\bocorreu\suma\sinconsistência\s?[.])'
        , re.IGNORECASE)

    regex_not_declared = re.compile(
        r'(?:\bconsta\sapresentação\sde\sdeclaração\sanual'
        r'\sde\sisento\b|\bapresentação\sda\sdeclaração\s'
        r'como\sisento\b|\bdeclaração\sconsta\scomo\sisento\b'
        r'|\bdeclaração\sconsta\scomo\spedido\sde'
        r'\sregularização\b|\bsua\sdeclaração\snão\sconsta'
        r'\sna\sbase\sde\sdados\b|\bainda\snão\sestá\sna'
        r'\sbase\b)', re.IGNORECASE)

    regex_tax_refund = re.compile(
        r'(?:\bsituação\sda\srestituição[:]\screditada\b'
        r'|\bsomente\sserá\spermitida\spor\smeio\sdo\scódigo\sde\sacesso\b'
        r'|\baguardando\sreagendamento\spelo\scontribuinte[.]?'
        r'|\bdevolvida\sà\sreceita\sfederal[,]?\sem\srazão\sdo\snão\sresgate\b'
        r'|\benviada\spara\scrédito\sno\sbanco\b'
        r'|\breagendada\spara\scrédito\sno\sbanco\b'
        r'|\bdados\sda\sliberação\sde\ssua\srestituição\b'
        r'|\bdeclaração\sestá\sna\sbase\sde\sdados\b'
        r'|\bestá\sna\sbase[,]\sutilize\so\sextrato\b'
        r'|\bdeclaração\sjá\sfoi\sprocessada[.]?$'
        r'|\brestituição[:]\saguardando\sdevolução\spelo\sbanco\b)'
        , re.IGNORECASE)

    col_list = ['irpf_extraction_error', 'irpf_not_declared', 'irpf_tax_refund']

    status_text = df[text_col]

    df['irpf_extraction_error'] = apply_regex_series(
        status_text, regex_not_consulted, handle_nan=False)
    df['irpf_not_declared'] = apply_regex_series(
        status_text, regex_not_declared)
    df['irpf_tax_refund'] = apply_regex_series(
        status_text, regex_tax_refund)

    # Vectorized "no other flag is set": avoids a Python-level call per row
    # and still yields a Series (not a DataFrame) when the frame is empty.
    df['irpf_tax_to_pay'] = (~df[col_list].eq(1).any(axis=1)).astype(int)

    return df

In [9]:
# Columns kept after the JSON payload in 'value' has been expanded.
# 'riskInfo' is expected to be one of the top-level keys of that payload.
cols = ['person_id', 'loan_id', 'irpf_id',
        'created_at_irpf', 'product_code',
        'state', 'rev', 'riskInfo']

# new column name -> key to read from each per-year dict
col_key_map = {
    'full_status_text': 'full_status_text',
    'bank': 'bank',
    'branch': 'branch'}

# Expand the JSON, produce one row per tax-report year, then lift the mapped
# keys into their own columns.
# NOTE(review): this overwrites `df`, so the cell is not idempotent — running
# it a second time without reloading the CSV operates on the already
# transformed frame.
df = (
    df.pipe(get_json_value, 'value')[cols]
    .pipe(explode_dict_col)
    .pipe(map_normalize_dict, 'tax_report_data', col_key_map))

df.head()

Unnamed: 0,person_id,loan_id,irpf_id,created_at_irpf,product_code,state,rev,riskInfo,tax_report_data,full_status_text,bank,branch
0,1792740,12811556.0,32876504,2022-03-22,PERSONAL,3.0,9,2012,"{'cpf': '86639277620', 'lot': '002', 'bank': '...",Situação da Restituição: Creditada Caso a rest...,ITAU UNIBANCO S.A.,3751.0
1,1792740,12811556.0,32876504,2022-03-22,PERSONAL,3.0,9,2013,"{'cpf': '86639277620', 'data': [['Sua declaraç...",Sua declaração já foi processada. Resultado en...,,
2,1792740,12811556.0,32876504,2022-03-22,PERSONAL,3.0,9,2015,"{'cpf': '86639277620', 'data': [['Sua declaraç...",Sua declaração já foi processada. Resultado en...,,
3,1792740,12811556.0,32876504,2022-03-22,PERSONAL,3.0,9,2016,"{'cpf': '86639277620', 'data': [['Sua declaraç...",Sua declaração já foi processada. Resultado en...,,
4,1792740,12811556.0,32876504,2022-03-22,PERSONAL,3.0,9,2017,"{'cpf': '86639277620', 'data': [['Sua declaraç...",Sua declaração já foi processada. Resultado en...,,


In [41]:
# Add the four irpf_* status flag columns derived from 'full_status_text'.
# get_irpf_status already works on a copy of df; the extra .copy() makes another.
aux = get_irpf_status(df, 'full_status_text').copy()
aux.shape

(165817, 16)

In [121]:
# Missing values per column.
# NOTE(review): the saved output lists columns ('year', 'is_declaration',
# 'status') that none of the cells above create, so it appears to come from a
# different kernel state — re-run from a fresh kernel to confirm.
df.isna().sum()

person_id                0
loan_id              14544
irpf_id                  0
created_at_irpf          0
product_code         14544
state                14544
rev                      0
year                     0
tax_report_data          0
is_declaration           0
full_status_text         0
bank                127004
branch              127119
status                   0
dtype: int64