In [1]:
import pandas as pd
import xlrd
import os

In [2]:
# Function to handle mixed separators in the 'valor' column
def correct_number_formatting(s):
    """Normalize a numeric string so that '.' is its only (decimal) separator.

    The decimal separator is taken to be whichever of ',' or '.' appears
    last in the string; the other one is treated as a thousands separator
    and removed.

    Examples:
        "1.234,56"     -> "1234.56"   (European: '.' thousands, ',' decimal)
        "12,5"         -> "12.5"      (decimal comma only)
        "1,234.56"     -> "1234.56"   (US: ',' thousands, '.' decimal)
        "-700.0"       -> "-700.0"    (no comma: returned unchanged)

    Args:
        s: The number as a string (callers in this file pass ``astype(str)``
            values).

    Returns:
        The normalized string, suitable for ``float()``.
    """
    if ',' not in s:
        # Only '.' (or no separator at all): already in the target format.
        return s
    if s.rfind('.') > s.rfind(','):
        # A '.' follows the last ',', so the commas can only be thousands
        # separators (e.g. "1,234.56"); dropping them keeps the decimal point.
        return s.replace(',', '')
    # The last separator is ',': every '.' is a thousands separator.
    return s.replace('.', '').replace(',', '.')

In [30]:
def combine_data_frames(data_frames):
    """Stack the non-empty DataFrames of a mapping into a single DataFrame.

    Args:
        data_frames: Mapping whose values are pandas DataFrames; the keys are
            ignored.

    Returns:
        The row-wise concatenation of every non-empty value with a fresh
        0..n-1 index, or an empty ``pd.DataFrame()`` when there is nothing
        to concatenate.
    """
    frames_with_rows = []
    for frame in data_frames.values():
        if frame.empty:
            continue
        frames_with_rows.append(frame)

    # pd.concat raises on an empty list, so short-circuit that case.
    if not frames_with_rows:
        return pd.DataFrame()
    return pd.concat(frames_with_rows, ignore_index=True)

#Function Itau
def read_excel_file_itau_statement(file_path):
    """Read an Itaú ``.xls`` statement and return its debits in normalized form.

    The first sheet is scanned for a row whose first cell is the text
    ``"Lançamentos"``. The table is read starting two rows below that title
    (the first row read supplies the column headers) and ends at the first
    row whose first cell is empty.

    Processing applied to the table:
      * drop the ``ag./origem`` and ``saldos (R$)`` columns;
      * discard rows with a blank amount, convert amounts to ``float`` and
        keep only negative amounts;
      * discard rows whose description contains ``JOAO MA`` or ``3034``
        (case-insensitive);
      * parse dates day-first and keep only the rows that fall in the same
        calendar month as the first remaining row;
      * rename the columns and add a sequential ``id``.

    Args:
        file_path: Path of the ``.xls`` workbook, opened with ``xlrd``.

    Returns:
        pandas.DataFrame with columns ``date``, ``description``, ``amount``,
        ``account_name`` ("itau"), ``transaction_type`` ("debit") and ``id``.
        The frame is empty when no row survives the filters.

    Raises:
        ValueError: If no "Lançamentos" table with at least a header row is
            found in the first sheet.
    """
    output_columns = ['date', 'description', 'amount', 'account_name', 'transaction_type', 'id']

    # Open the workbook
    wb = xlrd.open_workbook(file_path)
    ws = wb.sheet_by_index(0)

    # Define the titles to search for
    target_titles = ["Lançamentos"]

    data_frames = {}

    # Check each row for potential titles
    for i in range(ws.nrows):
        row = ws.row(i)
        if row[0].ctype != xlrd.XL_CELL_TEXT:
            continue
        title = row[0].value.strip()
        if title not in target_titles:
            continue

        # The header row sits two rows below the title; read until the first
        # row whose leading cell is empty.
        data = []
        for j in range(i + 2, ws.nrows):
            data_row = ws.row(j)
            if data_row[0].value == xlrd.empty_cell.value:
                break
            data.append([cell.value for cell in data_row])
        if not data:
            # Title with nothing underneath: there is no header row to use.
            continue

        # First row read is the header, the remainder is the data.
        df = pd.DataFrame(data[1:], columns=data[0])
        df['origem'] = "itau"
        df = df.drop(columns=['ag./origem', 'saldos (R$)'])
        df = df.rename(columns={'valor (R$)': 'valor'})
        df = df[df['valor'] != '']
        data_frames[title] = df  # Store the DataFrame with its title

    if not data_frames:
        raise ValueError(f"No 'Lançamentos' table found in {file_path!r}.")

    statement = combine_data_frames(data_frames)
    if statement.empty:
        # Every row had a blank amount: nothing to report, but keep the schema.
        return pd.DataFrame(columns=output_columns)

    statement['tipo'] = 'debit'
    # Convert before comparing so amounts stored as text do not break the
    # sign filter.
    statement['valor'] = statement['valor'].astype(str).apply(correct_number_formatting).astype(float)
    statement = statement[statement['valor'] < 0]

    excluded = statement['lançamento'].str.contains('JOAO MA|3034', na=False, case=False, regex=True)
    statement = statement[~excluded].copy()

    statement['data'] = pd.to_datetime(statement['data'], errors='coerce', dayfirst=True)
    if not statement.empty:
        # Keep only the month of the first remaining row (the year is not compared).
        first_row_month = statement['data'].dt.month.iloc[0]
        statement = statement[statement['data'].dt.month == first_row_month].copy()

    statement = statement.rename(columns={'data': 'date', 'lançamento': 'description', 'valor': 'amount', 'origem': 'account_name', 'tipo': 'transaction_type'})
    # Sequential id by position; the frame's own index keeps its gaps.
    statement['id'] = range(len(statement))
    return statement

#Function Santander
def read_excel_file_santander_statement(file_path):
    """Read a Santander ``.xls`` statement and return its debits in normalized form.

    The first sheet is scanned for a row whose first cell is the text
    ``"Tipo de Lancamento: Todos"``. The table is read starting on the row
    right below that title (the first row read supplies the column headers)
    and ends at the first row whose first cell is empty.

    Processing applied to the table:
      * drop the ``Docto``, ``Situação``, ``Crédito (R$)`` and ``Saldo (R$)``
        columns (their headers carry a trailing space in the sheet);
      * discard rows with a blank debit amount, the ``SALDO ANTERIOR`` row
        and the ``TOTAL`` row;
      * convert amounts to ``float`` and keep only negative amounts;
      * discard rows whose description contains ``JOAO MA`` or ``3034``
        (case-insensitive);
      * parse dates day-first and keep only the rows that fall in the same
        calendar month as the first remaining row;
      * rename the columns and add a sequential ``id``.

    Args:
        file_path: Path of the ``.xls`` workbook, opened with ``xlrd``.

    Returns:
        pandas.DataFrame with columns ``date``, ``description``, ``amount``,
        ``account_name`` ("santander"), ``transaction_type`` ("debit") and
        ``id``. The frame is empty when no row survives the filters.

    Raises:
        ValueError: If no "Tipo de Lancamento: Todos" table with at least a
            header row is found in the first sheet.
    """
    output_columns = ['date', 'description', 'amount', 'account_name', 'transaction_type', 'id']

    # Open the workbook
    wb = xlrd.open_workbook(file_path)
    ws = wb.sheet_by_index(0)

    # Define the titles to search for
    target_titles = ["Tipo de Lancamento: Todos"]

    data_frames = {}

    # Check each row for potential titles
    for i in range(ws.nrows):
        row = ws.row(i)
        if row[0].ctype != xlrd.XL_CELL_TEXT:
            continue
        title = row[0].value.strip()
        if title not in target_titles:
            continue

        # The header row sits directly below the title; read until the first
        # row whose leading cell is empty.
        data = []
        for j in range(i + 1, ws.nrows):
            data_row = ws.row(j)
            if data_row[0].value == xlrd.empty_cell.value:
                break
            data.append([cell.value for cell in data_row])
        if not data:
            # Title with nothing underneath: there is no header row to use.
            continue

        # First row read is the header, the remainder is the data.
        df = pd.DataFrame(data[1:], columns=data[0])
        df['origem'] = "santander"
        df = df.drop(columns=['Docto ', 'Situação ', 'Crédito (R$) ', 'Saldo (R$) '])
        df = df.rename(columns={'Data ': 'data', 'Descrição ': 'lançamento', 'Débito (R$) ': 'valor'})
        df = df[df['valor'] != '']
        df = df[df['lançamento'] != 'SALDO ANTERIOR ']
        df = df[df['data'] != 'TOTAL ']
        data_frames[title] = df  # Store the DataFrame with its title

    if not data_frames:
        raise ValueError(f"No 'Tipo de Lancamento: Todos' table found in {file_path!r}.")

    statement = combine_data_frames(data_frames)
    if statement.empty:
        # No usable row at all: nothing to report, but keep the schema.
        return pd.DataFrame(columns=output_columns)

    statement['tipo'] = 'debit'
    statement['valor'] = statement['valor'].astype(str).apply(correct_number_formatting).astype(float)
    statement = statement[statement['valor'] < 0]

    excluded = statement['lançamento'].str.contains('JOAO MA|3034', na=False, case=False, regex=True)
    statement = statement[~excluded].copy()

    statement['data'] = pd.to_datetime(statement['data'], errors='coerce', dayfirst=True)
    if not statement.empty:
        # Keep only the month of the first remaining row (the year is not compared).
        first_row_month = statement['data'].dt.month.iloc[0]
        statement = statement[statement['data'].dt.month == first_row_month].copy()

    statement = statement.rename(columns={'data': 'date', 'lançamento': 'description', 'valor': 'amount', 'origem': 'account_name', 'tipo': 'transaction_type'})
    # Sequential id by position; the frame's own index keeps its gaps.
    statement['id'] = range(len(statement))
    return statement

#Function Bradesco
def read_excel_file_bradesco_statement(file_path):
    """Read a Bradesco CSV statement and return its debits in normalized form.

    The file is read as a ';'-separated, cp1252-encoded CSV whose first line
    is skipped and whose second line holds the column headers.

    Processing applied:
      * drop the ``Docto.``, ``Crédito (R$)``, ``Saldo (R$)`` and
        ``Unnamed: 6`` columns;
      * truncate the table before the first ``Os dados acima`` footer row;
      * append the description of every date-less row to the closest
        preceding dated row, then drop rows lacking a date or an amount;
      * convert amounts to ``float`` and keep only negative amounts;
      * expand two-digit years (``dd/mm/yy`` -> ``dd/mm/20yy``);
      * discard rows whose description contains ``JOAO MA`` or ``3034``
        (case-insensitive);
      * parse dates day-first and keep only the rows that fall in the same
        calendar month as the first remaining row;
      * rename the columns and add a sequential ``id``.

    Args:
        file_path: Path of the CSV file.

    Returns:
        pandas.DataFrame with columns ``date``, ``description``, ``amount``,
        ``account_name`` ("bradesco"), ``transaction_type`` ("debit") and
        ``id``. The frame is empty when no row survives the filters.
    """
    data_frame = pd.read_csv(file_path, sep=';', encoding='1252', header=0, skiprows=1, skipfooter=0, engine='python')
    data_frame = data_frame.drop(columns=['Docto.', 'Crédito (R$)', 'Saldo (R$)', 'Unnamed: 6'])
    data_frame = data_frame.rename(columns={'Data': 'data', 'Histórico': 'lançamento', 'Débito (R$)': 'valor'})
    data_frame['origem'] = "bradesco"

    # Check for the first occurrence of 'Os dados acima' and slice the DataFrame
    cut_off_index = data_frame[data_frame['data'].str.contains('Os dados acima', na=False)].index.min()
    if pd.notna(cut_off_index):
        # .loc slicing is label-inclusive: this keeps rows up to two labels
        # before the footer, i.e. it also discards the row right above it
        # (presumably a separator line; verify against a real export).
        data_frame = data_frame.loc[:cut_off_index - 2]
    # Work on an independent copy so the cell updates below are not applied
    # to a slice of the frame returned by read_csv.
    data_frame = data_frame.copy()

    # Rows without a date carry the continuation of the previous entry's
    # description. Fold each one into the closest preceding dated row so
    # that several consecutive continuation lines are all preserved.
    last_dated_idx = None
    for idx in data_frame.index:
        if pd.notna(data_frame.at[idx, 'data']):
            last_dated_idx = idx
            continue
        continuation = data_frame.at[idx, 'lançamento']
        if last_dated_idx is None or pd.isna(continuation):
            # Nothing to attach to, or nothing to attach.
            continue
        base = data_frame.at[last_dated_idx, 'lançamento']
        data_frame.at[last_dated_idx, 'lançamento'] = (
            continuation if pd.isna(base) else f"{base} {continuation}"
        )

    # Remove rows where 'data' or 'valor' is NaN
    data_frame = data_frame.dropna(subset=['data', 'valor']).copy()
    data_frame['tipo'] = 'debit'
    data_frame['valor'] = data_frame['valor'].astype(str).apply(correct_number_formatting).astype(float)
    data_frame = data_frame[data_frame['valor'] < 0].copy()

    # Expand dd/mm/yy to dd/mm/20yy for any year, not only 2024.
    data_frame['data'] = (
        data_frame['data']
        .str.strip()
        .str.replace(r'^(\d{1,2}/\d{1,2})/(\d{2})$', r'\1/20\2', regex=True)
    )

    excluded = data_frame['lançamento'].str.contains('JOAO MA|3034', na=False, case=False, regex=True)
    data_frame = data_frame[~excluded].copy()

    data_frame['data'] = pd.to_datetime(data_frame['data'], errors='coerce', dayfirst=True)
    if not data_frame.empty:
        # Keep only the month of the first remaining row (the year is not compared).
        first_row_month = data_frame['data'].dt.month.iloc[0]
        data_frame = data_frame[data_frame['data'].dt.month == first_row_month].copy()

    data_frame = data_frame.rename(columns={'data': 'date', 'lançamento': 'description', 'valor': 'amount', 'origem': 'account_name', 'tipo': 'transaction_type'})
    # Sequential id by position; the frame's own index keeps its gaps.
    data_frame['id'] = range(len(data_frame))
    return data_frame

#Function Nubank
def read_excel_file_nubank_statement(file_path):
    """Read a Nubank CSV statement and return its debits in normalized form.

    The file is read as a ','-separated, UTF-8 CSV with a header row.

    Processing applied:
      * rename ``Data``/``Descrição``/``Valor`` and drop ``Identificador``;
      * keep only the date, description, amount and account columns;
      * convert amounts to ``float`` and keep only negative amounts;
      * discard rows whose description contains ``JOAO MA`` or ``3034``
        (case-insensitive);
      * parse dates day-first and keep only the rows that fall in the same
        calendar month as the first remaining row;
      * rename the columns and add a sequential ``id``.

    Args:
        file_path: Path of the CSV file.

    Returns:
        pandas.DataFrame with columns ``date``, ``description``, ``amount``,
        ``account_name`` ("nubank"), ``transaction_type`` ("debit") and
        ``id``. The frame is empty when no row survives the filters.
    """
    data_frame = pd.read_csv(file_path, sep=',', encoding='utf-8', header=0, skiprows=0, skipfooter=0, engine='python')
    data_frame = data_frame.rename(columns={'Data': 'data', 'Descrição': 'lançamento', 'Valor': 'valor'})
    data_frame = data_frame.drop(columns=['Identificador'])
    data_frame['origem'] = "nubank"
    # Independent copy so the column assignments below never hit a slice.
    data_frame = data_frame[['data', 'lançamento', 'valor', 'origem']].copy()
    data_frame['tipo'] = 'debit'

    # Convert before comparing so amounts read as text do not break the sign
    # filter; for already-numeric columns this is a no-op round trip.
    data_frame['valor'] = data_frame['valor'].astype(str).apply(correct_number_formatting).astype(float)
    data_frame = data_frame[data_frame['valor'] < 0]

    excluded = data_frame['lançamento'].str.contains('JOAO MA|3034', na=False, case=False, regex=True)
    data_frame = data_frame[~excluded].copy()

    data_frame['data'] = pd.to_datetime(data_frame['data'], errors='coerce', dayfirst=True)
    if not data_frame.empty:
        # Keep only the month of the first remaining row (the year is not compared).
        first_row_month = data_frame['data'].dt.month.iloc[0]
        data_frame = data_frame[data_frame['data'].dt.month == first_row_month].copy()

    data_frame = data_frame.rename(columns={'data': 'date', 'lançamento': 'description', 'valor': 'amount', 'origem': 'account_name', 'tipo': 'transaction_type'})
    # Sequential id by position; the frame's own index keeps its gaps.
    data_frame['id'] = range(len(data_frame))
    return data_frame

def read_excel_file_statement(file_path):
    """Route a statement file to the reader of the bank named in its file name.

    The bank key is the lower-cased text that precedes the first '-' in the
    file's base name (e.g. ``itau-04-2024.xls`` -> ``itau``).

    Args:
        file_path: Path of the statement file.

    Returns:
        A ``(data_frame, card_type)`` tuple: the frame produced by the
        matching reader and the bank key extracted from the file name.

    Raises:
        ValueError: If the key is not ``itau``, ``santander``, ``bradesco``
            or ``nubank``.
    """
    base_name = os.path.basename(file_path)
    card_type = base_name.split('-')[0].lower()

    # Reject unsupported banks up front, then pick the reader.
    if card_type not in ('itau', 'santander', 'bradesco', 'nubank'):
        raise ValueError("Unknown card type found in file name.")

    if card_type == 'itau':
        reader = read_excel_file_itau_statement
    elif card_type == 'santander':
        reader = read_excel_file_santander_statement
    elif card_type == 'bradesco':
        reader = read_excel_file_bradesco_statement
    else:
        reader = read_excel_file_nubank_statement

    return reader(file_path), card_type

def process_files(file_paths):
    """Convert each statement file to a normalized CSV, one file per bank.

    Every path is parsed with ``read_excel_file_statement`` and the result is
    written, without the index, to ``data/extratos/<card_type>_statement.csv``;
    the saved path is printed after each write.

    Args:
        file_paths: Iterable of statement file paths.
    """
    for statement_path in file_paths:
        statement_frame, card_type = read_excel_file_statement(statement_path)

        # Output name depends only on the bank key taken from the file name.
        filename = f"data/extratos/{card_type}_statement.csv"
        statement_frame.to_csv(filename, index=False)

        print(f"Saved {filename}")

In [29]:
# Usage: parse the Itaú statement at this path and display the resulting frame.
file_path = 'data/extratos/itau-04-2024.xls'
# NOTE: despite the plural name, this holds the single DataFrame returned by the reader.
data_frames = read_excel_file_itau_statement(file_path)
data_frames

Unnamed: 0,date,description,amount,account_name,transaction_type,id
0,2024-04-02,PIX TRANSF L A FER02/04,-700.0,itau,debit,0
2,2024-04-02,TAR PACOTE ITAU MAR/24,-79.0,itau,debit,1
3,2024-04-04,PIX QRS ITAU UNIBAN04/04,-2886.33,itau,debit,2
4,2024-04-08,CARTAO TUDOAZUL MC,-4299.26,itau,debit,3
5,2024-04-08,DA ALGAR TELECOM 02123,-164.78,itau,debit,4
6,2024-04-08,ITAU MC PAO DE AC,-591.21,itau,debit,5
7,2024-04-08,ITAU MC 4903-9245,-4005.17,itau,debit,6
8,2024-04-10,PIX TRANSF PAULO H10/04,-750.0,itau,debit,7
9,2024-04-10,ITAU VISA 0603-0154,-1792.89,itau,debit,8
10,2024-04-12,PIX TRANSF FORCA T12/04,-180.0,itau,debit,9


In [18]:
# Usage: parse the Nubank statement at this path and display the resulting frame.
# NOTE(review): the saved output below lists the columns data/lançamento/valor/origem/tipo,
# while the reader defined above returns date/description/amount/account_name/
# transaction_type/id — the output looks stale; re-run this cell.
file_path = 'data/extratos/nubank-04-2024.csv'
# NOTE: despite the plural name, this holds the single DataFrame returned by the reader.
data_frames = read_excel_file_nubank_statement(file_path)
data_frames

Unnamed: 0,data,lançamento,valor,origem,tipo
0,2024-04-03,Transferência enviada pelo Pix - Patrícia Magn...,-200.0,nubank,debit
1,2024-04-04,Pagamento de fatura,-7523.62,nubank,debit
3,2024-04-06,Transferência enviada pelo Pix - CONDOMINIO SA...,-40.0,nubank,debit


In [19]:
# Usage: parse the Bradesco statement at this path and display the resulting frame.
# NOTE(review): the saved output below lists the columns data/lançamento/valor/origem/tipo,
# while the reader defined above returns date/description/amount/account_name/
# transaction_type/id — the output looks stale; re-run this cell.
file_path = 'data/extratos/bradesco-04-2024.csv'
# NOTE: despite the plural name, this holds the single DataFrame returned by the reader.
data_frames = read_excel_file_bradesco_statement(file_path)
data_frames

Unnamed: 0,data,lançamento,valor,origem,tipo
2,2024-04-04,Pix Qrcode Est Des: Cartão de Crédito 04/04,-989.09,bradesco,debit
4,2024-04-04,Pix Qrcode Est Des: Mercadopago.com Repre 04/04,-94.71,bradesco,debit
7,2024-04-09,Transfe Pix Des: Patricia Magnino Fran 09/04,-900.0,bradesco,debit
10,2024-04-15,Transfe Pix Des: Patricia Magnino Fran 15/04,-100.0,bradesco,debit


In [21]:
# Usage: parse the Santander statement at this path and display the resulting frame.
# NOTE(review): the saved output below lists the columns data/lançamento/valor/origem/tipo,
# while the reader defined above returns date/description/amount/account_name/
# transaction_type/id — the output looks stale; re-run this cell.
file_path = 'data/extratos/santander-04-2024.xls'
# NOTE: despite the plural name, this holds the single DataFrame returned by the reader.
data_frames = read_excel_file_santander_statement(file_path)
data_frames

Unnamed: 0,data,lançamento,valor,origem,tipo
1,2024-04-10,PAGAMENTO DE BOLETO OUTROS BANCOS IVAN NEGOCI...,-1941.59,santander,debit
2,2024-04-10,PIX ENVIADO Patricia Ma...,-80.0,santander,debit
3,2024-04-10,PIX ENVIADO Patricia Ma...,-640.0,santander,debit


In [31]:
# Usage: convert every statement listed below; the bank is taken from the
# part of each file name that precedes the first '-'.
file_paths = [
    'data/extratos/itau-04-2024.xls',
    'data/extratos/santander-04-2024.xls',
    'data/extratos/bradesco-04-2024.csv',
    'data/extratos/nubank-04-2024.csv'
]
# Writes data/extratos/<bank>_statement.csv for each entry and prints each saved path.
process_files(file_paths)

Saved data/extratos/itau_statement.csv
Saved data/extratos/santander_statement.csv
Saved data/extratos/bradesco_statement.csv
Saved data/extratos/nubank_statement.csv
