## Imports

## Load data

In [14]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from datetime import datetime
%matplotlib inline

In [15]:
df = pd.read_csv('despesa_ceaps_2021.csv', sep=',', encoding='latin1')

In [16]:
df.head()

Unnamed: 0,ANO,MES,SENADOR,TIPO_DESPESA,CNPJ_CPF,FORNECEDOR,DOCUMENTO,DATA,DETALHAMENTO,VALOR_REEMBOLSADO,COD_DOCUMENTO
0,2021,1,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",05.914.650/0001-66,ENERGISA,23489627,21/01/2021,Pagamento de energia elétrica para uso do escr...,7529,2158003
1,2021,1,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",062.135.728-64,FERNANDO WALDEIR PACINI e ANA LUCIA DA SILVA S...,01/2021,05/01/2021,Despesa com pagamento de aluguel de imóvel par...,1000,2157367
2,2021,1,ACIR GURGACZ,"Aluguel de imóveis para escritório político, c...",004.948.028-63,GILBERTO PISELO DO NASCIMENTO,001/21,06/01/2021,Despesa com aluguel de imóvel para uso do escr...,6000,2156383
3,2021,1,ACIR GURGACZ,Divulgação da atividade parlamentar,26.320.603/0001-64,INFORMANAHORA,10,25/01/2021,Divulgação da atividade parlamentar,1500,2154509
4,2021,1,ACIR GURGACZ,Divulgação da atividade parlamentar,13.659.201/0001-47,LINHA PURPURA FOTO E VIDEO LTDA,44,07/01/2021,Divulgação da atividade parlamentar,6000,2154507


## Cleaning data

In [4]:
columns = ['ANO', 'MES', 'SENADOR', 'DATA', 'VALOR_REEMBOLSADO']

In [5]:
df_clean = df[columns]

In [6]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16808 entries, 0 to 16807
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ANO                16808 non-null  int64 
 1   MES                16808 non-null  int64 
 2   SENADOR            16808 non-null  object
 3   DATA               16808 non-null  object
 4   VALOR_REEMBOLSADO  16808 non-null  object
dtypes: int64(2), object(3)
memory usage: 656.7+ KB


In [7]:
# Apply lowercase to rows
df_clean = df_clean.astype(str).apply(lambda x: x.str.lower())

In [8]:
# Apply lowercase to columns' names
df_clean.columns = map(str.lower, df_clean.columns)

In [9]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16808 entries, 0 to 16807
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ano                16808 non-null  object
 1   mes                16808 non-null  object
 2   senador            16808 non-null  object
 3   data               16808 non-null  object
 4   valor_reembolsado  16808 non-null  object
dtypes: object(5)
memory usage: 656.7+ KB


In [10]:
# Replaces inconvenient punctuation
df_clean['valor_reembolsado'] = df_clean['valor_reembolsado'].str.replace(',', '.')
df_clean['data'] = df_clean['data'].str.replace('/', '-')

In [11]:
# Changes datetype
df_clean['ano'] = df_clean['ano'].astype(int)
df_clean['mes'] = df_clean['mes'].astype(int)
df_clean['valor_reembolsado'] = df_clean['valor_reembolsado'].astype(float)
df_clean['data'] = datetime.strptime(df_clean['data'][1], '%d-%m-%Y').date()

In [12]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16808 entries, 0 to 16807
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ano                16808 non-null  int64  
 1   mes                16808 non-null  int64  
 2   senador            16808 non-null  object 
 3   data               16808 non-null  object 
 4   valor_reembolsado  16808 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 656.7+ KB


**********************

## Clean function

In [18]:
def clean_df(df):
    columns = ['ANO', 'MES', 'SENADOR', 'VALOR_REEMBOLSADO']
    df_clean = df[columns]
    
    # Apply lowercase to rows
    df_clean = df_clean.astype(str).apply(lambda x: x.str.lower())
    
    # Apply lowercase to columns' names
    df_clean.columns = map(str.lower, df_clean.columns)
    
    # Replaces inconvenient punctuation
    df_clean['valor_reembolsado'] = df_clean['valor_reembolsado'].str.replace(',', '.')
    
    # Changes datetype
    df_clean['ano'] = df_clean['ano'].astype(int)
    df_clean['mes'] = df_clean['mes'].astype(int)
    df_clean['valor_reembolsado'] = df_clean['valor_reembolsado'].astype(float)
    
    return df_clean.head()

## Loading datasets

In [50]:
df2 =  pd.read_csv('despesa_ceaps_2020.csv', sep=',', encoding='latin1')
df3 =  pd.read_csv('despesa_ceaps_2019.csv', sep=',', encoding='latin1')
df4 =  pd.read_csv('despesa_ceaps_2018.csv', sep=',', encoding='latin1')

In [51]:
df_2021 = clean_df(df)
df_2020 = clean_df(df2)
df_2019 = clean_df(df3)
df_2018 = clean_df(df4)

## Concats dfs

In [58]:
#df_final = pd.concat([df_2021, df_2020)

In [60]:
#df_final