# Import

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import datetime

from googletrans import Translator

# MyDataFrame Class

In [18]:
class MyDataFrame:
    """Wrap a DataFrame read from a spreadsheet and tidy it on construction.

    On construction the wrapped frame has its all-NaN rows and columns
    dropped, its title and unit optionally captured from the column header,
    its column header optionally collapsed to a single level, and its index
    optionally translated to English.

    Attributes set by ``__init__``: ``df``, ``translator``, ``title``,
    ``unit``, ``footer`` and the four option flags.
    """

    # Labels that bypass the machine translator and map to a fixed spelling.
    # Shared by the single-index and first-level multi-index translations so
    # both treat the same names the same way.
    _FIXED_NAMES = {
        'Reinjeção': 'Reinjection',
        'Espírito Santo': 'Espirito_Santo',
        'Espírito_Santo': 'Espirito_Santo',
        'Espirito_Santo': 'Espirito_Santo',
        'Amazonas': 'Amazonas',
        'Alagoas': 'Alagoas',
        'Ceará': 'Ceara',
        'Ceara': 'Ceara',
        'Rio Grande do Norte': 'Rio_Grande_do_Norte',
        'Rio_Grande_do_Norte': 'Rio_Grande_do_Norte',
    }

    # Second-level words that the translate package can not translate.
    _LOCATION_NAMES = {
        'Mar': 'Offshore',
        'Terra': 'Onshore',
    }

    def __init__(self, df: pd.DataFrame, translate: bool = False, translate_first_level: bool = False,
                 white_space: bool = False, drop_level: bool = True) -> None:
        """Store *df* and run the requested clean-up steps on it.

        Args:
            df: the table to wrap. It is modified in place (NaN rows/columns
                are dropped and its columns/index are reassigned).
            translate: translate the index to English.
            translate_first_level: when translating a multi-level index, also
                translate its first level (the second level always is).
            white_space: replace underscores in the index with white space.
            drop_level: collapse a multi-level column header to one level.
        """
        self.df = df
        self.translator = Translator()
        self.title = ''
        self.unit = ''
        self.footer = ''
        self.translate = translate
        self.translate_first_level = translate_first_level
        self.white_space = white_space
        self.drop_level = drop_level

        self.drop_na()

        # Title and unit live in the column header, so they must be read
        # before the header levels are dropped below.
        if self.df.index.nlevels > 1:
            self.title_unit_multiindex()

        if self.drop_level:
            self.drop_levels()

        if self.translate:
            if self.df.index.nlevels > 1:
                self.translate_multi_index()
            else:
                self.translate_index()

        if self.white_space:
            # The original called a non-existent ``replace_white_space``;
            # ``replace_underscore`` is the method that produces white space.
            self.replace_underscore()

    def title_unit_multiindex(self) -> None:
        """Capture the title and unit from a multi-level column header.

        The title is the first-level label of the first column and the unit
        is the second-level label of the second column. Nothing is captured
        when the columns have a single level.
        """
        columns = self.df.columns
        if columns.nlevels < 2:
            return
        if len(columns) > 0:
            self.title = columns[0][0]
        if len(columns) > 1:
            self.unit = columns[1][1]

    def drop_levels(self) -> None:
        """Drop leading column levels until only the last level remains."""
        while self.df.columns.nlevels > 1:
            self.df.columns = self.df.columns.droplevel(0)

    def drop_na(self) -> None:
        """Drop, in place, every row and every column whose values are all NaN."""
        self.df.dropna(how='all', inplace=True)
        self.df.dropna(axis='columns', how='all', inplace=True)

    def _translate_labels(self, labels, fixed_names, only=None):
        """Return *labels* translated to English, as a list in the same order.

        Non-string labels are returned unchanged. Labels found in
        *fixed_names* use that fixed spelling; every other string is sent to
        the translator once, however many times it occurs. When *only* is
        given, labels outside it are returned unchanged.
        """
        cache = {}
        translated = []
        for label in labels:
            if not isinstance(label, str) or (only is not None and label not in only):
                translated.append(label)
                continue
            if label not in cache:
                if label in fixed_names:
                    cache[label] = fixed_names[label]
                else:
                    cache[label] = self.translator.translate(label).text
            translated.append(cache[label])
        return translated

    def translate_index(self) -> None:
        """Translate a single-level index to English.

        The translated labels are also kept in ``self.new_index``.
        """
        self.new_index = self._translate_labels(self.df.index, self._FIXED_NAMES)
        self.df.index = self.new_index

    def translate_multi_index(self) -> None:
        """Translate a MultiIndex to English.

        The second level is always translated; the first level only when
        ``translate_first_level`` is set. Each label is mapped as a whole, so
        translating one label never rewrites part of another. Does nothing
        when the index is not a MultiIndex.
        """
        index = self.df.index
        if not isinstance(index, pd.MultiIndex):
            return

        plan = []
        if self.translate_first_level:
            plan.append((0, self._FIXED_NAMES))
        if index.nlevels > 1:
            plan.append((1, self._LOCATION_NAMES))

        for level, fixed_names in plan:
            # Only labels that actually occur in a row are translated; level
            # entries left over from dropped rows are kept untouched.
            in_use = set(index.get_level_values(level))
            new_labels = self._translate_labels(index.levels[level], fixed_names, only=in_use)
            index = index.set_levels(new_labels, level=level)

        self.df.index = index

    def replace_underscore(self) -> None:
        """Replace every underscore in the index labels with a white space.

        Non-string labels are left unchanged. For a single-level index the
        new labels are also kept in ``self.new_index``.
        """
        def with_spaces(label):
            return label.replace('_', ' ') if isinstance(label, str) else label

        index = self.df.index
        if index.nlevels > 1:
            for level in range(index.nlevels):
                index = index.set_levels(index.levels[level].map(with_spaces), level=level)
            self.df.index = index
        elif index.nlevels == 1:
            self.new_index = [with_spaces(label) for label in index]
            self.df.index = self.new_index

    def drop_last_column(self) -> None:
        """Drop the last column of the table.

        The column is dropped unconditionally; the caller decides whether it
        is the ratio between columns[-2] and columns[-3] that should go.
        """
        self.df = self.df.drop(self.df.columns[-1], axis=1)

# Load and Wrangling

### Total Reserves

In [79]:
# Spreadsheet rows 0, 2 and 3 form the column header levels and the first two
# columns form the (two-level) row index; the index is translated to English.
total_reserves = MyDataFrame(
    pd.read_excel(
        r'data_set/anuario-2020-tabela-2_5.xls',
        header=[0, 2, 3],
        index_col=[0, 1],
    ),
    translate=True,
)



In [97]:
# Dropping the last column
total_reserves.drop_last_column()

# Setting unit, title and footer
total_reserves.unit = '10⁶ m³'
total_reserves.title = 'Total¹ Reserves of Natural Gas, by Location (Shore and Offshore), according to Federation² Units - 2010-2019'
total_reserves.footer ='Reserves at 31/12 of the reference years.'

# Correcting index superscripts.
# str.replace returns a new string, so the corrected labels have to be
# written back into the index; evaluating the expression alone changes nothing.
footnote_marks = {-5: ('3', '³'), -4: ('4', '⁴'), -2: ('5', '⁵'), -1: ('6', '⁶')}
corrected_labels = {}
for row, (digit, mark) in footnote_marks.items():
    label = total_reserves.df.index[row][0]
    # Chain on any earlier correction in case two rows share a first-level label.
    corrected_labels[label] = corrected_labels.get(label, label).replace(digit, mark)
total_reserves.df = total_reserves.df.rename(index=corrected_labels, level=0)

# Show the last first-level label to confirm the correction.
total_reserves.df.index[-1][0]


'Santa Catarina⁶'

In [99]:
# Turn the footnote digits 1-9 in the first index level into superscripts.
# str.replace returns a new string, so looping over the labels and discarding
# the results left the index unchanged; the labels are rewritten here instead.
superscript_digits = str.maketrans('123456789', '¹²³⁴⁵⁶⁷⁸⁹')
total_reserves.df = total_reserves.df.rename(
    index=lambda label: label.translate(superscript_digits) if isinstance(label, str) else label,
    level=0,
)

In [48]:
# str.replace returns a new string, so the corrected labels have to be
# written back into the index; evaluating the expression alone changes nothing.
footnote_marks = {-5: ('3', '³'), -4: ('4', '⁴'), -2: ('5', '⁵'), -1: ('6', '⁶')}
corrected_labels = {}
for row, (digit, mark) in footnote_marks.items():
    label = total_reserves.df.index[row][0]
    # Chain on any earlier correction in case two rows share a first-level label.
    corrected_labels[label] = corrected_labels.get(label, label).replace(digit, mark)
total_reserves.df = total_reserves.df.rename(index=corrected_labels, level=0)

# Show the last first-level label to confirm the correction.
total_reserves.df.index[-1][0]


'Santa Catarina⁶'

In [6]:
# Spreadsheet rows 0, 2 and 3 form the column header levels; the first column
# is the row index. All MyDataFrame options keep their defaults.
processing = MyDataFrame(
    pd.read_excel(
        r'data_set/anuario-2020-tabela-2_30.xls',
        header=[0, 2, 3],
        index_col=[0],
    )
)

In [7]:
# Two-level row index (first two columns); header levels from rows 0, 2 and 3.
proved_reserves = MyDataFrame(
    pd.read_excel(
        r'data_set/table-1_2.xls',
        header=[0, 2, 3],
        index_col=[0, 1],
    )
)
# The last column is not wanted in the wrapped table.
proved_reserves.drop_last_column()

In [8]:
# Single-level row index (first column); header levels from rows 0, 2 and 3.
domestic_sale = MyDataFrame(
    pd.read_excel(
        r'data_set/table-1_25.xls',
        header=[0, 2, 3],
        index_col=[0],
    )
)
domestic_sale.drop_last_column()
# The unit is set by hand for this table.
domestic_sale.unit = '10⁶ m³'

In [9]:
# Single-level row index (first column); header levels from rows 0, 2 and 3.
balance = MyDataFrame(
    pd.read_excel(
        r'data_set/table-1_26.xls',
        header=[0, 2, 3],
        index_col=[0],
    )
)
balance.drop_last_column()

In [10]:
# Two-level row index (first two columns); header levels from rows 0, 2 and 3.
production = MyDataFrame(
    pd.read_excel(
        r'data_set/table-1_5.xls',
        header=[0, 2, 3],
        index_col=[0, 1],
    )
)
production.drop_last_column()
# The unit is set by hand for this table.
production.unit = '10⁶ m³'

In [11]:
# Two-level row index (first two columns); header levels from rows 0, 2 and 3.
gas_type = MyDataFrame(
    pd.read_excel(
        r'data_set/table-1_6.xls',
        header=[0, 2, 3],
        index_col=[0, 1],
    )
)
gas_type.drop_last_column()
# The unit is set by hand for this table.
gas_type.unit = '10⁶ m³'

In [12]:
# This table has a four-row header (rows 0, 2, 3 and 4). The column levels are
# kept during construction and collapsed explicitly right afterwards.
avg_price = MyDataFrame(
    pd.read_excel(
        r'data_set/anuario-2020-tabela-2_24.xls',
        header=[0, 2, 3, 4],
        index_col=[0],
    ),
    drop_level=False,
)
avg_price.drop_levels()
# The unit is set by hand for this table.
avg_price.unit = 'US$/mil m³'

In [13]:
# Load the generation-by-source CSV and give its six columns English names.
energy_matrix = pd.read_csv(r'data_set/GeracaoFonte.csv')
energy_matrix.columns = ['ID', 'Energy Type', 'GWh', 'Month', 'Year', 'dthProx']
# One row per energy type, one column per (year, month) pair, holding GWh.
energy_matrix = energy_matrix.pivot_table(
    values='GWh',
    index=['Energy Type'],
    columns=['Year', 'Month'],
)

In [15]:
# The first four lines of the file are skipped before the CSV header is read.
bra_ind = pd.read_csv(
    r'data_set/API_BRA_DS2_en_csv_v2_1221399.csv',
    skiprows=[0, 1, 2, 3],
)