# Índice
1. [Configuração](#Configuração)
2. [Inspeção dos dados brutos](#Inspeção-dos-dados-brutos)
3. [Inspeção dos arquivos principais](#Inspeção-dos-arquivos-principais)
4. [Inspeção dos arquivos auxiliares](#Inspeção-dos-arquivos-auxiliares)

# Configuração

In [1]:
import pandas as pd
import sys
from IPython.core.display import HTML

pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 300)
display(HTML("<style>.container { width:100% !important; } div.prompt {min-width: 10ex;} .rendered_html table { font-size: 11px; }</style>"))

## Inspeção dos dados brutos

Estas são as primeiras linhas do arquivo CSV referente às exportações:

In [2]:
exports_zip = '../data/exp.zip'
!zcat $exports_zip | head -n 4

"CO_ANO";"CO_MES";"CO_NCM";"CO_UNID";"CO_PAIS";"SG_UF_NCM";"CO_VIA";"CO_URF";"QT_ESTAT";"KG_LIQUIDO";"VL_FOB"
"1997";"02";"84659110";"11";"548";"SP";"01";"0817800";2;1295;4294
"1997";"01";"84798999";"11";"158";"SP";"04";"0817700";6;352;1144
"1997";"01";"84819090";"10";"764";"SP";"04";"0817600";0;0;281

gzip: stdout: Broken pipe


No caso das importações, as primeiras linhas do CSV são assim:

In [3]:
imports_zip = '../data/imp.zip'
!zcat $imports_zip | head -n 4

"CO_ANO";"CO_MES";"CO_NCM";"CO_UNID";"CO_PAIS";"SG_UF_NCM";"CO_VIA";"CO_URF";"QT_ESTAT";"KG_LIQUIDO";"VL_FOB"
"1997";"12";"90072090";"11";"399";"AM";"01";"0227600";15;105;7184
"1997";"12";"30062000";"10";"767";"MG";"04";"0617600";6104;4855;563241
"1997";"12";"85331000";"11";"399";"SP";"04";"0817700";1000119;61;11826

gzip: stdout: Broken pipe


## Inspeção dos arquivos principais

In [7]:
def read_and_group_file_in_chunks(file_name, verbose=False, chunksize=1000000):
    grouped_chunks = []
    if verbose:
        sys.stderr.write('Reading file in chunks of {} lines'.format(chunksize))
        sys.stderr.flush()
    # TODO: Load only the columns needed, with usecols=['foo', 'bar']
    for chunk in pd.read_csv(file_name, delimiter=";", chunksize=chunksize):
        if verbose:
            sys.stderr.write('.')
            sys.stderr.flush()
        grouped_chunks.append(chunk)
    if verbose:
        sys.stderr.write('\nConcatenating {} chunks\n'.format(
                         len(grouped_chunks)))
        sys.stderr.flush()
    return pd.concat(grouped_chunks)

In [8]:
df_exp = read_and_group_file_in_chunks(exports_zip, verbose=True)

Reading file in chunks of 1000000 lines.......................
Concatenating 23 chunks


In [9]:
df_exp.sample(5)

Unnamed: 0,CO_ANO,CO_MES,CO_NCM,CO_UNID,CO_PAIS,SG_UF_NCM,CO_VIA,CO_URF,QT_ESTAT,KG_LIQUIDO,VL_FOB
366891,1997,4,84148019,11,63,SP,7,1010900,8,94,2032
7430615,2007,4,94036000,11,40,RS,1,920600,52,22256,100367
25079,1997,3,55161200,10,586,MG,9,910600,0,123,841
6940802,2006,11,39162000,10,586,SC,9,910600,0,4225,6141
7900288,2007,5,59069900,10,776,RS,4,817700,369,394,4329


In [10]:
df_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22648636 entries, 0 to 22648635
Data columns (total 11 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   CO_ANO      int64 
 1   CO_MES      int64 
 2   CO_NCM      int64 
 3   CO_UNID     int64 
 4   CO_PAIS     int64 
 5   SG_UF_NCM   object
 6   CO_VIA      int64 
 7   CO_URF      int64 
 8   QT_ESTAT    int64 
 9   KG_LIQUIDO  int64 
 10  VL_FOB      int64 
dtypes: int64(10), object(1)
memory usage: 1.9+ GB


In [11]:
def fix_dtypes(df):
    # Store categorical columns as such, reducing the memory usage
    # for the exports file from 1.9+ GB to 1.0 GB
    for col in df.columns[2:-3]:
        df[col]=df[col].astype('category')
    return df

In [13]:
df_exp = fix_dtypes(df_exp)
df_exp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22648636 entries, 0 to 22648635
Data columns (total 11 columns):
 #   Column      Dtype   
---  ------      -----   
 0   CO_ANO      int64   
 1   CO_MES      int64   
 2   CO_NCM      category
 3   CO_UNID     category
 4   CO_PAIS     category
 5   SG_UF_NCM   category
 6   CO_VIA      category
 7   CO_URF      category
 8   QT_ESTAT    int64   
 9   KG_LIQUIDO  int64   
 10  VL_FOB      int64   
dtypes: category(6), int64(5)
memory usage: 1.0 GB


In [14]:
df_exp.describe(include='category')

Unnamed: 0,CO_NCM,CO_UNID,CO_PAIS,SG_UF_NCM,CO_VIA,CO_URF
count,22648636,22648636,22648636,22648636,22648636,22648636
unique,11713,13,264,32,14,266
top,87089990,10,63,SP,1,817800
freq,134984,13534129,2091032,9216997,8614909,5022913


In [15]:
df_exp.describe(exclude='category')

Unnamed: 0,CO_ANO,CO_MES,QT_ESTAT,KG_LIQUIDO,VL_FOB
count,22648640.0,22648640.0,22648640.0,22648640.0,22648640.0
mean,2009.973,6.599953,614753.6,486321.4,160909.6
std,6.504889,3.384929,484781400.0,27099770.0,3224087.0
min,1997.0,1.0,0.0,0.0,0.0
25%,2005.0,4.0,1.0,7.0,264.0
50%,2010.0,7.0,23.0,112.0,2292.0
75%,2016.0,9.0,540.0,3225.0,20603.0
max,2020.0,12.0,990898800000.0,17750190000.0,1936700000.0


In [16]:
df_imp = read_and_group_file_in_chunks(imports_zip, verbose=True)

Reading file in chunks of 1000000 lines..................................
Concatenating 34 chunks


In [17]:
df_imp.sample(5)

Unnamed: 0,CO_ANO,CO_MES,CO_NCM,CO_UNID,CO_PAIS,SG_UF_NCM,CO_VIA,CO_URF,QT_ESTAT,KG_LIQUIDO,VL_FOB
10333331,2007,2,42023100,11,160,DF,1,817800,10224,1178,2439
29150623,2018,2,39269069,10,160,RJ,4,817700,0,0,334
5957379,2003,4,84145990,11,161,RS,4,1017600,4,0,36
24261576,2015,8,39211390,10,160,AM,4,227700,4,4,234
3564720,2000,3,29159090,10,628,SP,4,817600,6,6,620


In [18]:
df_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33783634 entries, 0 to 33783633
Data columns (total 11 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   CO_ANO      int64 
 1   CO_MES      int64 
 2   CO_NCM      int64 
 3   CO_UNID     int64 
 4   CO_PAIS     int64 
 5   SG_UF_NCM   object
 6   CO_VIA      int64 
 7   CO_URF      int64 
 8   QT_ESTAT    int64 
 9   KG_LIQUIDO  int64 
 10  VL_FOB      int64 
dtypes: int64(10), object(1)
memory usage: 2.8+ GB


In [19]:
df_imp = fix_dtypes(df_imp)
df_imp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33783634 entries, 0 to 33783633
Data columns (total 11 columns):
 #   Column      Dtype   
---  ------      -----   
 0   CO_ANO      int64   
 1   CO_MES      int64   
 2   CO_NCM      category
 3   CO_UNID     category
 4   CO_PAIS     category
 5   SG_UF_NCM   category
 6   CO_VIA      category
 7   CO_URF      category
 8   QT_ESTAT    int64   
 9   KG_LIQUIDO  int64   
 10  VL_FOB      int64   
dtypes: category(6), int64(5)
memory usage: 1.5 GB


In [20]:
df_exp.describe(include='category')

Unnamed: 0,CO_NCM,CO_UNID,CO_PAIS,SG_UF_NCM,CO_VIA,CO_URF
count,22648636,22648636,22648636,22648636,22648636,22648636
unique,11713,13,264,32,14,266
top,87089990,10,63,SP,1,817800
freq,134984,13534129,2091032,9216997,8614909,5022913


In [21]:
df_exp.describe(exclude='category')

Unnamed: 0,CO_ANO,CO_MES,QT_ESTAT,KG_LIQUIDO,VL_FOB
count,22648640.0,22648640.0,22648640.0,22648640.0,22648640.0
mean,2009.973,6.599953,614753.6,486321.4,160909.6
std,6.504889,3.384929,484781400.0,27099770.0,3224087.0
min,1997.0,1.0,0.0,0.0,0.0
25%,2005.0,4.0,1.0,7.0,264.0
50%,2010.0,7.0,23.0,112.0,2292.0
75%,2016.0,9.0,540.0,3225.0,20603.0
max,2020.0,12.0,990898800000.0,17750190000.0,1936700000.0


## Inspeção dos arquivos auxiliares

In [None]:
aux_file = '../data/aux.xlsx'
xls = pd.ExcelFile(aux_file)
print(xls.sheet_names)

In [None]:
dfs = pd.read_excel(aux_file, sheet_name=None)
dfs['ÍNDICE']

In [None]:
for i, sheet_name in enumerate(dfs):
    print('{}:\t{}'.format(sheet_name, dfs[sheet_name].shape))

In [None]:
for sheet_name in dfs:
    print('== Sheet "{}" =='.format(sheet_name))
    display(dfs[sheet_name].sample(5))
    print(dfs[sheet_name].info())