In [1]:
import os

import numpy as np
import pandas as pd

## Carga dos dados de entrada

In [2]:
# Input CSV file to process; the commented-out lines are alternative inputs
arquivo = 'afastamentos-nomes-sexo.csv'
#arquivo = 'cadastro-nomes-sexo.csv'
#arquivo = 'extra/nome_sexo_pf05.csv'

In [3]:
# Derive the output base name from the input path.
# Only the file-name component is split on '.', so a dot inside a directory
# part (e.g. './dados/x.csv' or '../x.csv') cannot truncate the prefix.
nome_base = os.path.basename(arquivo)
diretorio = arquivo[:len(arquivo) - len(nome_base)]  # directory part, separators untouched
partes = nome_base.split('.')
prefixo_arquivo = diretorio + partes[0]
#sufixo_arquivo = '.'.join(partes[1:])
novo_arquivo = prefixo_arquivo + '-preparado'

In [4]:
# Load the input CSV, using its first column as the DataFrame index,
# then preview the first rows.
# NOTE(review): read_csv's default NA markers (e.g. 'NA', 'NULL') are presumably
# one way null index labels can appear — confirm no real name matches one.
df = pd.read_csv(arquivo, index_col=0)
df.head()

Unnamed: 0_level_0,SEXO
PNOME,Unnamed: 1_level_1
MARIA,F
JOSE,M
ANA,F
JOAO,M
CARLOS,M


In [5]:
# remover valores nulos
# Drop rows whose index label (the name) is missing.
# A single boolean filter removes all of them; no second pass is needed.
df = df.loc[df.index.notna()]

In [6]:
# Sanity check: per-column count over the rows that still have a null index label
df.loc[df.index.isna()].count()

SEXO    0
dtype: int64

In [7]:
# Normalise the label column name: a lowercase 'sexo' column becomes 'SEXO'
df = df.rename(columns={'sexo': 'SEXO'})

In [8]:
# Preview the first ten rows after clean-up
df.head(n=10)

Unnamed: 0_level_0,SEXO
PNOME,Unnamed: 1_level_1
MARIA,F
JOSE,M
ANA,F
JOAO,M
CARLOS,M
PAULO,M
ANTONIO,M
LUIZ,M
FRANCISCO,M
MARCELO,M


## Transformações nos dados

In [9]:
# Encode SEXO as a small integer code: F -> 0, M -> 1, X -> 9.
CODIGOS_SEXO = {'F': 0, 'M': 1, 'X': 9}
sexo_codificado = df['SEXO'].map(CODIGOS_SEXO)

# Labels missing from the mapping become NaN, which cannot be cast to uint8.
# Fail early with a message that names the offending labels (this also
# covers re-running the cell on an already-encoded column).
sem_codigo = sexo_codificado.isna()
if sem_codigo.any():
    raise ValueError(
        'valores de SEXO sem código definido: {}'.format(
            df.loc[sem_codigo, 'SEXO'].unique().tolist()
        )
    )

df['SEXO'] = sexo_codificado.astype('uint8')

In [10]:
# REV holds each name (the index label) spelled backwards
df['REV'] = df.index.to_series().map(lambda nome: nome[::-1])

In [11]:
# Preview the first five rows, now including the REV column
df.head(n=5)

Unnamed: 0_level_0,SEXO,REV
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1
MARIA,0,AIRAM
JOSE,1,ESOJ
ANA,0,ANA
JOAO,1,OAOJ
CARLOS,1,SOLRAC


In [12]:
MAIOR_QTDE_LETRAS = 16  # fixed so that the largest input file fits


def _codigo_da_letra(nome_invertido, posicao):
    """Return ord(char) - 64 for the character at `posicao`, or 0 past the end.

    For upper-case ASCII letters this maps 'A'..'Z' to 1..26.
    """
    if posicao < len(nome_invertido):
        return ord(nome_invertido[posicao]) - 64
    return 0


# One uint8 column per position of the reversed name: L0 is the last letter
# of the original name, L1 the one before it, and so on. Shorter names are
# padded with 0; longer names contribute only their last MAIOR_QTDE_LETRAS letters.
# NOTE(review): characters outside 'A'-'Z' give codes outside 1..26 — confirm
# the inputs are always upper-case ASCII.
for posicao in range(MAIOR_QTDE_LETRAS):
    codigos = df['REV'].apply(_codigo_da_letra, args=(posicao,))
    df['L' + str(posicao)] = codigos.astype('uint8')

In [13]:
# The reversed-name helper column is no longer needed once L0..L15 exist
df = df.drop(columns=['REV'])

In [14]:
# Show the dtype of every column (SEXO and each L* column were cast to uint8 above)
df.dtypes

SEXO    uint8
L0      uint8
L1      uint8
L2      uint8
L3      uint8
L4      uint8
L5      uint8
L6      uint8
L7      uint8
L8      uint8
L9      uint8
L10     uint8
L11     uint8
L12     uint8
L13     uint8
L14     uint8
L15     uint8
dtype: object

## Análise dos resultados

### Amostra dos dados

In [15]:
# Sample of the prepared table: first ten rows
df.head(n=10)

Unnamed: 0_level_0,SEXO,L0,L1,L2,L3,L4,L5,L6,L7,L8,L9,L10,L11,L12,L13,L14,L15
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
MARIA,0,1,9,18,1,13,0,0,0,0,0,0,0,0,0,0,0
JOSE,1,5,19,15,10,0,0,0,0,0,0,0,0,0,0,0,0
ANA,0,1,14,1,0,0,0,0,0,0,0,0,0,0,0,0,0
JOAO,1,15,1,15,10,0,0,0,0,0,0,0,0,0,0,0,0
CARLOS,1,19,15,12,18,1,3,0,0,0,0,0,0,0,0,0,0
PAULO,1,15,12,21,1,16,0,0,0,0,0,0,0,0,0,0,0
ANTONIO,1,15,9,14,15,20,14,1,0,0,0,0,0,0,0,0,0
LUIZ,1,26,9,21,12,0,0,0,0,0,0,0,0,0,0,0,0
FRANCISCO,1,15,3,19,9,3,14,1,18,6,0,0,0,0,0,0,0
MARCELO,1,15,12,5,3,18,1,13,0,0,0,0,0,0,0,0,0


In [16]:
# Report the number of rows in the prepared table
print("total de linhas:", len(df))

total de linhas: 48970


### Quantidades por sexo

In [17]:
# Row count per SEXO code (L0 is uint8 and never null, so counting it counts rows)
df2 = df.groupby('SEXO')[['L0']].count().rename(columns={'L0': 'QTDE'})

total = df2['QTDE'].sum()

# Share of each code as a percentage with two decimals.
# NOTE(review): the value is truncated, not rounded, so the column may sum to
# slightly less than 100 — confirm this is intended.
df2['PERC'] = (df2['QTDE'] / total * 10000).astype('int64') / 100
df2

Unnamed: 0_level_0,QTDE,PERC
SEXO,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21891,44.7
1,19259,39.32
9,7820,15.96


## Gravação do arquivo final

In [18]:
# Write the prepared table as a bz2-compressed CSV next to the input file
caminho_csv = novo_arquivo + '.csv.bz2'
df.to_csv(caminho_csv, compression='bz2')

In [19]:
# Also store the prepared table as a pickle, at <novo_arquivo>.pkl.bz2
caminho_pickle = novo_arquivo + '.pkl.bz2'
df.to_pickle(caminho_pickle)