In [2]:
import pandas as pd
import numpy as np

In [3]:
BASE = 'afastamentos'
#BASE = 'cadastro'

In [4]:
df = pd.read_csv(BASE + '-nomes.csv', index_col=0,
                 dtype={'QTDE': 'uint8', 'PERC': np.float32})

In [5]:
df.head(30)

Unnamed: 0_level_0,QTDE,PERC
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1
MARIA,121,4.5952
JOSE,206,4.3276
ANTONIO,233,1.9667
FRANCISCO,246,1.5865
JOAO,225,1.5536
CARLOS,211,1.5317
ANA,170,1.4676
PAULO,139,1.4191
LUIZ,61,1.297
MARCOS,3,0.8058


In [6]:
df['SEXO'] = None
df['LETRAS'] = 0
df['REV'] = df.index
df['REV'] = df['REV'].apply(lambda x: x[::-1])
df['LETRAS'] = df['REV'].apply(lambda x: len(x))
df.head(30)

Unnamed: 0_level_0,QTDE,PERC,SEXO,LETRAS,REV
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MARIA,121,4.5952,,5,AIRAM
JOSE,206,4.3276,,4,ESOJ
ANTONIO,233,1.9667,,7,OINOTNA
FRANCISCO,246,1.5865,,9,OCSICNARF
JOAO,225,1.5536,,4,OAOJ
CARLOS,211,1.5317,,6,SOLRAC
ANA,170,1.4676,,3,ANA
PAULO,139,1.4191,,5,OLUAP
LUIZ,61,1.297,,4,ZIUL
MARCOS,3,0.8058,,6,SOCRAM


In [7]:
df.dtypes

QTDE        uint8
PERC      float32
SEXO       object
LETRAS      int64
REV        object
dtype: object

### 1) Masculino se terminar com O, Feminino se terminar com A

In [10]:
df.loc[df['REV'].str.startswith('O'), 'SEXO'] = 'M'
df.loc[df['REV'].str.startswith('A'), 'SEXO'] = 'F'

### 2) Masculino se terminar com R, X ou L

In [11]:
df.loc[df['REV'].str.startswith('R'), 'SEXO'] = 'M'
df.loc[df['REV'].str.startswith('X'), 'SEXO'] = 'M'
df.loc[df['REV'].str.startswith('L'), 'SEXO'] = 'M'

### 3) Masculino se terminar com ...

In [12]:
sufixos = [
"DRE",
"UIS",
"QUE",
"ERT",
"ERG"
]

for suf in sufixos:
    df.loc[df['REV'].str.startswith(suf[::-1]), 'SEXO'] = 'M'

### 4) Masculino se terminar com ...

In [13]:
sufixos = [
"OS",
"ON",
"AS",
"AZ",
"US",
"ES",
"PE",
"PH",
"IM",
"NS",
"UE",
"EI",
"EY",
"EU",
"AU",
"AN",
"AC",
"IC",
"ICK",
"IK"
]

for suf in sufixos:
    df.loc[df['REV'].str.startswith(suf[::-1]), 'SEXO'] = 'M'

### 5) Feminino se terminar com ...

In [14]:
sufixos = [
"ELE",
"ELLE",
"LLE",
"ETE",
"ETTE",
"ETI",
"ETTI",
"ETH",
"ETHE",
"ISE",
"IZE",
"ITE"
]

for suf in sufixos:
    df.loc[df['REV'].str.startswith(suf[::-1]), 'SEXO'] = 'F'

### 6) Feminino se terminar com ...

In [15]:
sufixos = [
"NE",
"LI",
"LY",
"LYN",
"CE",
"EN",
"DE",
"SE",
"SY",
"ANI",
"AIS",
"AH",
"ENI",
"IE",
"IRE"
]

for suf in sufixos:
    df.loc[df['REV'].str.startswith(suf[::-1]), 'SEXO'] = 'F'

### 8) Nomes Masculinos ainda não classificados ou reclassificados

In [16]:
nomes = []
with open('nomes-masculinos.txt', 'r') as f:
    nomes = [nome.rstrip() for nome in f.readlines()]

In [17]:
for nome in nomes:
    df.loc[df['REV'] == nome[::-1], 'SEXO'] = 'M'

### 9) Nomes Femininos ainda não classificados ou reclassificados

In [18]:
nomes = []
with open('nomes-femininos.txt', 'r') as f:
    nomes = [nome.rstrip() for nome in f.readlines()]

In [19]:
for nome in nomes:
    df.loc[df['REV'] == nome[::-1], 'SEXO'] = 'F'

### Casos indefinidos

In [20]:
df[df['SEXO'].isnull()].head(30)

Unnamed: 0_level_0,QTDE,PERC,SEXO,LETRAS,REV
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VALDECI,24,0.0375,,7,ICEDLAV
JURACI,13,0.0203,,6,ICARUJ
DARCY,12,0.0188,,5,YCRAD
DARCI,9,0.0141,,5,ICRAD
IRACI,9,0.0141,,5,ICARI
JACY,9,0.0141,,4,YCAJ
JACI,9,0.0141,,4,ICAJ
NELCI,6,0.0094,,5,ICLEN
UBIRACI,6,0.0094,,7,ICARIBU
JURACY,5,0.0078,,6,YCARUJ


### Testes com outros casos

In [21]:
df.describe(include=['O'])

Unnamed: 0,SEXO,REV
count,8711,9726
unique,2,9726
top,F,ENAYAHT
freq,4376,1


In [22]:
df.head(200)

Unnamed: 0_level_0,QTDE,PERC,SEXO,LETRAS,REV
PNOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MARIA,121,4.5952,F,5,AIRAM
JOSE,206,4.3276,M,4,ESOJ
ANTONIO,233,1.9667,M,7,OINOTNA
FRANCISCO,246,1.5865,M,9,OCSICNARF
JOAO,225,1.5536,M,4,OAOJ
CARLOS,211,1.5317,M,6,SOLRAC
ANA,170,1.4676,F,3,ANA
PAULO,139,1.4191,M,5,OLUAP
LUIZ,61,1.2970,M,4,ZIUL
MARCOS,3,0.8058,M,6,SOCRAM


### Testes com outros casos

In [23]:
#df.loc['PHILIP']

### Geração do arquivo final

In [24]:
df.loc[df['SEXO'].isnull(), 'SEXO'] = 'X'

In [27]:
df[['QTDE', 'SEXO']].groupby(['SEXO']).count()

Unnamed: 0_level_0,QTDE
SEXO,Unnamed: 1_level_1
F,4376
M,4335
X,1015


In [26]:
df['SEXO'].to_csv(BASE + '-nomes-sexo.csv', header=True)