In [1]:
from sys import platform
from os import path
import numpy as np
import pandas as pd
import unicodedata

In [2]:
executionPlatform = platform
if('linux' in executionPlatform):
    separator = '/'
elif('win' in executionPlatform):
    separator = '\\'

# Database of Brasilian (first) names
data_file_name = 'Data' + separator + 'genero-nomes-IBGE-CENSO2010.csv'

# Download the file if it does not exist yet
if(path.exists(data_file_name) == False):
    !wget -O Data/genero-nomes-IBGE-CENSO2010.csv https://brasil.io/dataset/genero-nomes/nomes/?format=csv

db_names_genre = pd.read_csv(data_file_name)

db_names_genre.info()
db_names_genre.head()

--2020-05-14 12:09:53--  https://brasil.io/dataset/genero-nomes/nomes/?format=csv
Resolvendo brasil.io (brasil.io)... 192.99.13.206
Conectando-se a brasil.io (brasil.io)|192.99.13.206|:443... conectado.
A requisição HTTP foi enviada, aguardando resposta... 200 OK
Tamanho: não especificada [text/csv]
Salvando em: “Data/genero-nomes-IBGE-CENSO2010.csv”

Data/genero-nomes-I     [              <=>   ]   6,45M   575KB/s    em 9,8s    

2020-05-14 12:10:04 (677 KB/s) - “Data/genero-nomes-IBGE-CENSO2010.csv” salvo [6768374]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100787 entries, 0 to 100786
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   first_name         100787 non-null  object 
 1   group_name         100787 non-null  object 
 2   classification     100787 non-null  object 
 3   frequency_female   60484 non-null   float64
 4   frequency_male     50932 non-null   float64
 5   frequency_total   

Unnamed: 0,first_name,group_name,classification,frequency_female,frequency_male,frequency_total,frequency_group,ratio,alternative_names
0,AABRAO,ABRAAO,M,,26.0,26,32296,1.0,ABRAAO|ABRAHAO|ABRAO|ABRHAO|ABRRAO|ADRAAO|ADRA...
1,AADRIANA,ADRIANA,F,94.0,,94,568459,1.0,ABRIANA|ADRAINA|ADRIANA|ADRIANNA|ADRRIANA|ADRY...
2,AADRIANO,ADRIANO,M,,53.0,53,338554,1.0,ABRIANO|ADRIANNO|ADRIANO|ADRYANO
3,AAILTON,AILTON,M,,23.0,23,246915,1.0,AELTOM|AELTON|AHILTON|AILTHON|AILTOM|AILTON|AL...
4,AALAN,ALAN,M,,27.0,27,221601,1.0,AHLAM|AILAM|AILAN|ALAAN|ALAM|ALAN|ALANN|AYLAN|...


In [3]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.split.html
db_names = db_names_genre[['alternative_names']].apply(lambda x: x.str.split(pat = "|", n=-1),  axis=1)
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html
db_names = db_names.explode('alternative_names')
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html
db_names = db_names.drop_duplicates(subset=['alternative_names'], keep='first', ignore_index=True)

db_names = db_names.rename(columns={"alternative_names": "standardized_first_name"})

db_names.info()
db_names.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72613 entries, 0 to 72612
Data columns (total 1 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   standardized_first_name  72612 non-null  object
dtypes: object(1)
memory usage: 567.4+ KB


Unnamed: 0,standardized_first_name
0,ABRAAO
1,ABRAHAO
2,ABRAO
3,ABRHAO
4,ABRRAO


In [4]:
# Example of names to check
target_audience = np.array([
    ['name',     'gender'],
    ['Luis Antônio',  'M'],
    ['Mari',  'F'],
    ['João', 'M'], 
    ['Filipe',np.NaN],
    ['Aline', 'F'],
    ['Lari',  'F'],
    ['Lelê',  'M'],
    ['Dadá', np.NaN]
])

target_audience = pd.DataFrame(
    data=target_audience[1:,0:],
    index=range(target_audience.shape[0]-1),
    columns=target_audience[0,0:]
)

target_audience

Unnamed: 0,name,gender
0,Luis Antônio,M
1,Mari,F
2,João,M
3,Filipe,
4,Aline,F
5,Lari,F
6,Lelê,M
7,Dadá,


In [5]:
# Taking the first (there are not compound names in the database)
target_audience['first_name'] = target_audience[['name']].applymap(lambda x: x.split(" ")[0])

# Standardizing the first name (remove accents and capitalize)
def standardize_string(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    unaccented_str = u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
    upper_str = unaccented_str.upper()
    return upper_str

target_audience['standardized_first_name'] = target_audience[['first_name']].applymap(standardize_string)
target_audience

Unnamed: 0,name,gender,first_name,standardized_first_name
0,Luis Antônio,M,Luis,LUIS
1,Mari,F,Mari,MARI
2,João,M,João,JOAO
3,Filipe,,Filipe,FILIPE
4,Aline,F,Aline,ALINE
5,Lari,F,Lari,LARI
6,Lelê,M,Lelê,LELE
7,Dadá,,Dadá,DADA


In [6]:
# Join the target audience with the database to keep only valid names
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
target_audience_clean = pd.merge(target_audience, db_names, left_on='standardized_first_name', right_on='standardized_first_name', how='inner', sort=False)

target_audience_clean

Unnamed: 0,name,gender,first_name,standardized_first_name
0,Luis Antônio,M,Luis,LUIS
1,Mari,F,Mari,MARI
2,João,M,João,JOAO
3,Filipe,,Filipe,FILIPE
4,Aline,F,Aline,ALINE
5,Lari,F,Lari,LARI
6,Lelê,M,Lelê,LELE


In [7]:
# Join the target audience with the clean audience to check removed names
target_audience_clean['match'] = True
target_audience_removed = pd.merge(target_audience, target_audience_clean[['standardized_first_name', 'match']], left_on='standardized_first_name', right_on='standardized_first_name', how='left', sort=False)
target_audience_removed = target_audience_removed.fillna(value={'match': False})
target_audience_removed = target_audience_removed[target_audience_removed.match == False]
target_audience_removed = target_audience_removed.drop(columns=['match'])
target_audience_clean = target_audience_clean.drop(columns=['match'])

target_audience_removed

Unnamed: 0,name,gender,first_name,standardized_first_name
7,Dadá,,Dadá,DADA
