## Read first name statistics (from DVV) from an Excel file and write them to CSV


In [1]:
import pandas as pd

# The workbook includes sheets for statistics on all names and on first names;
# only the first-name sheets are used here.
# Opening the workbook in a `with` block closes the file handle once both
# sheets have been read.
with pd.ExcelFile('DVV/etunimitilasto-2022-02-07-dvv.xlsx') as xls:
    female_all = pd.read_excel(xls, sheet_name='Naiset ens')
    male_all = pd.read_excel(xls, sheet_name='Miehet ens')

# Write each sheet to CSV with its header row and without the DataFrame index.
female_all.to_csv('DVV/female_names.csv', index=False, header=True)
male_all.to_csv('DVV/male_names.csv', index=False, header=True)

## Parse raw first name lists 
i.e. remove all extra characters such as whitespace, HTML tags etc. (left over from copying the data from its sources)

In [1]:
import re
import string

def parse_wiki_name_list(source_file, target_file):
    """Copy the name lines of ``source_file`` to ``target_file``, one per line.

    Lines whose stripped length is at most one character are skipped; this
    drops empty lines and the single-letter titles that mark the starting
    letter of a group of names. Every kept line is stripped of surrounding
    whitespace.

    Both files are handled as UTF-8 so that non-ASCII names are read and
    written the same way regardless of the platform's default encoding.
    The target is opened in append mode, so running this again for the same
    target adds the names a second time.

    Args:
        source_file: Path of the raw name list.
        target_file: Path of the parsed list to append to.
    """
    with open(source_file, 'r', encoding='utf-8') as source, \
            open(target_file, 'a', encoding='utf-8') as target:
        for line in source:
            name = line.strip()
            # ignore empty lines and titles indicating starting letter of name
            if len(name) > 1:
                target.write(name + '\n')

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is kept.
    """
    return re.compile('<.*?>').sub('', text)

def parse_html_name_list(source_file, target_file):
    """Extract names from an HTML-formatted list and append them to a file.

    Each line of ``source_file`` has its HTML tags removed with
    ``remove_html`` and is stripped of surrounding whitespace. Lines
    containing ``'cont.'`` are skipped, as are lines that are empty once the
    markup is gone. A trailing ``" f"`` or ``" m"`` marker is dropped from the
    name.

    Both files are handled as UTF-8 so that non-ASCII names do not depend on
    the platform's default encoding. The target is opened in append mode, so
    running this again for the same target adds the names a second time.

    Args:
        source_file: Path of the raw name list.
        target_file: Path of the parsed list to append to.
    """
    with open(source_file, 'r', encoding='utf-8') as source, \
            open(target_file, 'a', encoding='utf-8') as target:
        for line in source:
            # ignore 'continuation' tags
            if 'cont.' in line:
                continue
            name = remove_html(line).strip()
            # some names in the list are gender neutral - they are marked as
            # f/m in the corresponding lists; remove the f/m marking
            if name.endswith((" f", " m")):
                name = name[:-2].rstrip()
            # ignore lines that hold nothing but whitespace or markup
            if name:
                target.write(name + '\n')

# parse name lists compiled from multiple sources
def parse_random_name_list(source_file, target_file):
    """Append the first word of every usable line of ``source_file`` to ``target_file``.

    Lines whose stripped length is at most one character and lines starting
    with ``#`` are skipped. The remaining lines are cleaned with
    ``remove_extra``, and only the first whitespace-separated word is kept.
    Lines with no word left after cleaning are skipped instead of raising
    ``IndexError``.

    Both files are handled as UTF-8 so that non-ASCII names do not depend on
    the platform's default encoding. The target is opened in append mode, so
    running this again for the same target adds the names a second time.

    Args:
        source_file: Path of the raw name list.
        target_file: Path of the parsed list to append to.
    """
    with open(source_file, 'r', encoding='utf-8') as source, \
            open(target_file, 'a', encoding='utf-8') as target:
        for line in source:
            # ignore empty lines and comments
            if len(line.strip()) <= 1 or line.startswith("#"):
                continue
            words = remove_extra(line).split()
            # a line made up only of bracketed tags or punctuation has no
            # words left after cleaning
            if not words:
                continue
            # get rid of last names, titles etc
            # note: this is done with the assumption of there being no two part names
            target.write(words[0] + '\n')

# remove all extra tags, punctuation etc (from copying from wikipedia)
def remove_extra(text):
    """Return ``text`` without ``[...]`` tags and without ASCII punctuation.

    Bracketed spans are removed first (non-greedy, so each pair of brackets
    is handled on its own), then every character in ``string.punctuation``
    is deleted. Because that set includes ``-`` and ``'``, such characters
    inside a name are deleted as well.
    """
    # raw string: '\[' in a normal string literal is an invalid escape sequence
    without_tags = re.sub(r'\[.*?\]', '', text)
    return without_tags.translate(str.maketrans('', '', string.punctuation))

In [3]:
# Every raw list paired with the parser matching its source format and the
# file the parsed names are written to; the jobs run in the order listed.
parse_jobs = [
    (parse_wiki_name_list, "Raw/fswe_female_names_raw.txt", "Parsed/fswe_female_names.txt"),
    (parse_wiki_name_list, "Raw/fswe_male_names_raw.txt", "Parsed/fswe_male_names.txt"),
    (parse_wiki_name_list, "Raw/fin_female_names_raw.txt", "Parsed/fin_female_names.txt"),
    (parse_wiki_name_list, "Raw/fin_male_names_raw.txt", "Parsed/fin_male_names.txt"),

    (parse_html_name_list, "Raw/sami_female_names_raw.txt", "Parsed/sami_female_names.txt"),
    (parse_html_name_list, "Raw/sami_male_names_raw.txt", "Parsed/sami_male_names.txt"),
    (parse_html_name_list, "Raw/rus_female_names_raw.txt", "Parsed/rus_female_names.txt"),
    (parse_html_name_list, "Raw/rus_male_names_raw.txt", "Parsed/rus_male_names.txt"),

    (parse_random_name_list, "Raw/roma_male_names_raw.txt", "Parsed/roma_male_names.txt"),
    (parse_random_name_list, "Raw/roma_female_names_raw.txt", "Parsed/roma_female_names.txt"),
    (parse_random_name_list, "Raw/som_male_names_raw.txt", "Parsed/som_male_names.txt"),
    (parse_random_name_list, "Raw/som_female_names_raw.txt", "Parsed/som_female_names.txt"),
]

for parse, raw_path, parsed_path in parse_jobs:
    parse(raw_path, parsed_path)

## Get top n names from each name list

Using DVV statistics on name counts 

In [15]:
# function for normalizing Sami names
from unidecode import unidecode

def normalize_name(name):
    """Return ``name`` passed through ``unidecode``.

    NOTE(review): per the Unidecode package this should be an ASCII
    transliteration of the name - confirm against the installed version.
    """
    normalized = unidecode(name)
    return normalized

In [4]:
import csv

def get_all_names(file):
    """Load the name counts stored in a CSV file.

    The file is read as UTF-8 (its header contains the non-ASCII column name
    ``Lukumäärä``) and with ``newline=''``, as the ``csv`` module requires
    for files it reads.

    Args:
        file: Path of a CSV file with the columns ``Etunimi`` and
            ``Lukumäärä``.

    Returns:
        A dict mapping each ``Etunimi`` value to its ``Lukumäärä`` value.
        Both are kept as the strings read from the file; if a name occurs
        on several rows, the last row wins.

    Raises:
        KeyError: If one of the two columns is missing from the file.
    """
    with open(file, newline='', encoding='utf-8') as all_f:
        return {row['Etunimi']: row['Lukumäärä'] for row in csv.DictReader(all_f)}

# Name -> count lookups, one per gender, read from the exported DVV CSV files.
all_female_names, all_male_names = map(
    get_all_names,
    ("DVV/female_names.csv", "DVV/male_names.csv"),
)

In [5]:
# Names to leave out of the lists built by get_top_n_names. While this is
# None (or empty) nothing is excluded.
fin_names = None


def get_top_n_names(eth_names, all_names, n=10, normalize=False):
    """Pick the ``n`` most common names of a name list and save them.

    Names from ``eth_names`` are kept when they occur in ``all_names`` and
    are not listed in the module-level ``fin_names``. Duplicates are dropped
    (the first occurrence is kept), so a name is never returned twice. The
    remaining names are sorted by their count, highest first; names with
    equal counts keep the order they have in the file.

    The selected names are also written, one per line, to a file in the
    ``Top`` directory that has the same file name as ``eth_names``.

    Args:
        eth_names: Path of a parsed name list with one name per line.
        all_names: Mapping from name to its count; counts must be
            convertible with ``int``.
        n: Maximum number of names to select.
        normalize: Accepted but currently not used (see the TODO below).

    Returns:
        The list of selected names, most common first.
    """
    import os  # local import: this block does not rely on a file-level one

    excluded = set(fin_names) if fin_names else set()
    with open(eth_names, encoding='utf-8') as eth:
        # TODO handle normalization for Sami names
        candidates = (line.strip() for line in eth)
        # dict.fromkeys removes duplicates while keeping the file order;
        # names found among the Finnish top names are left out
        found = list(dict.fromkeys(
            name for name in candidates
            if name in all_names and name not in excluded
        ))
    # sort names by count and select top n
    found.sort(key=lambda name: int(all_names[name]), reverse=True)
    top_names = found[:n]
    # save to file; basename works for any directory depth of eth_names
    top_file = os.path.join('Top', os.path.basename(eth_names))
    with open(top_file, 'w', encoding='utf-8') as top:
        top.writelines(name + '\n' for name in top_names)
    return top_names

In [6]:
# TODO handle special characters eg in Sami names (are those characters not in official use in Finland?)
n = 10

# The Finnish lists are processed first: their top names are stored in
# `fin_names`, which get_top_n_names reads to exclude them from other lists.
top_fin_female_names = get_top_n_names("Parsed/fin_female_names.txt", all_female_names, n=n)
top_fin_male_names = get_top_n_names("Parsed/fin_male_names.txt", all_male_names, n=n)
fin_names = [*top_fin_female_names, *top_fin_male_names]

# top female names of the remaining lists
top_fswe_female_names = get_top_n_names("Parsed/fswe_female_names.txt", all_female_names, n=n)
top_sami_female_names = get_top_n_names("Parsed/sami_female_names.txt", all_female_names, n=n, normalize=True)
top_rus_female_names = get_top_n_names("Parsed/rus_female_names.txt", all_female_names, n=n)
top_roma_female_names = get_top_n_names("Parsed/roma_female_names.txt", all_female_names, n=n)
top_som_female_names = get_top_n_names("Parsed/som_female_names.txt", all_female_names, n=n)

# top male names of the remaining lists
top_fswe_male_names = get_top_n_names("Parsed/fswe_male_names.txt", all_male_names, n=n)
top_sami_male_names = get_top_n_names("Parsed/sami_male_names.txt", all_male_names, n=n, normalize=True)
top_rus_male_names = get_top_n_names("Parsed/rus_male_names.txt", all_male_names, n=n)
top_roma_male_names = get_top_n_names("Parsed/roma_male_names.txt", all_male_names, n=n)
top_som_male_names = get_top_n_names("Parsed/som_male_names.txt", all_male_names, n=n)

In [7]:
# get dict for creating context sentences on top n names (additional to nationality adjectives)
import os
from transformers import BertTokenizer, BertModel, BertForMaskedLM

# Load the tokenizer of the 'TurkuNLP/bert-base-finnish-cased-v1' checkpoint;
# it is used below to check for out-of-vocabulary names.
tokenizer = BertTokenizer.from_pretrained('TurkuNLP/bert-base-finnish-cased-v1')

def extract_ethnicity_gender(filename):
    """Split a name-list file name into its ethnicity and gender parts.

    The file name is read as ``<ethnicity>_<gender>_<rest>``: the text before
    the first underscore is the ethnicity and the text between the first and
    the second underscore is the gender.

    ``str.partition`` is used so that a missing underscore leaves the
    affected part whole instead of cutting off its last character. Without
    any underscore the whole name is returned as the ethnicity and the
    gender is empty.

    Args:
        filename: File name without directory, e.g. ``fin_female_names.txt``.

    Returns:
        A tuple ``(ethnicity, gender)`` of strings.
    """
    ethnicity, _, remainder = filename.partition('_')
    gender = remainder.partition('_')[0]
    return ethnicity, gender

# ethnicity -> gender -> top names that are in the tokenizer vocabulary
result = {}

for filename in os.listdir("Top"):
    filepath = os.path.join("Top", filename)
    # keep track of current ethnicity and gender
    ethnicity, gender = extract_ethnicity_gender(filename)

    with open(filepath, encoding='utf-8') as names_f:
        names = [line.strip() for line in names_f]

    # A name is kept only when it is a single token of the vocabulary.
    # Comparing against tokenizer.unk_token_id instead of a hard-coded id
    # keeps the check valid for any vocabulary layout.
    result.setdefault(ethnicity, {})[gender] = [
        name for name in names
        if tokenizer.convert_tokens_to_ids(name) != tokenizer.unk_token_id
    ]

result


{'fin': {'female': ['Tuula',
   'Anne',
   'Päivi',
   'Anna',
   'Ritva',
   'Leena',
   'Pirjo',
   'Sari',
   'Minna',
   'Marja'],
  'male': ['Juha',
   'Timo',
   'Matti',
   'Kari',
   'Mikko',
   'Jari',
   'Antti',
   'Jukka',
   'Mika',
   'Markku']},
 'swe': {'female': ['Laura',
   'Aino',
   'Heidi',
   'Hanna',
   'Sanna',
   'Maria',
   'Anja',
   'Johanna',
   'Paula',
   'Ulla'],
  'male': ['Janne',
   'Ville',
   'Markus',
   'Leo',
   'Kalle',
   'Elias',
   'Jesse',
   'Mikael',
   'Joel',
   'Lasse']},
 'sami': {'female': ['Heidi', 'Elen'],
  'male': ['Ville',
   'Juho',
   'Elias',
   'Joel',
   'Otto',
   'Hugo',
   'Jonne',
   'Viktor']},
 'rus': {'female': ['Nina', 'Sonja', 'Marianne', 'Eva', 'Irina', 'Marina'],
  'male': ['Marko',
   'Aleksi',
   'Anton',
   'Stefan',
   'Viktor',
   'Andrei',
   'Nikolai']},
 'roma': {'female': ['Maria', 'Elli', 'Anneli', 'Rauha', 'Hilja'],
  'male': ['Janne',
   'Markus',
   'Kalle',
   'Otto',
   'Valtteri',
   'Roope',
   'S