## Convert first-name statistics (from DVV) from an Excel file to CSV


In [None]:
import pandas as pd

# Read both sheets inside a context manager so the workbook handle is closed
# as soon as the data frames have been loaded.
with pd.ExcelFile('etunimitilasto-2022-02-07-dvv.xlsx') as xls:
    female_all = pd.read_excel(xls, sheet_name='Naiset kaikki')  # TODO take all names or only first names?
    male_all = pd.read_excel(xls, sheet_name='Miehet kaikki')  # TODO take all names or only first names?

# Write each sheet to its own CSV: header row kept, row index left out.
female_all.to_csv('female_names.csv', index=False, header=True)
male_all.to_csv('male_names.csv', index=False, header=True)

## Parse raw first name lists 
i.e. remove all extra characters such as whitespace, HTML tags, etc. (artifacts of copying the data from its sources)

In [1]:
def parse_wiki_name_list(source_file, target_file):
    """Copy the names from a raw list file into a cleaned list file.

    Every line of ``source_file`` is stripped of surrounding whitespace.
    Lines that are empty or only one character long after stripping are
    dropped; the rest are written to ``target_file``, one name per line.

    Args:
        source_file: Path of the raw text file to read.
        target_file: Path of the text file to write. It is opened in append
            mode, so calling this again with the same target adds the names
            a second time.
    """
    # Names contain non-ASCII letters, so do not rely on the platform's
    # default encoding: read and write UTF-8 explicitly.
    with open(source_file, 'r', encoding='utf-8') as source, \
            open(target_file, 'a', encoding='utf-8') as target:
        for line in source:
            name = line.strip()
            # ignore empty lines and titles indicating starting letter of name
            if len(name) > 1:
                target.write(name + '\n')

In [3]:
# Clean each raw "swe"/"fin" name list into its counterpart under Parsed/.
for raw_path, parsed_path in (
    ("Raw/swe_female_names_raw.txt", "Parsed/swe_female_names.txt"),
    ("Raw/swe_male_names_raw.txt", "Parsed/swe_male_names.txt"),
    ("Raw/fin_female_names_raw.txt", "Parsed/fin_female_names.txt"),
    ("Raw/fin_male_names_raw.txt", "Parsed/fin_male_names.txt"),
):
    parse_wiki_name_list(raw_path, parsed_path)

In [2]:
import re

# Matches one HTML tag (non-greedy, so adjacent tags are matched separately).
# Compiled once here instead of on every remove_html call.
_HTML_TAG_PATTERN = re.compile('<.*?>')


def remove_html(text):
    """Return ``text`` with every ``<...>`` tag removed."""
    return _HTML_TAG_PATTERN.sub('', text)


def parse_html_name_list(source_file, target_file):
    """Extract plain names from an HTML-flavoured raw list file.

    For every line of ``source_file`` the HTML tags and surrounding
    whitespace are removed. Lines that are then empty, and lines containing
    the continuation marker ``'cont.'``, are skipped. A trailing `` f`` or
    `` m`` gender marker is removed and the name is written to
    ``target_file``, one per line.

    Args:
        source_file: Path of the raw text file to read.
        target_file: Path of the text file to write. It is opened in append
            mode, so calling this again with the same target adds the names
            a second time.
    """
    # Names contain non-ASCII letters, so do not rely on the platform's
    # default encoding: read and write UTF-8 explicitly.
    with open(source_file, 'r', encoding='utf-8') as source, \
            open(target_file, 'a', encoding='utf-8') as target:
        for raw_line in source:
            name = remove_html(raw_line).strip()
            # ignore empty lines and 'continuation' tags; the emptiness check
            # runs after tag removal so tag-only lines are not written out
            # as blank names
            if not name or 'cont.' in raw_line:
                continue
            # some names in the list are gender neutral - they are marked as f/m in the corresponding lists
            # remove the f/m marking
            if name.endswith((" f", " m")):
                name = name[:-2].rstrip()
            target.write(name + '\n')
                
# Clean both raw "sami" name lists into their counterparts under Parsed/.
for raw_path, parsed_path in (
    ("Raw/sami_female_names_raw.txt", "Parsed/sami_female_names.txt"),
    ("Raw/sami_male_names_raw.txt", "Parsed/sami_male_names.txt"),
):
    parse_html_name_list(raw_path, parsed_path)

In [3]:
# TODO parse Roma, African and Russian names

## Get top n names from each name list

Using DVV statistics on name counts 

In [3]:
import csv

def get_all_names(file):
    """Load a name-count CSV into a dict keyed by name.

    The file must have a header row containing the columns ``'Etunimi'``
    and ``'Lukumäärä'``.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A dict mapping each ``'Etunimi'`` value to the ``'Lukumäärä'`` value
        of the same row, both as the strings read from the file. If a name
        occurs in several rows, the last row wins. A file without data rows
        yields an empty dict.

    Raises:
        KeyError: If a row lacks one of the two required columns.
    """
    # The CSV files are produced with pandas' to_csv, which writes UTF-8 by
    # default; decode explicitly so the non-ASCII header 'Lukumäärä' is found
    # regardless of the platform's default encoding. newline='' is what the
    # csv module asks for when it is handed a file object.
    with open(file, encoding='utf-8', newline='') as all_f:
        reader = csv.DictReader(all_f)
        return {row['Etunimi']: row['Lukumäärä'] for row in reader}

# Name -> count lookups, one per CSV file (female first, then male).
all_female_names, all_male_names = [
    get_all_names(csv_path)
    for csv_path in ("female_names.csv", "male_names.csv")
]

In [4]:
def get_top_n_names(eth_names, all_names, n=10):
    """Return the most common names from a name-list file.

    Args:
        eth_names: Path of a text file with one name per line; surrounding
            whitespace on each line is ignored.
        all_names: Mapping from name to its count. Counts may be ints or
            strings that ``int()`` accepts.
        n: Maximum number of names to return.

    Returns:
        A list of at most ``n`` names that occur both in the file and in
        ``all_names``, ordered by descending count. Each name appears at
        most once; names with equal counts keep their order from the file.
    """
    with open(eth_names, encoding='utf-8') as eth:
        stripped_names = (line.strip() for line in eth)
        # dict.fromkeys drops repeated names while keeping first-seen order,
        # so a name listed twice in the file cannot take two of the n slots.
        found_names = list(dict.fromkeys(
            name for name in stripped_names if name in all_names
        ))
    # sort names by count and select top n
    found_names.sort(key=lambda name: int(all_names[name]), reverse=True)
    return found_names[:n]

In [8]:
# TODO how to deal with same names in both lists?
# TODO handle special characters eg in Sami names (are those characters not in official use in Finland?)

# female names
top_swe_female_names, top_fin_female_names, top_sami_female_names = [
    get_top_n_names(parsed_list, all_female_names, n=10)
    for parsed_list in (
        "Parsed/swe_female_names.txt",
        "Parsed/fin_female_names.txt",
        "Parsed/sami_female_names.txt",
    )
]
# male names
top_swe_male_names, top_fin_male_names, top_sami_male_names = [
    get_top_n_names(parsed_list, all_male_names, n=10)
    for parsed_list in (
        "Parsed/swe_male_names.txt",
        "Parsed/fin_male_names.txt",
        "Parsed/sami_male_names.txt",
    )
]

In [10]:
# Display the top names selected from Parsed/sami_female_names.txt.
top_sami_female_names

['Heidi',
 'Ester',
 'Elle',
 'Siru',
 'Mimmi',
 'Gunvor',
 'Eivor',
 'Linne',
 'Unni',
 'Edel']

In [77]:
# Display the top names selected from Parsed/fin_female_names.txt.
top_fin_female_names

['Maria',
 'Helena',
 'Anneli',
 'Johanna',
 'Kaarina',
 'Hannele',
 'Marjatta',
 'Kristiina',
 'Emilia',
 'Liisa']