# Birthplace (fødested) cleaning
Here I inted to clean the birthplace (fødested) field in the data. Notice that the pre-processing scripts are in city_unification (conversion to UTF8, etc.)

## Libraries

In [263]:
#Importing libraries for data management
import pandas as pd
import numpy as np

#Importing libraries for system management
import os

#Distance computation
from pyjarowinkler import distance

#Regular expressions
import re

#Importing multiprocessing libraries
import multiprocessing

In [264]:
#Worning directories
in_folder  = '/data/import/nirama/utf8/'
out_folder = '/home/roregu/workspace/out/'
data_folder = '/home/roregu/workspace/data/'

#Column to process
focus_column = 'fødested'

#The Jaro threshold
THRESHOLD = 0.9

#Number of cores (machine = 28)
n_cores = 20

## Reading DigDag
and expanding with variations

In [265]:
# This function adds extra rows in the dataframe *dd* for the 
# places where *word* exists and replaces it with *replacement*
def adding_extra_rows(dd, word, replacement):
    dd['special'] = dd['simplename'].apply(lambda x: word in x) 
    dd['simplename2'] = dd.simplename.apply(lambda x: x.replace(word,replacement))
    return pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

In [266]:
#The digdag seems to have duplicates which I do not understand why
dd = pd.read_csv(data_folder+'rl_places_digdag_v1.txt', sep = '\t', encoding='utf-16', dtype=str)
dd_org = dd.copy()

#Getting the Købstad and putting it to the original data
dd2 = pd.read_csv(data_folder+'koebstad.csv', sep = ';', encoding='utf-8', dtype=str)
dd2['enhedtype'] = '80'#dd2['art'] 
dd = pd.concat([dd, dd2[~dd2.isna()][['navn','enhedid','enhedtype','art','simplename']]])
dd['simplename'] = dd['simplename'].astype(str)
del dd2

#Moving the Købstad to the right column
dd['art'] = np.where(dd['enhedtype'] == 'Købstad', ['Købstadskommune']*len(dd), dd['art'])

#adding a conversion of special characters to latin letters (å -> aa, ø -> oe, æ -> ae) and adding them to the reference list
dd = adding_extra_rows(dd, 'å', 'aa')
dd = adding_extra_rows(dd, 'ø', 'oe')
dd = adding_extra_rows(dd, 'æ', 'ae')

#Preparing the names to append them at the simple name
dd['art'] = dd['art'].apply({'Amt':'amt', 'Sogn':'sogn','Købstadskommune':'købstad', 'Geografisk Herred':'herred', 'Processing':''}.get)

#Adding a duplicated list of their unit type to increase matches
dd['simplename2'] = dd.apply(lambda row: str(row['simplename'])+' '+str(row['art']), axis=1)
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Adding the plural købstæder as well
dd = adding_extra_rows(dd, 'købstad', 'købstæder')

#Adding spaces instead of hyphens (new rows)
dd = adding_extra_rows(dd, '-', ' ')

#Removing spaces (new rows)
dd = adding_extra_rows(dd, ' ', '')

#Adding bysogn and landsogn (new rows)
dd = adding_extra_rows(dd, 'sogn', 'bysogn')
dd = adding_extra_rows(dd, 'sogn', 'landsogn')

#adding rows where the last letter's "e" will be removed
dd['special'] = dd['simplename'].str[-1] == 'e'
dd['simplename2'] = dd['simplename'].str[:-1]
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Dropping duplicates from digdag
#print(dd[dd.duplicated('simplename', keep=False)].sort_values('simplename').head(30))
dd.drop_duplicates(['art','simplename'], keep='first', inplace=True)

#Drop 'Non' from the list, this may be one of my artifacts...
dd = dd[(dd.simplename.apply(lambda x: not 'Non' in x))]

#Adding sweeden into the equation
dd = dd.append({'navn':'Sweden','enhedid':400000,'enhedtype':'90','art':'country','simplename':'sverige'}, ignore_index=True).append({'navn':'Sweden','enhedid':400000,'enhedtype':'90','art':'country','simplename':'sverrig'}, ignore_index=True)

# TODO take into consideration that the same city name can be in varios geographical areas...
# It should be matched with herred to increase accuracy for each record. However, right now we're only cleaning
# not sure how pressing the issue is...
nr_sogn = len(dd_org[dd_org.art=='Sogn'].simplename.unique()) 
dd.head(20)

#TODO: Right now I can map things to amt for example which should not be the case...
#dd =dd[dd.art='sogn']

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
0,Kronborg Amt,118765,11,amt,kronborg
1,Præstø Amt,118791,11,amt,præstø
2,Bornholms Amt,118792,11,amt,bornholms
3,Svendborg Amt,118813,11,amt,svendborg
4,Ålborg Amt,118819,11,amt,ålborg
5,Sorø Amt,118785,11,amt,sorø
6,Århus Amt,118846,11,amt,århus
7,Hjørring Amt,118820,11,amt,hjørring
8,Bøvling Amt,118851,11,amt,bøvling
9,Vejle Amt,118849,11,amt,vejle


### Adding countries and islands
Countries: Barbara's list, but missing countries before the german, and italian unifications

Islands: From mads from Danmark's Stednavne (You'll find the standardized name in the "opslagsform" column)

In [267]:
#TODO

In [268]:
print('length original:\t', len(dd_org), '\nlength modified:\t', len(dd), '\nunique sogn keys:\t', nr_sogn)

length original:	 2519 
length modified:	 26874 
unique sogn keys:	 1924


## Processing functions

In [269]:
#Get the df from the $ separated file (for some reason pandas has problems with random lines)
def get_df(f_path):
    r= []
    columns = []
    with open(f_path) as f:
        first = True
        for line in f:
            line = line.rstrip().split('$')
            if first:
                length = len(line)
                columns = line
                first=False
            else:
                r.append(line[:length])
                
    return pd.DataFrame(data=r, columns = columns)

In [270]:
# Name cleaning function
def name_cleaner(s, working_column):
    try:
        o = s[working_column].lower().strip().replace('  ', ' ').replace('*', '').replace('?', '').replace('.', '')
        
        #If "do" (ditto= same as above) and people write what is same, then I keep what they wrote
        if o[:3] == 'do ':
            o = re.findall(r'do\.?\s\[?\(?(.+)',o)[0]
            
        #Get some clarifications
        if o[:4] == 'her ':
            o = re.findall(r'her\.?\s\[?\(?(.+)',o)[0]
            
        #if it says "her i sogn", I get the value for "Sogn" and put it there
        if ('heri sogn' in o or 'her i sogn' in o or 'her i s' == o or 'i sognet' == o[:8] or 'født i sognet' == o or 'sognet' == o or 'her' == o or 'her i byen'  == o or 'h i s'  == o or 'heri s'  == o or 'h i sognet' == o) and working_column != 'Sogne':
            return name_cleaner(s, 'Sogne')
        
        # Removing variability not catched by Jaro for Copenhagen
        if o in 'kbhvn*kbhv*kjøbenh*kjøbh*kbhvn*kbhavn*kjøbhvn':
            return 'kobenhavn'

        return o
    except:
        return np.nan

In [271]:
# List of files to work (with the 1901 collapsed)
#This I did
working_data = ['ft1845_LL.txt',
 'ft1850_LL.txt',
 'ft1860_LL.txt',
 'ft1880_LL.txt',
 'ft1885_LL.txt',
 'ft1901_LL.txt']

#This Nicolai
working_data = ['ft1787.txt',
'ft1801.txt',
'ft1803.txt',
'ft1834.txt',
'ft1840.txt',
'ft1845.txt',
'ft1850.txt',
'ft1860.txt',
'ft1880.txt',
'ft1885.txt',
'ft1901.txt']

#This Nicolai
working_data = ['ft1845.txt',
'ft1850.txt',
'ft1860.txt',
'ft1880.txt',
'ft1885.txt',
'ft1901.txt']

## Loading data and counting

In [276]:
#This is where I save the counts and then group it together
out_list = []

#where I save the missmatches and file year
missmatches = []
total_records = 0

#for f in working_data:
 
def data_loader(f):
    
    print(f)
    #Loading the data
    df = get_df(in_folder+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    df = df[~df[focus_column].isna() & (df[focus_column] !='')]
    print(len(df))
    
    #Preforming the name cleaning, be aware that I replace "her i sogn" in there
    df[focus_column+'_data'] = df.apply(lambda row: name_cleaner(row, focus_column), axis=1)
    
    #Reducing Dimensionality and saving in RAM
    out = df.groupby(focus_column+'_data').size().reset_index(name = 'counts')
    out['year'] = f[2:6]
    return out

with multiprocessing.Pool(processes=n_cores) as pool:
    for result in pool.starmap(data_loader, zip(working_data)):
        print('done', f)
        out_list.append(result)

out_list = pd.concat(out_list, sort=False)
out_list.sort_values('counts',ascending=False)
print('Done! :D')

ft1845.txt
ft1850.txt
ft1901.txt
ft1860.txt
ft1880.txt
ft1885.txt
318056
1359777
1281746
1700886
1759664
2046684
done ft1901.txt
done ft1901.txt
done ft1901.txt
done ft1901.txt
done ft1901.txt
done ft1901.txt
Done! :D


NameError: name 'df' is not defined

In [None]:
#Putting it nicely, on a more horitzontal table
out_list = out_list.pivot(focus_column+'_data', 'year', 'counts').fillna(0).astype(int).reset_index().reset_index(drop=True)
out_list = pd.DataFrame(out_list.values, columns=out_list.columns.tolist()) #Getting rid of the wrong index
out_list['total'] = out_list.apply(lambda row: row['1845']+row['1850']+row['1860']+row['1880']+row['1885']+row['1901'],axis=1)
out_list.sort_values('total', ascending=False, inplace=True)
out_list.total.sum() #Around the 14k first places compule 80% of the people
out_list.head(50)

# Name splitting 
if there is sogn and amt in the same value I keep the amt and save the rest

In [None]:
def do_splitting(row):
    r = row[focus_column+'_data'].split(',')
    
    try:
        s = r[0]
        a = r[1]
    except:
        s = r[0]
        a = r[0]
    
    for i in range(len(r)):
        if 'sogn' in r[i] or 'købstad' in r[i]: 
            if 'sogn' == r[i].strip(' ') or 'købstad' == r[i].strip(' ') : s = r[i-1]
            else: s = r[i]
        elif 'amt' in r[i]: 
            if 'amt' == r[i].strip(' '): a = r[i-1]
            else: a = r[i]
                
    #When the comma does not separate
    if s == a and (a[-4:] != 'sogn' or a[-7:] != 'købstad'):
        if 'sogn' in a:
            r = a.split('sogn')
            return r[0], r[1]
        elif 'købstad' in a:
            r = a.split('købstad')
            return r[0], r[1]
    
    return s, a


#commas[[focus_column+'_sogn_data', focus_column+'_amt_data']] = commas[focus_column+'_data'].str.split(",", n=1,expand=True) 
out_list[focus_column+'_org'] = out_list[focus_column+'_data']
out_list[[focus_column+'_data', focus_column+'_amt_data']] = out_list.apply(do_splitting, axis=1, result_type="expand")
out_list

In [None]:
#Todo: Check wich sogn are repeated within different amts and treat them differently

## Basic matches

In [None]:
#Notice that I only match for Sogn
out = out_list.merge(dd[['navn','simplename', 'art', 'enhedid']], left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_uniquevalues_01.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

## Computing Jaro distances for the missmatches

In [None]:
def compute_jaro(a, bb, i):
    o=[]
    for b in bb:
        try:
            o.append(distance.get_jaro_distance(a, b))
        except:
            o.append(0)
    return i, o


mm = miss[focus_column+'_data'].unique()
sn = dd_org.simplename.unique() # Using the short version of dig dag to make it much faster (>x10)
distance_jaro = np.zeros((len(sn),len(mm)))


with multiprocessing.Pool(processes=n_cores) as pool:
    for i , result in pool.starmap(compute_jaro, zip(sn, [mm]*len(sn), range(len(sn)))):
        if i%100 == 0: print(i, 'out of' , len(sn))
        distance_jaro[i,:] = result

        
#Index is the reference names found in digdag the columns the original data
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
print(distance_jaro.shape, 'len mm,sn:', len(mm), len(sn)) #(26428, 1237)
del mm, sn
distance_jaro.head()

## Computing the best match with a minimum threshold

In [None]:
#Getting the highest match for each Sogne and filtering threshold
a = distance_jaro.idxmax().reset_index(name = 'potential_match')
b = distance_jaro.max().reset_index(name='jaro')
dfmap = a.merge(b, on='index').rename(columns={'index':'data_name'})
dfmap.to_csv(out_folder+'mapping_birthplaces.tsv', sep='\t', index=False)
dfmap = dfmap[dfmap.jaro.astype(float) >= THRESHOLD]
dfmap.head()
del a
del b

#Getting the mapped info into DigDag v2 (not really the v2)
dd2 = dfmap.merge(dd[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename', keep='first'), left_on='potential_match', right_on='simplename')
dd2['simplename'] = dd2['data_name']
dd2 = dd2[['navn','enhedid','art','simplename']]
dd2 = pd.concat([dd2[['navn','simplename', 'art', 'enhedid']], dd], sort=False) #putting together
dd2.head()

In [None]:
# Starting again with the merges
out = out_list.merge(dd2, left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_jaro0.9_01.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

## Working with the missmatches after jaro

### Some of the missmatches have sogn in the name, so I will do another jaro for "XXXX sogn"

In [None]:
def compute_jaro(a, bb, i):
    o=[]
    for b in bb:
        try:
            o.append(distance.get_jaro_distance(a, b))
        except:
            o.append(0)
    return i, o


mm = miss[focus_column+'_data'].unique()
sn = dd[dd.simplename.str[-5:] == ' sogn'].simplename.unique() # Using the short version of dig dag to make it much faster (>x10)
distance_jaro = np.zeros((len(sn),len(mm)))


with multiprocessing.Pool(processes=n_cores) as pool:
    for i , result in pool.starmap(compute_jaro, zip(sn, [mm]*len(sn), range(len(sn)))):
        if i%100 == 0: print(i, 'out of' , len(sn))
        distance_jaro[i,:] = result

        
#Index is the reference names found in digdag the columns the original data
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
print(distance_jaro.shape, 'len mm,sn:', len(mm), len(sn)) #(26428, 1237)
del mm, sn
distance_jaro.head()

In [None]:
#Getting the highest match for each Sogne and filtering threshold
a = distance_jaro.idxmax().reset_index(name = 'potential_match')
b = distance_jaro.max().reset_index(name='jaro')
dfmap = a.merge(b, on='index').rename(columns={'index':'data_name'})
dfmap.to_csv(out_folder+'mapping_birthplaces.tsv', sep='\t', index=False)
dfmap = dfmap[dfmap.jaro.astype(float) >= THRESHOLD]
dfmap.head()
del a
del b

#Getting the mapped info into DigDag v2 (not really the v2)
dd3 = dfmap.merge(dd2[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename', keep='first'), left_on='potential_match', right_on='simplename')
dd3['simplename'] = dd3['data_name']
dd3 = dd3[['navn','enhedid','art','simplename']]
dd3 = pd.concat([dd2[['navn','simplename', 'art', 'enhedid']], dd], sort=False) #putting together
dd3.head()

In [None]:
# Starting again with the merges
out = out_list.merge(dd3, left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_jaro0.9_02.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

In [None]:
dd_org[dd_org.simplename.str[:5] == 'køben']

In [None]:
distance.get_jaro_distance('kjøbenhavn', 'københavn')

In [None]:
print(miss.shape)
miss.head(50)

In [None]:

0/0


# Adding DigDag info and saving 

In [None]:
# Getting the digDag info for the matches but also for the non matches

m = concatenate.merge(dd2[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename',keep='first'), left_on='potential_match', right_on='simplename', how='inner')
m = m[['original','potential_match','score','navn','art','enhedid', 'simplename']]

w = dd2.sort_values('art', ascending=False).drop_duplicates('simplename',keep='first')
w['potential_match'] = w['simplename']
w['original'] = w['simplename']
w['score'] = 1

m = pd.concat([w, m], sort=True)


#Putting the things into place
m['original'] = np.where(m.original.isna(), m['simplename'], m['original'])

print(len(concatenate), len(m))
m.head()

In [None]:
#Getting the counts for the original plus the counts for the potential matches in DigDag
#Warning: Potential matches may not exist in the data...
out = m.merge(out_list, left_on='original', right_on=focus_column+'_data', how='left').merge(out_list, left_on='potential_match', right_on=focus_column+'_data', how='left').fillna(0)
for y in ['1845', '1850', '1860', '1880', '1885', '1901']:
    out[y] = out[y+'_x'] + out[y+'_y']
out = out[['original', 'potential_match', 'score', 'navn', 'art', 'enhedid', '1845', '1850', '1860', '1880', '1885', '1901']].sort_values('original')
out.to_csv(_path+'../out/birthplaces_possible_matches_jaro_01.tsv', sep='\t', index=False)

In [None]:
print('Done! :D')

## Generating file to check the validity of the mapping jaro 0.9

In [None]:
import random
import pandas as pd

df = pd.read_csv(out_folder+'birthplaces_FT_jaro0.9_01.tsv', sep = '\t', dtype=str)

max_example = 5

names    = df.navn.tolist()
extendeds = df.extended_digdag.tolist()

out = []
for name, extended in zip(names, extendeds):
    extended = list(eval(extended))
    #m = min(len(extended), max_example)
    samp = random.sample(extended, k= min(len(extended), max_example))
    for s in samp:
        out.append([name, s])

dfout = pd.DataFrame(out, columns= ['DigDag', 'matches (0.9)'])
dfout.to_csv(out_folder+'validity_jaro.tsv', sep='\t', index=False)

! cp $out_folder/validity_jaro.tsv /data/import/roc/validity_jaro.tsv

print('Saved!')