# Birthplace (fødested) cleaning
Here I inted to clean the birthplace (fødested) field in the data. Notice that the pre-processing scripts are in city_unification (conversion to UTF8, etc.)

## Libraries

In [13]:
#Importing libraries for data management
import pandas as pd
import numpy as np

#Importing libraries for system management
import os

#Distance computation
from pyjarowinkler import distance
#import Levenshtein

In [None]:
in_folder  = 
out_folder =
roc_folder = 

## Reading DigDag
and expanding with variations

In [14]:
# This function adds extra rows in the dataframe *dd* for the 
# places where *word* exists and replaces it with *replacement*
def adding_extra_rows(dd, word, replacement):
    dd['special'] = dd['simplename'].apply(lambda x: word in x) 
    dd['simplename2'] = dd.simplename.apply(lambda x: x.replace(word,replacement))
    return pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

In [15]:
#The digdag seems to have duplicates which I do not understand why
dd = pd.read_csv('../data/rl_places_digdag_v1.txt', sep = '\t', encoding='utf-16', dtype=str)
dd_org = dd.copy()

#Getting the Købstad and putting it to the original data
dd2 = pd.read_csv('../data/koebstad.csv', sep = ';', encoding='utf-8', dtype=str)
dd2['enhedtype'] = dd2['art'] 
dd = pd.concat([dd, dd2[~dd2.isna()][['navn','enhedid','enhedtype','art','simplename']]])
dd['simplename'] = dd['simplename'].astype(str)


#Moving the Købstad to the right column
dd['art'] = np.where(dd['enhedtype'] == 'Købstad', ['Købstadskommune']*len(dd), dd['art'])

#adding a conversion of special characters to latin letters (å -> aa, ø -> oe, æ -> ae) and adding them to the reference list
dd = adding_extra_rows(dd, 'å', 'aa')
dd = adding_extra_rows(dd, 'ø', 'oe')
dd = adding_extra_rows(dd, 'æ', 'ae')

#Preparing the names to append them at the simple name
dd['art'] = dd['art'].apply({'Amt':'amt', 'Sogn':'sogn','Købstadskommune':'købstad', 'Geografisk Herred':'herred', 'Processing':''}.get)

#Adding a duplicated list of their unit type to increase matches
dd['simplename2'] = dd.apply(lambda row: str(row['simplename'])+' '+str(row['art']), axis=1)
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Adding the plural købstæder as well
dd = adding_extra_rows(dd, 'købstad', 'købstæder')

#Adding spaces instead of hyphens (new rows)
dd = adding_extra_rows(dd, '-', ' ')

#Removing spaces (new rows)
dd = adding_extra_rows(dd, ' ', '')

#Adding bysogn and landsogn (new rows)
dd = adding_extra_rows(dd, 'sogn', 'bysogn')
dd = adding_extra_rows(dd, 'sogn', 'landsogn')

#adding rows where the last letter's "e" will be removed
dd['special'] = dd['simplename'].str[-1] == 'e'
dd['simplename2'] = dd['simplename'].str[:-1]
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Dropping duplicates from digdag
#print(dd[dd.duplicated('simplename', keep=False)].sort_values('simplename').head(30))
dd.drop_duplicates(['art','simplename'], keep='first', inplace=True)

#Drop 'Non' from the list, this may be one of my artifacts...
dd = dd[(dd.simplename.apply(lambda x: not 'Non' in x))]

# TODO take into consideration that the same city name can be in varios geographical areas...
# It should be matched with herred to increase accuracy for each record. However, right now we're only cleaning
# not sure how pressing the issue is...
nr_sogn = len(dd_org[dd_org.art=='Sogn'].simplename.unique()) 
dd.head(20)

#TODO: Right now I can map things to amt for example which should not be the case...
#dd =dd[dd.art='sogn']

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
0,Kronborg Amt,118765,11,amt,kronborg
1,Præstø Amt,118791,11,amt,præstø
2,Bornholms Amt,118792,11,amt,bornholms
3,Svendborg Amt,118813,11,amt,svendborg
4,Ålborg Amt,118819,11,amt,ålborg
5,Sorø Amt,118785,11,amt,sorø
6,Århus Amt,118846,11,amt,århus
7,Hjørring Amt,118820,11,amt,hjørring
8,Bøvling Amt,118851,11,amt,bøvling
9,Vejle Amt,118849,11,amt,vejle


In [16]:
print('length original:\t', len(dd_org), '\nlength modified:\t', len(dd), '\nunique sogn keys:\t', nr_sogn)

length original:	 2519 
length modified:	 27097 
unique sogn keys:	 1924


## Processing functions

In [17]:
#Get the df from the $ separated file (for some reason pandas has problems with random lines)
def get_df(f_path):
    r= []
    columns = []
    with open(f_path) as f:
        first = True
        for line in f:
            line = line.rstrip().split('$')
            if first:
                length = len(line)
                columns = line
                first=False
            else:
                r.append(line[:length])
                
    return pd.DataFrame(data=r, columns = columns)

In [18]:
# Name cleaning function
def name_cleaner(s, working_column):
    try:
        o =s[working_column].lower().rstrip().replace('  ', ' ')
        #if it says "her i sogn", I get the value for "Sogn" and put it there
        if ('heri sogn' in o or 'her i sogn' in o) and working_column != 'Sogne':
            return name_cleaner(s, 'Sogne')
        return o
    except:
        return np.nan

In [19]:
# List of files to work (with the 1901 collapsed)
working_data = ['ft1845_LL.txt',
 'ft1850_LL.txt',
 'ft1860_LL.txt',
 'ft1880_LL.txt',
 'ft1885_LL.txt',
 'ft1901_LL.txt']

## Loading data and counting

In [20]:
_path = '../utf8/'
focus_column = 'fødested'

In [21]:
#This is where I save the counts and then group it together
out_list = []

#where I save the missmatches and file year
missmatches = []
total_records = 0

for f in working_data:
    
    print(f)
    #Loading the data
    df = get_df(_path+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    #TODO: Fix the conversion thing (ask Barbara to provide me the UTF-8...)
    df = df[~df[focus_column].isna() & (df[focus_column] !='')]
    print(len(df))
    total_records = total_records + len(df)
    
    #Preforming the name cleaning, be aware that I replace "her i sogn" in there
    df[focus_column+'_data'] = df.apply(lambda row: name_cleaner(row, focus_column), axis=1)
    
    #Reducing Dimensionality and saving in RAM
    out = df.groupby(focus_column+'_data').size().reset_index(name = 'counts')
    out['year'] = f[2:6]
    out_list.append(out)

    
out_list = pd.concat(out_list, sort=False)
print('Done! :D')

out_list.sort_values('counts',ascending=False)

ft1845_LL.txt
1415056
ft1850_LL.txt
1383276
ft1860_LL.txt
1715130
ft1880_LL.txt
1960921
ft1885_LL.txt
325340
ft1901_LL.txt
1975438
Done! :D


Unnamed: 0,fødested_data,counts,year
19850,kjøbenhavn,134653,1885
53400,kjøbenhavn,85495,1860
78465,kjøbenhavn,75245,1880
83080,københavn,68462,1880
58603,kjøbenhavn,46942,1845
...,...,...,...
87220,"staby sogn,ringkøbing amt",1,1860
87221,staby sogn. ringkjøbing amt,1,1860
87223,"staby, rgk.a.",1,1860
87227,"staby, ringkjøbing a",1,1860


In [22]:
#Putting it nicely, on a more horitzontal table
out_list = out_list.pivot(focus_column+'_data', 'year', 'counts').fillna(0).astype(int).reset_index().reset_index(drop=True)
out_list = pd.DataFrame(out_list.values, columns=out_list.columns.tolist()) #Getting rid of the wrong index
out_list['total'] = out_list.apply(lambda row: row['1845']+row['1850']+row['1860']+row['1880']+row['1885']+row['1901'],axis=1)
out_list.sort_values('total', ascending=False, inplace=True)
out_list.total.sum() #Around the 14k first places compule 80% of the people
out_list.head(50)

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total
281450,kjøbenhavn,46942,44015,85495,75245,134653,22607,408957
298127,københavn,21034,9335,16230,68462,15581,40997,171639
369718,odense,8357,6530,9279,13408,3020,22562,63156
251691,i sognet,15795,26408,12990,4689,0,67,59949
21199,aarhus,4063,4594,1710,12183,1859,22638,47047
19484,aalborg,5986,2750,8049,10335,2160,15334,44614
230135,horsens,4268,3117,5708,8584,1414,12302,35393
207802,helsingør,1190,5830,6776,7832,3040,7490,32158
106756,do.,10,23181,6,5249,0,0,28446
387680,randers,5773,4068,7421,3934,1811,4474,27481


## Basic matches

In [11]:
#Notice that I only match for Sogn
out = out_list.merge(dd[['navn','simplename', 'art', 'enhedid']], left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(_path+'../out/birthplaces_FT_uniquevalues_01.tsv', sep='\t', index=False)
print(len(s))
s.head()

2065


Unnamed: 0,navn,enhedid,art,1845,1850,1860,1880,1885,1901,extended_digdag
0,Aabenraa Sogn,113932,sogn,32,16,3667,266,105,133,"{aabenraa sogn, aabenraa}"
1,Aaker Sogn,115247,sogn,0,1783,2058,2667,2,280,"{aaker, aaker sogn, aakersogn}"
2,Aal Sogn,115248,sogn,686,775,793,226,0,48,"{aal sogn, aalsogn, aal}"
3,Aarestrup Sogn,115255,sogn,299,284,344,222,1,35,"{aarestrup, aarestrup sogn}"
4,Abild Sogn,113795,sogn,24,1,36,1,3,9,"{abild, abild sogn}"


## Computing Jaro distances for the missmatches

In [12]:
#Getting the potential first matches

mm = miss[focus_column+'_data'].unique()
sn = dd_org.simplename.unique() # Using the short version of dig dag to make it much faster (>x10)

distance_jaro = np.zeros((len(sn),len(mm)))

def compute_jaro(a,b):
    try:
        return distance.get_jaro_distance(a, b)
    except:
        return 0

for i in range(len(sn)):
    if i%100 == 0: print(i, 'out of' , len(sn))
        
    #Computing jaro only
    distance_jaro[i] = miss[focus_column+'_data'].apply(lambda x: compute_jaro(sn[i], x)).values
    
        
#Index is the reference names found in digdag the columns the original data
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
print(distance_jaro.shape, 'len mm,sn:', len(mm), len(sn)) #(26428, 1237)
distance_jaro.head()

0 out of 2083
100 out of 2083
200 out of 2083
300 out of 2083
400 out of 2083
500 out of 2083
600 out of 2083
700 out of 2083
800 out of 2083
900 out of 2083
1000 out of 2083
1100 out of 2083
1200 out of 2083
1300 out of 2083
1400 out of 2083
1500 out of 2083
1600 out of 2083
1700 out of 2083
1800 out of 2083
1900 out of 2083
2000 out of 2083
(2083, 606466) len mm,sn: 606466 2083


Unnamed: 0,kjøbenhavn,i sognet,do.,sverrig,sverige,slesvig,kiøbenhavn,??,-,her i s.,...,"høien, nør jylland","høien, holbæk","høien, aarhuus amt","høien,",høien weile amt,høien vejle,høien veile amt??,høien veile,høien v a,ẅẅs. vilstrup . vejle
kronborg,0.5,0.47,0.49,0.6,0.51,0.42,0.5,0.0,0.0,0.42,...,0.45,0.47,0.45,0.43,0.4,0.41,0.39,0.41,0.41,0.39
præstø,0.42,0.53,0.0,0.37,0.37,0.44,0.42,0.0,0.0,0.53,...,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4
bornholms,0.54,0.49,0.48,0.42,0.42,0.0,0.54,0.0,0.0,0.46,...,0.28,0.5,0.28,0.43,0.4,0.42,0.39,0.42,0.31,0.44
svendborg,0.43,0.59,0.0,0.83,0.77,0.63,0.43,0.0,0.0,0.41,...,0.5,0.52,0.5,0.52,0.45,0.47,0.45,0.47,0.48,0.49
ålborg,0.42,0.53,0.0,0.54,0.54,0.54,0.42,0.0,0.0,0.43,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
distance_jaro.to_csv(_path+'../out/jaro_birthplaces.tsv', sep='\t', index=False)

## Computing the best match with a minimum threshold

In [14]:
THRESHOLD = 0.9

#Getting the highest match for each Sogne and filtering threshold
a = distance_jaro.idxmax().reset_index(name = 'potential_match')
b = distance_jaro.max().reset_index(name='jaro')
dfmap = a.merge(b, on='index').rename(columns={'index':'data_name'})
dfmap.to_csv(_path+'../out/mapping_birthplaces.tsv', sep='\t', index=False)
dfmap = dfmap[dfmap.jaro.astype(float) >= THRESHOLD]
dfmap.head()

#Getting the mapped info into DigDag v2 (not really the v2)
dd2 = dfmap.merge(dd[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename', keep='first'), left_on='potential_match', right_on='simplename')
dd2['simplename'] = dd2['data_name']
dd2 = dd2[['navn','enhedid','art','simplename']]
dd2 = pd.concat([dd2[['navn','simplename', 'art', 'enhedid']], dd], sort=False) #putting together
dd2.head()

Unnamed: 0,navn,simplename,art,enhedid,enhedtype
0,Staden København,kjøbenhavn,købstad,120663,
1,Staden København,kiøbenhavn,købstad,120663,
2,Staden København,kjöbenhavn,købstad,120663,
3,Staden København,københavn.,købstad,120663,
4,Staden København,kiöbenhavn,købstad,120663,


In [15]:
# Starting again with the merges
out = out_list.merge(dd2, left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(_path+'../out/birthplaces_FT_jaro0.9_01.tsv', sep='\t', index=False)
print(len(s))
s.head()

2179


Unnamed: 0,navn,enhedid,art,1845,1850,1860,1880,1885,1901,extended_digdag
0,Aabenraa Sogn,113932,sogn,41,19,3698,288,118,158,"{v. aabenraa, aabenbaa, (aabenraa), aabenrade,..."
1,Aaker Sogn,115247,sogn,17,1800,2061,2668,5,292,"{aalkjer, aakker, aaker sogn, aakjer, aakerby,..."
2,Aal Sogn,115248,sogn,687,777,800,233,0,53,"{aal., aals, aal??, aalle, aalsogn, aal, aalse..."
3,Aarestrup Sogn,115255,sogn,337,298,370,370,8,490,"{gaarstrup, aarstrup p f, aardetsrup, aarrestr..."
4,Abild Sogn,113795,sogn,29,66,37,2,10,27,"{abildhave, aabild, abbild, abildtorp, abildha..."


## Getting counts for other possible matches (broke here)

In [16]:
# Get the best possible matches

concatenate = []
i=0
for col in distance_jaro.columns:
    i = i+1
    if i % 100 == 0: print(i, 'out of', len(distance_jaro.columns))
    aux = distance_jaro.nlargest(5, col)[[col]]
    aux.columns = ['score']
    aux['original'] = col
    concatenate.append(aux)

concatenate = pd.concat(concatenate, sort=False)
concatenate = concatenate.reset_index()
concatenate.columns = ['potential_match','score','original']
concatenate[['original','potential_match','score']][~concatenate.original.isin(concatenate[(concatenate.score >= THRESHOLD) & ~concatenate.original.isna()].original.unique())].to_csv(_path+'../out/birthplaces_possible_matches_jaro_01.tsv', index=False, sep='\t')

concatenate = pd.read_csv(_path+'../out/birthplaces_possible_matches_jaro_01.tsv',  sep='\t')
concatenate.head(15)

100 out of 606466
200 out of 606466
300 out of 606466
400 out of 606466
500 out of 606466
600 out of 606466
700 out of 606466
800 out of 606466
900 out of 606466
1000 out of 606466
1100 out of 606466
1200 out of 606466
1300 out of 606466
1400 out of 606466
1500 out of 606466
1600 out of 606466
1700 out of 606466
1800 out of 606466
1900 out of 606466
2000 out of 606466
2100 out of 606466
2200 out of 606466
2300 out of 606466
2400 out of 606466
2500 out of 606466
2600 out of 606466
2700 out of 606466
2800 out of 606466
2900 out of 606466
3000 out of 606466
3100 out of 606466
3200 out of 606466
3300 out of 606466
3400 out of 606466
3500 out of 606466
3600 out of 606466
3700 out of 606466
3800 out of 606466
3900 out of 606466
4000 out of 606466
4100 out of 606466
4200 out of 606466
4300 out of 606466
4400 out of 606466
4500 out of 606466
4600 out of 606466
4700 out of 606466
4800 out of 606466
4900 out of 606466
5000 out of 606466
5100 out of 606466
5200 out of 606466
5300 out of 606466
54

KeyboardInterrupt: 

In [None]:
0/0

In [3]:
import pandas as pd
import numpy as np

In [4]:
%timeit pd.read_csv('/home/roregu/workspace/aux.tsv', sep = '\t', usecols=[0]+list(np.arange(1,3)), index_col=0 )

43.9 s ± 2.32 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%timeit pd.read_csv('/home/roregu/workspace/aux.tsv', sep = '\t', index_col=0 )

In [1]:
n_cores = 15
jump=int(606466/n_cores) #len(distance_jaro.columns))
for i in range(0,606466,jump):
    print('python top5_hits_par.py '+str(i)+' '+str(jump+i)+' &')

python top5_hits_par.py 0 40431 &
python top5_hits_par.py 40431 80862 &
python top5_hits_par.py 80862 121293 &
python top5_hits_par.py 121293 161724 &
python top5_hits_par.py 161724 202155 &
python top5_hits_par.py 202155 242586 &
python top5_hits_par.py 242586 283017 &
python top5_hits_par.py 283017 323448 &
python top5_hits_par.py 323448 363879 &
python top5_hits_par.py 363879 404310 &
python top5_hits_par.py 404310 444741 &
python top5_hits_par.py 444741 485172 &
python top5_hits_par.py 485172 525603 &
python top5_hits_par.py 525603 566034 &
python top5_hits_par.py 566034 606465 &
python top5_hits_par.py 606465 646896 &


In [50]:
#This is the script I used to "parallelize" the selection of the top 5 matches per potential matches
import pandas as pd
import numpy as np
import sys
import os

_from = int(sys.argv[1])
_to = int(sys.argv[2])
_top = 5 #Top what? sys.argv[1]
THRESHOLD = 0.9

if os.path.isfile('/home/roregu/workspace/tmp/concatenate_'+str(_from)+'_'+str(_to)+'.tsv'):
    print('done')
    exit()

distance_jaro = pd.read_csv('/home/roregu/workspace/aux.tsv', sep = '\t', usecols=[0]+list(np.arange(_from,_to)), index_col=0 )

#print(distance_jaro.head())

concatenate = []
i=0
for col in distance_jaro.columns:
    i = i+1
    #if i % 100 == 0: print(i, 'out of', len(distance_jaro.columns))
    aux = distance_jaro.nlargest(5, col)[[col]]
    aux.columns = ['score']
    aux['original'] = col
    concatenate.append(aux)

#print(concatenate)
concatenate = pd.concat(concatenate, sort=False)
concatenate = concatenate.reset_index()
concatenate.columns = ['potential_match','score','original']
#print(concatenate)
concatenate[['original','potential_match','score']][~concatenate.original.isin(concatenate[(concatenate.score >= THRESHOLD) & ~concatenate.original.isna()].original.unique())].to_csv('/home/roregu/workspace/tmp/concatenate_'+str(_from)+'_'+str(_to)+'.tsv', index=False, sep='\t')

print('done_'+str(_from)+'_'+str(_to))

exit()

           kjøbenhavn
kronborg         0.50
præstø           0.42
bornholms        0.54
svendborg        0.43
ålborg           0.42
[            score    original
københavn    0.97  kjøbenhavn
københavns   0.94  kjøbenhavn
købelev      0.76  kjøbenhavn
kalvehave    0.72  kjøbenhavn
køge         0.71  kjøbenhavn]
  potential_match  score    original
0       københavn   0.97  kjøbenhavn
1      københavns   0.94  kjøbenhavn
2         købelev   0.76  kjøbenhavn
3       kalvehave   0.72  kjøbenhavn
4            køge   0.71  kjøbenhavn


Unnamed: 0,original,potential_match,score


In [None]:
0/0

In [5]:
#Listing all files in directory 
concatenate = []
for directory, _, files in os.walk('/home/roregu/workspace/tmp/'):
    for file in files:
        concatenate.append(pd.read_csv(directory+file, sep = '\t', dtype=str))
                           
concatenate = pd.concat(concatenate, sort=False)
concatenate.head()

Unnamed: 0,original,potential_match,score
0,"erøe, tranderup sogn",rø,0.7
1,"erøe, tranderup sogn",nørre asmindrup,0.7
2,"erøe, tranderup sogn",nørre sandager,0.69
3,"erøe, tranderup sogn",sønder tranders,0.69
4,"erøe, tranderup sogn",nørre rangstrup,0.69


In [11]:
# Getting the digDag info for the matches but also for the non matches

m = concatenate.merge(dd2[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename',keep='first'), left_on='potential_match', right_on='simplename', how='inner')
m = m[['original','potential_match','score','navn','art','enhedid', 'simplename']]

w = dd2.sort_values('art', ascending=False).drop_duplicates('simplename',keep='first')
w['potential_match'] = w['simplename']
w['original'] = w['simplename']
w['score'] = 1

m = pd.concat([w, m], sort=True)


#Putting the things into place
m['original'] = np.where(m.original.isna(), m['simplename'], m['original'])

print(len(concatenate), len(m))
m.head()

2801435 99


Unnamed: 0.1,Unnamed: 0,art,enhedid,enhedtype,findes i digdag,navn,original,potential_match,score,simplename
0,1,Købstad,120697,Købstad,0,Assens Købstad,assens,assens,1,assens
74,75,Købstad,-1,Købstad,1,Slangerup Købstad,slangerup,slangerup,1,slangerup
72,73,Købstad,120677,Købstad,0,Skælskør Købstad,skælskør,skælskør,1,skælskør
71,72,Købstad,120676,Købstad,0,Skjern Købstad,skjern,skjern,1,skjern
70,71,Købstad,120720,Købstad,0,Skive Købstad,skive,skive,1,skive


In [25]:
#Getting the counts for the original plus the counts for the potential matches in DigDag
#Warning: Potential matches may not exist in the data...
out = m.merge(out_list, left_on='original', right_on='fødested_data', how='left').merge(out_list, left_on='potential_match', right_on=focus_column+'_data', how='left').fillna(0)
for y in ['1845', '1850', '1860', '1880', '1885', '1901']:
    out[y] = out[y+'_x'] + out[y+'_y']
out = out[['original', 'potential_match', 'score', 'navn', 'art', 'enhedid', '1845', '1850', '1860', '1880', '1885', '1901']].sort_values('original')
out.to_csv(_path+'../out/birthplaces_possible_matches_jaro_01.tsv', sep='\t', index=False)

In [26]:
print('Done! :D')

Done! :D


In [24]:
out_list

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total
281450,kjøbenhavn,46942,44015,85495,75245,134653,22607,408957
298127,københavn,21034,9335,16230,68462,15581,40997,171639
369718,odense,8357,6530,9279,13408,3020,22562,63156
251691,i sognet,15795,26408,12990,4689,0,67,59949
21199,aarhus,4063,4594,1710,12183,1859,22638,47047
...,...,...,...,...,...,...,...,...
245096,høien vejle,0,0,0,1,0,0,1
245095,høien veile amt??,0,1,0,0,0,0,1
245092,høien veile,0,0,1,0,0,0,1
245091,høien v a,0,0,0,0,0,1,1
