In [1]:
#Importing libraries for data management
import pandas as pd
import numpy as np

#Importing libraries for system management
import os

#Distance computation
from pyjarowinkler import distance
import Levenshtein

In [2]:
#TODO: Check if Danish computers have the same problem...
# Switch for the one-off preprocessing cells below (copying and re-encoding the
# raw files, and merging the individual 1901 files). Those cells are skipped
# entirely while this is False.
I_want_to_run_all_the_preprocessing = False

# Converting the current format (ISO-8859-14) into UTF-8 
iconv -f ISO-8859-14 file_in.csv > file_out.tsv

In [3]:

if I_want_to_run_all_the_preprocessing:
    
    #Copying into a new directory
    ! cp -R ../../city_dump/data/LL/ ../../city_dump/data/utf8/

    #Getting all files
    d_path = '../../city_dump/data/utf8/'
    l = next(os.walk(d_path))[2]

    #Converting the files to utf8 (not sure this will work in computers different than mac...)
    for file in l:
        f_path = d_path+file
        tmp_path = d_path+'tmp.csv'
        ! iconv -f ISO-8859-14 $f_path > $tmp_path
        !mv $tmp_path $f_path

# Reading DigDag
and transforming it so that it encodes as many logical variations of the names as possible. This way each variation is applied only once per place in the reference list, whereas applying the same variations to the data would mean transforming every record, which takes much longer. To remember which original place a variation belongs to, we keep the "enhedid", which is the reference id.

In [4]:
# This function adds extra rows in the dataframe *dd* for the 
# places where *word* exists and replaces it with *replacement*
def adding_extra_rows(dd, word, replacement):
    """Return *dd* extended with spelling variants of its place names.

    Every row whose ``simplename`` contains *word* is duplicated with each
    occurrence of *word* replaced by *replacement*; the duplicates are
    appended below the original rows.

    Parameters
    ----------
    dd : pandas.DataFrame
        Must contain the columns ``navn``, ``enhedid``, ``enhedtype``,
        ``art`` and ``simplename``. It is not modified.
    word : str
        Literal substring to look for (not a regular expression).
    replacement : str
        Text substituted for *word* in the duplicated rows.

    Returns
    -------
    pandas.DataFrame
        Only the five reference columns. Index labels are preserved, so a
        variant row carries the same label as the row it was derived from.
    """
    columns = ['navn', 'enhedid', 'enhedtype', 'art', 'simplename']
    base = dd[columns]
    names = base['simplename']

    # Literal containment test; a missing name simply does not match
    # (na=False) instead of raising.
    has_word = names.str.contains(word, regex=False, na=False)

    # Build the variants on a fresh frame so no helper columns are written
    # onto the caller's dataframe.
    variants = base[has_word].assign(
        simplename=names[has_word].str.replace(word, replacement, regex=False)
    )
    return pd.concat([base, variants])

In [5]:
# Builds the reference table *dd*: the DigDag places plus the Købstad list,
# expanded with spelling variants of every name. *dd_org* keeps the DigDag
# file exactly as it was read.
#The digdag seems to have duplicates which I do not understand why
dd = pd.read_csv('../../city_dump/data/rl_places_digdag_v1.txt', sep = '\t', encoding='utf-16', dtype=str)
dd_org = dd.copy()

#Getting the Købstad and putting it to the original data
dd2 = pd.read_csv('../../city_dump/data/koebstad.csv', sep = ';', encoding='utf-8', dtype=str)
dd2['enhedtype'] = dd2['art'] 
# NOTE(review): indexing with the boolean frame `~dd2.isna()` masks cell by cell
# (missing cells stay missing) and drops no rows - confirm whether a row-wise
# dropna() was intended here.
dd = pd.concat([dd, dd2[~dd2.isna()][['navn','enhedid','enhedtype','art','simplename']]])
# After this cast a missing name becomes the literal text 'nan'
dd['simplename'] = dd['simplename'].astype(str)


#Moving the Købstad to the right column
dd['art'] = np.where(dd['enhedtype'] == 'Købstad', ['Købstadskommune']*len(dd), dd['art'])

#adding a conversion of special characters to latin letters (å -> aa, ø -> oe, æ -> ae) and adding them to the reference list
# Each call works on the output of the previous one, so combined variants are produced too
dd = adding_extra_rows(dd, 'å', 'aa')
dd = adding_extra_rows(dd, 'ø', 'oe')
dd = adding_extra_rows(dd, 'æ', 'ae')

#Preparing the names to append them at the simple name
# dict.get returns None for every 'art' value that is not a key of the mapping
dd['art'] = dd['art'].apply({'Amt':'amt', 'Sogn':'sogn','Købstadskommune':'købstad', 'Geografisk Herred':'herred', 'Processing':''}.get)

#Adding a duplicated list of their unit type to increase matches
# str() turns an unmapped (None) unit type into the text 'None' inside the new name
dd['simplename2'] = dd.apply(lambda row: str(row['simplename'])+' '+str(row['art']), axis=1)
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Adding the plural købstæder as well
dd = adding_extra_rows(dd, 'købstad', 'købstæder')

#Adding spaces instead of hyphens (new rows)
dd = adding_extra_rows(dd, '-', ' ')

#Removing spaces (new rows)
dd = adding_extra_rows(dd, ' ', '')

#Adding bysogn and landsogn (new rows)
# The second call also sees the 'bysogn' rows created by the first one, which
# is where names ending in 'bylandsogn' come from
dd = adding_extra_rows(dd, 'sogn', 'bysogn')
dd = adding_extra_rows(dd, 'sogn', 'landsogn')

#adding rows where the last letter's "e" will be removed
dd['special'] = dd['simplename'].str[-1] == 'e'
dd['simplename2'] = dd['simplename'].str[:-1]
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Dropping duplicates from digdag
#print(dd[dd.duplicated('simplename', keep=False)].sort_values('simplename').head(30))
# Only the first row of every (art, simplename) pair survives, so the order in
# which the variants were appended above decides which enhedid is kept
dd.drop_duplicates(['art','simplename'], keep='first', inplace=True)

#Drop 'Non' from the list, this may be one of my artifacts...
# NOTE(review): presumably these are the 'None' names built from unmapped unit
# types above (and their trailing-e-removed form 'Non') - verify.
dd = dd[(dd.simplename.apply(lambda x: not 'Non' in x))]

# TODO take into consideration that the same city name can be in varios geographical areas...
# It should be matched with herred to increase accuracy for each record. However, right now we're only cleaning
# not sure how pressing the issue is...
# Number of distinct Sogn names in the untouched DigDag list (used as the denominator in the match statistics)
nr_sogn = len(dd_org[dd_org.art=='Sogn'].simplename.unique()) 
dd.head(20)

#TODO: Right now I can map things to amt for example which should not be the case...
#dd =dd[dd.art='sogn']

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
0,Kronborg Amt,118765,11,amt,kronborg
1,Præstø Amt,118791,11,amt,præstø
2,Bornholms Amt,118792,11,amt,bornholms
3,Svendborg Amt,118813,11,amt,svendborg
4,Ålborg Amt,118819,11,amt,ålborg
5,Sorø Amt,118785,11,amt,sorø
6,Århus Amt,118846,11,amt,århus
7,Hjørring Amt,118820,11,amt,hjørring
8,Bøvling Amt,118851,11,amt,bøvling
9,Vejle Amt,118849,11,amt,vejle


In [6]:
# Report how much the reference list grew through the added name variants
size_report = (
    'length original:\t', len(dd_org),
    '\nlength modified:\t', len(dd),
    '\nunique sogn keys:\t', nr_sogn,
)
print(*size_report)

length original:	 2519 
length modified:	 27097 
unique sogn keys:	 1924


# Processing functions

In [7]:
#Get the df from the $ separated file (for some reason pandas has problems with random lines)
def get_df(f_path, encoding='utf-8'):
    """Read a ``$``-separated text file into a DataFrame of strings.

    The first line supplies the column names. Every following line is split
    on ``$``; fields beyond the header width are discarded and lines with
    fewer fields are padded with ``None`` so that all rows have the same width.

    Parameters
    ----------
    f_path : str
        Path of the file to read.
    encoding : str, optional
        Text encoding of the file. Defaults to UTF-8 so that reading no
        longer depends on the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        One row per data line; an empty frame when there are no data lines.
    """
    rows = []
    columns = []
    with open(f_path, encoding=encoding) as f:
        for line_number, line in enumerate(f):
            fields = line.rstrip().split('$')
            if line_number == 0:
                columns = fields
                continue
            # Cut over-long lines to the header width ...
            fields = fields[:len(columns)]
            # ... and pad short ones explicitly, so a file in which every
            # data line is short still loads instead of raising.
            fields.extend([None] * (len(columns) - len(fields)))
            rows.append(fields)

    return pd.DataFrame(data=rows, columns=columns)

In [8]:
# Name cleaning function
def name_cleaner(s, column=None):
    """Normalise one place name taken from a record.

    The value is lower-cased, stripped of trailing whitespace, has each
    double space replaced by a single one, and is cut at the first ' ('.

    Parameters
    ----------
    s : mapping or pandas.Series
        The record (row) holding the name.
    column : str, optional
        Key of the value to clean. When omitted, the module-level variable
        ``working_column`` is used, which is how the notebook loops call it.

    Returns
    -------
    str or float
        The cleaned name, or ``np.nan`` when the value is missing or is not
        a string.
    """
    if column is None:
        # Falls back to the loop variable set by the calling cell. If it was
        # never defined, the NameError now surfaces instead of silently
        # turning every name into NaN.
        column = working_column
    try:
        o = s[column].lower().rstrip().replace('  ', ' ')#.replace(' købstad', '')
    except (AttributeError, KeyError, TypeError, IndexError):
        # Missing key or non-string value (None / NaN): nothing to clean
        return np.nan
    if ' (' in o: return o.split(' (')[0]
    return o

# Grouping all 1901 files together

In [9]:
if I_want_to_run_all_the_preprocessing:

    # Folder with the re-encoded census files
    _path = '../../city_dump/data/utf8/'

    # The individual 1901 files that get merged into a single one
    l = [
        'ft1901_LL_aalborg.txt', 'ft1901_LL_aarhus.txt',
        'ft1901_LL_bornholm.txt', 'ft1901_LL_frederiksborg.txt',
        'ft1901_LL_hjoerring.txt', 'ft1901_LL_holbaek.txt',
        'ft1901_LL_kbhv.txt', 'ft1901_LL_maribo.txt',
        'ft1901_LL_odense.txt', 'ft1901_LL_praestoe.txt',
        'ft1901_LL_randers.txt', 'ft1901_LL_ribe.txt',
        'ft1901_LL_ringkoebing.txt', 'ft1901_LL_roskilde.txt',
        'ft1901_LL_skanderborg.txt', 'ft1901_LL_soroe.txt',
        'ft1901_LL_svendborg.txt', 'ft1901_LL_thisted.txt',
        'ft1901_LL_vejle.txt', 'ft1901_LL_viborg.txt',
    ]

    def _load_and_announce(file_name):
        # Print the file name first so progress is visible, then load it
        print(file_name)
        return get_df(_path+file_name)

    # Stack all files into one frame (columns are kept in order of appearance)
    df_list = pd.concat([_load_and_announce(file_name) for file_name in l], sort=False)

    # Write the merged 1901 census next to the other years, same '$' format
    df_list.to_csv(_path+'ft1901_LL.txt', sep='$', index=False)

In [10]:
# Census files processed by the matching cells, one per census year
# (1901 is the single merged file)
working_data = [
    'ft1845_LL.txt',
    'ft1850_LL.txt',
    'ft1860_LL.txt',
    'ft1880_LL.txt',
    'ft1885_LL.txt',
    'ft1901_LL.txt',
]

# Basic matching
(only performs the match if the names are identical)

_This could be done only once by counting reference id and year. Then with the names we can keep constraining with matches_

In [11]:
_path = '../../city_dump/data/utf8/'

#This is where I save the counts and then group it together
out_list = []

#where I save the mismatches and file year
missmatches = []
total_records = 0

for f in working_data:
    
    print(f)
    #Loading the data
    df = get_df(_path+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    #TODO: Fix the conversion thing (ask Barbara to provide me the UTF-8...)
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the loaded one
    df = df[~df.Herred.isna()].copy()
    print(len(df))
    total_records = total_records + len(df)
    
    #Performing the name cleaning for Sogne, Herred, and Amt
    # working_column has to stay a top-level loop variable: name_cleaner
    # reads it as a global to know which column to clean
    for working_column in ['Sogne','Herred','Amt']:
        df[working_column] = df.apply(lambda x: name_cleaner(x), axis=1)

    # Focusing on Sogn that match the Sogn list in DigDag
    out = df.merge(dd[dd.art=='sogn'][['simplename', 'art', 'enhedid']], left_on='Sogne', right_on='simplename', how='left')

    #keeping the mismatches (one row per distinct unmatched Sogne name)
    miss = out[out.art.isna()].drop_duplicates('Sogne', keep='first')
    miss['year'] = f[2:6]
    missmatches.append(miss)
    
    #Counting the matched records per reference id
    out = out[~out.art.isna()].groupby('enhedid').size().reset_index(name='counts')
    
    # setting the year (characters 2-5 of the file name)
    out['year'] = f[2:6]
    
    #Keeping the counts on RAM
    out_list.append(out)
    

#Concatenating all the files (the counts and mismatches)
out_list = pd.concat(out_list, sort=False)
missmatches = pd.concat(missmatches, sort=False)

#Saving the unique file in the HD
# pivot is called with keywords: its arguments are keyword-only from pandas 2.0
# on, so the former positional call raises a TypeError there
out_list = out_list.merge(dd_org, on ='enhedid').pivot(index='enhedid', columns='year', values='counts').fillna(0).astype(int)
out_list.to_csv(_path+'../out/counts.txt', sep='$')
out_list.to_csv(_path+'../out/places_FT_uniquevalues_01.tsv', sep='\t')

print('Done :D')

nr_matched_records = out_list.sum().sum() #Two times sum, first one for year, and second one among years
print('Matched records:',nr_matched_records, 'out of', total_records, '(', nr_matched_records/total_records*100,')%')
print('Matched places:', len(out_list), 'out of', nr_sogn, '(', len(out_list)/nr_sogn*100,')%')


ft1845_LL.txt
1489875
ft1850_LL.txt
1391708
ft1860_LL.txt
1715906
ft1880_LL.txt
1967615
ft1885_LL.txt
327936
ft1901_LL.txt
1979025
Done :D
Matched records: 6256417 out of 8872065 ( 70.51816009012558 )%
Matched places: 1577 out of 1924 ( 81.96465696465697 )%


In [12]:
# How many distinct names stayed unmatched after the first round, per census
# year, largest first
unmatched_per_year = missmatches.groupby('year').size().reset_index(name='counts')
unmatched_per_year.sort_values('counts', ascending=False)

Unnamed: 0,year,counts
5,1901,592
4,1885,556
3,1880,553
0,1845,238
2,1860,159
1,1850,155


# Computing city similarity for all mismatches

In [13]:
#Getting the potential first matches

mm = missmatches.Sogne.unique()
sn = dd.simplename.unique() #Another possibility would be to match it to the original (shorter)

# One row per reference name (sn), one column per unmatched data name (mm);
# cells that are skipped or fail below keep their initial 0
distance_jaro = np.zeros((len(sn),len(mm)))
distance_leven = np.zeros((len(sn),len(mm)))

for i, reference_name in enumerate(sn):
    if i%100 == 0: print(i, 'out of' , len(sn))
        
    for j, data_name in enumerate(mm): #The internal loop could be done using apply (it should be the longer one)
        #If variable is none (or empty) skip
        if not data_name: continue
        
        #This matrix is not symmetric because x and y axis are not the same!
        try:
            distance_jaro[i][j] = distance.get_jaro_distance(reference_name, data_name)
            # Levenshtein distance scaled by the combined length of both names
            distance_leven[i][j] = Levenshtein.distance(reference_name, data_name)/(len(reference_name)+len(data_name))
        except Exception:
            # Only ordinary errors are reported and skipped; a kernel
            # interrupt is no longer swallowed, so this long loop can be stopped
            print('Could not make it',i,j)
            print(reference_name, data_name)
        
#Index holds the DigDag reference names, columns hold the names found in the data
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
distance_leven = pd.DataFrame(data = distance_leven, columns = mm, index = sn)
print(distance_jaro.shape) #(26428, 1237)
distance_jaro.head()

0 out of 26784
100 out of 26784
200 out of 26784
300 out of 26784
400 out of 26784
500 out of 26784
600 out of 26784
700 out of 26784
800 out of 26784
900 out of 26784
1000 out of 26784
1100 out of 26784
1200 out of 26784
1300 out of 26784
1400 out of 26784
1500 out of 26784
1600 out of 26784
1700 out of 26784
1800 out of 26784
1900 out of 26784
2000 out of 26784
2100 out of 26784
2200 out of 26784
2300 out of 26784
2400 out of 26784
2500 out of 26784
2600 out of 26784
2700 out of 26784
2800 out of 26784
2900 out of 26784
3000 out of 26784
3100 out of 26784
3200 out of 26784
3300 out of 26784
3400 out of 26784
3500 out of 26784
3600 out of 26784
3700 out of 26784
3800 out of 26784
3900 out of 26784
4000 out of 26784
4100 out of 26784
4200 out of 26784
4300 out of 26784
4400 out of 26784
4500 out of 26784
4600 out of 26784
4700 out of 26784
4800 out of 26784
4900 out of 26784
5000 out of 26784
5100 out of 26784
5200 out of 26784
5300 out of 26784
5400 out of 26784
5500 out of 26784
5600

Unnamed: 0,nørre bramdrup,nørresundby landsogn,slagelse købstad,stillinge,allinge-sandvig købstæder,rønne købstad,aakirkeby købstad,hasle købstad,svaneke købstad,neksø købstad,...,bernhardsvej,baunegaardsvej,bengtasvej,bogholder alle,boyesgade,broagergade,birkegade,rugaards landevej,esbjerg købstad,silkeborg købstad
kronborg,0.43,0.4,0.4,0.49,0.44,0.54,0.52,0.4,0.4,0.42,...,0.53,0.43,0.45,0.51,0.46,0.56,0.56,0.46,0.41,0.57
præstø,0.41,0.48,0.41,0.52,0.0,0.5,0.41,0.5,0.41,0.5,...,0.42,0.0,0.34,0.0,0.43,0.42,0.43,0.41,0.41,0.41
bornholms,0.4,0.38,0.45,0.31,0.37,0.52,0.39,0.52,0.45,0.46,...,0.69,0.56,0.59,0.71,0.64,0.48,0.53,0.39,0.51,0.28
svendborg,0.41,0.47,0.5,0.6,0.37,0.41,0.39,0.52,0.59,0.5,...,0.51,0.41,0.47,0.4,0.46,0.44,0.44,0.28,0.56,0.71
ålborg,0.41,0.41,0.49,0.52,0.47,0.0,0.41,0.41,0.0,0.0,...,0.5,0.49,0.51,0.57,0.61,0.59,0.61,0.32,0.57,0.71


In [14]:
# Check that 'stilling' exists in the untouched DigDag list
is_stilling = dd_org['simplename'] == 'stilling'
dd_org.loc[is_stilling]

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
1515,Stilling Sogn,114713,76,Sogn,stilling


In [15]:
# Confirms that 'stilling' is also one of the row labels (reference names) of the Jaro matrix
'stilling' in distance_jaro.index

True

In [16]:
# For every name found in the data (one matrix column each) keep the reference
# name with the highest Jaro score, together with that score.
# (data_name = name found in the data, potential_match = name from the extended digdag)
best_name = distance_jaro.idxmax().reset_index(name = 'potential_match')
best_score = distance_jaro.max().reset_index(name='jaro')

c = best_name.merge(best_score, on='index')
c = c.rename(columns={'index':'data_name'})

# The row index is written as an extra first column of the file
c.to_csv(_path+'../out/mapping.tsv', sep='\t')
c.head()

Unnamed: 0,data_name,potential_match,jaro
0,nørre bramdrup,nørre asmindrup,0.9
1,nørresundby landsogn,nørresnedebylandsogn,0.96
2,slagelse købstad,slagelse købstad,1.0
3,stillinge,stilling,0.98
4,allinge-sandvig købstæder,allinge-sandvig købstæder,1.0


# Advanced matching
(This is the same as the basic matching but in this case it matches to the computed best match)

In [17]:
# Minimum Jaro score for a mapping to be accepted; the mapping file holds only
# the best-scoring candidate for each name
THRESHOLD = 0.9

dfmap = pd.read_csv(
    _path+'../out/mapping.tsv',
    sep = '\t',
    dtype=str,
    names=['original','mapped', 'score'],
    skiprows=1,
)

# Scores were read as text, so compare them as floats
score_is_high_enough = dfmap.score.astype(float) >= THRESHOLD
dfmap = dfmap.loc[score_is_high_enough, ['original','mapped']]
dfmap.head()

Unnamed: 0,original,mapped
0,nørre bramdrup,nørre asmindrup
1,nørresundby landsogn,nørresnedebylandsogn
2,slagelse købstad,slagelse købstad
3,stillinge,stilling
4,allinge-sandvig købstæder,allinge-sandvig købstæder


In [18]:
_path = '../../city_dump/data/utf8/'

#This is where I save the counts and then group it together
out_list = []


for f in working_data:
    
    print(f)
    #Loading the data
    df = get_df(_path+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    #TODO: Fix the conversion thing
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the loaded one
    df = df[~df.Herred.isna()].copy()
    print(len(df))
    
    #Performing the name cleaning for Sogne, Herred, and Amt
    # working_column has to stay a top-level loop variable: name_cleaner
    # reads it as a global to know which column to clean
    for working_column in ['Sogne','Herred','Amt']:
        df[working_column] = df.apply(lambda x: name_cleaner(x), axis=1)
    
    #Performing the matching for "inexisting places": names that have an
    #accepted mapping are replaced by their mapped reference name
    df = df.merge(dfmap, left_on='Sogne', right_on='original', how='left')
    df.loc[~df.mapped.isna(), 'Sogne'] = df.loc[~df.mapped.isna(), 'mapped']


    # Focusing on Sogn that match the Sogn list in DigDag
    out = df.merge(dd[dd.art=='sogn'][['simplename', 'art','enhedid']], left_on='Sogne', right_on='simplename', how='left')
    
    #Counting the matched records per reference id
    out = out[~out.art.isna()].groupby('enhedid').size().reset_index(name='counts')
    
    # setting the year (characters 2-5 of the file name)
    out['year'] = f[2:6]
    
    #Keeping the counts on RAM
    out_list.append(out)

    
#Concatenating all the files (the counts)
out_list = pd.concat(out_list, sort=False)


#Saving the unique file in the HD
# pivot is called with keywords: its arguments are keyword-only from pandas 2.0
# on, so the former positional call raises a TypeError there
out_list = out_list.merge(dd_org, on ='enhedid').pivot(index='simplename', columns='year', values='counts').fillna(0).astype(int)
out_list.to_csv(_path+'../out/counts_first_jaro.txt', sep='$')
out_list.to_csv(_path+'../out/places_FT_jaro0.9_01.tsv', sep='\t')


print('Done :D')

# total_records is the record count accumulated by the basic-matching cell
nr_matched_records = out_list.sum().sum() #Two times sum, first one for year, and second one among years
print('Matched records:',nr_matched_records, 'out of', total_records, '(', nr_matched_records/total_records*100,')%')
print('Matched places:', len(out_list), 'out of', nr_sogn, '(', len(out_list)/nr_sogn*100,')%')


ft1845_LL.txt
1489875
ft1850_LL.txt
1391708
ft1860_LL.txt
1715906
ft1880_LL.txt
1967615
ft1885_LL.txt
327936
ft1901_LL.txt
1979025
Done :D
Matched records: 6674325 out of 8872065 ( 75.22854036799775 )%
Matched places: 1647 out of 1924 ( 85.6029106029106 )%


In [19]:
# Adds the requested extra variables to the thresholded counts: the name as
# found in the data, its score, and the unit type / reference id.
dfmap = pd.read_csv(_path+'../out/mapping.tsv', sep = '\t', dtype=str, names=['original','mapped', 'score'], skiprows=1)
jaro09 = pd.read_csv(_path+'../out/places_FT_jaro0.9_01.tsv', sep='\t')

accepted_mappings = dfmap[dfmap.score.astype(float) >= THRESHOLD]
reference_ids = dd[['simplename', 'art', 'enhedid']]

# Inner joins: only places that were reached through an accepted mapping remain
jaro09 = jaro09.merge(accepted_mappings, left_on='simplename', right_on='mapped')
jaro09 = jaro09.merge(reference_ids, on='simplename')

# NOTE(review): this overwrites the very file read above, so running the cell
# a second time starts from the already enriched file
jaro09.to_csv(_path+'../out/places_FT_jaro0.9_01.tsv', sep='\t')
jaro09.head()

Unnamed: 0,simplename,1845,1850,1860,1880,1885,1901,original,mapped,score,art,enhedid
0,absalons,0,0,0,4022,4065,4477,absalonsgade,absalons,0.93,sogn,113935
1,aggersborg,516,513,580,752,0,1222,aggersborggade,aggersborg,0.94,sogn,113943
2,allerslev,1784,1881,1992,2444,475,2249,allersgade,allerslev,0.9,sogn,113953
3,anna,0,0,0,0,0,127,annasvej,anna,0.9,sogn,113974
4,astrup,2667,2132,3071,2903,0,2947,magstrup,astrup,0.92,sogn,113997


In [20]:
# Collect the five best-scoring reference names for every unmatched data name

concatenate = []
for i, col in enumerate(distance_jaro.columns, start=1):
    if i % 100 == 0: print(i, 'out of', len(distance_jaro.columns))
    aux = distance_jaro.nlargest(5, col)[[col]]
    aux.columns = ['score']
    aux['original'] = col
    concatenate.append(aux)

# After reset_index the former row labels (reference names) become the first column
concatenate = pd.concat(concatenate, sort=False).reset_index()
concatenate.columns = ['potential_match','score','original']

# Names that already have a candidate scoring 0.9 or more are left out:
# only the remaining ones are written for manual review
already_mapped = concatenate[concatenate.score >= 0.9].original.unique()
needs_review = ~concatenate.original.isin(already_mapped)
concatenate.loc[needs_review, ['original','potential_match','score']].to_csv(_path+'../out/places_possible_matches_jaro_01.tsv', index=False, sep='\t')

# Continue with exactly what was written to disk
concatenate = pd.read_csv(_path+'../out/places_possible_matches_jaro_01.tsv',  sep='\t')
concatenate.head(15)

100 out of 1237
200 out of 1237
300 out of 1237
400 out of 1237
500 out of 1237
600 out of 1237
700 out of 1237
800 out of 1237
900 out of 1237
1000 out of 1237
1100 out of 1237
1200 out of 1237


Unnamed: 0,original,potential_match,score
0,hofetaten,hoejen,0.84
1,hofetaten,ho,0.79
2,hofetaten,soeften,0.79
3,hofetaten,hoersted,0.78
4,hofetaten,hoerstedsogn,0.78
5,ude sundby landdistrikt,udbyneder landsogn,0.77
6,ude sundby landdistrikt,udbynederlandsogn,0.75
7,ude sundby landdistrikt,understed bylandsogn,0.75
8,ude sundby landdistrikt,udbyneder bylandsogn,0.75
9,ude sundby landdistrikt,sædden bylandsogn,0.73


# Getting the counts for all possible matches (reading only once)

In [21]:
_path = '../../city_dump/data/utf8/'

# One cleaned frame per census file is collected here
out_list = []

for f in working_data:

    print(f)
    census = get_df(_path+f)

    # Rows without a Herred value come from the "mal-conversion" to utf8
    # (the empty rows do contain data originally)
    #TODO: Fix the conversion thing
    census = census[census.Herred.notna()]
    print(len(census))

    # Clean Sogne, Herred and Amt; working_column stays a top-level loop
    # variable because name_cleaner reads it as a global
    for working_column in ['Sogne','Herred','Amt']:
        census[working_column] = census.apply(name_cleaner, axis=1)

    # Census year = characters 2-5 of the file name
    census['year'] = f[2:6]

    #TODO: Do the counting inside so it removes overhead

    out_list.append(census)


# All census years stacked into a single frame
df = pd.concat(out_list, sort=False)
df.head()

ft1845_LL.txt
1489875
ft1850_LL.txt
1391708
ft1860_LL.txt
1715906
ft1880_LL.txt
1967615
ft1885_LL.txt
327936
ft1901_LL.txt
1979025


Unnamed: 0,Sogne,Herred,Amt,postid,kipnr,kilde,løbenr,kildehenvisning,stednavn,husstands_familienr,...,gadenr,etage,forhus,skemanr,skema_lbnr,kildekommentar,ejendom_navn,ejernavn,bo_andetsteds,tilstede
0,harte,brusk,vejle,1,A0003,FT-1845,100000,side 1,Paaby,1,...,,,,,,,,,,
1,harte,brusk,vejle,2,A0003,FT-1845,200000,side 1,Paaby,1,...,,,,,,,,,,
2,harte,brusk,vejle,3,A0003,FT-1845,300000,side 1,Paaby,1,...,,,,,,,,,,
3,harte,brusk,vejle,4,A0003,FT-1845,400000,side 1,Paaby,1,...,,,,,,,,,,
4,harte,brusk,vejle,5,A0003,FT-1845,500000,side 1,Paaby,1,...,,,,,,,,,,


In [22]:
# Everything was read first and is counted here, outside the loading loop
# (counting inside the loop would be better)
records_per_place = df.groupby(['Sogne','year']).size()
dfc = records_per_place.reset_index(name='counts')
dfc.head()

Unnamed: 0,Sogne,year,counts
0,,1845,1
1,(h. c.) andersensvej,1880,744
2,a. f. beyersvej,1901,78
3,a. n. hansens allé,1901,10
4,aabenraa,1880,1404


In [23]:
THRESHOLD = 0.9

# One row per city, one column per census year
a = (
    dfc.rename(columns={'year':'id'})
       .pivot(index='Sogne', columns='id', values='counts')
       .reset_index()
       .fillna(0)
       .astype(int, errors='ignore')
)

# Accepted mappings only (best candidate scoring at least THRESHOLD)
dfmap = pd.read_csv(_path+'../out/mapping.tsv', sep = '\t', dtype=str, names=['original','mapped', 'score'], skiprows=1)
dfmap =  dfmap[dfmap.score.astype(float) >= THRESHOLD]

# Replace a data name by its mapped reference name wherever a mapping exists
df = a.merge(dfmap, left_on='Sogne', right_on='original', how='left')
has_mapping = df.mapped.notna()
df.loc[has_mapping, 'Sogne'] = df.loc[has_mapping, 'mapped']

# Focusing on Sogn that match the Sogn list in DigDag
sogn_reference = dd[dd.art=='sogn'][['simplename', 'art','enhedid']]
df = df.merge(sogn_reference, left_on='Sogne', right_on='simplename', how='left')

# Mapped names without a Sogn entry still get their mapped name as simplename
mapped_without_sogn = df.simplename.isna() & df.mapped.notna()
df.loc[mapped_without_sogn, 'simplename'] = df.loc[mapped_without_sogn, 'mapped']

# Finally every row that has a simplename is labelled with it
has_simplename = df.simplename.notna()
df.loc[has_simplename, 'Sogne'] = df.loc[has_simplename, 'simplename']

df.head(20)

Unnamed: 0,Sogne,1845,1850,1860,1880,1885,1901,original,mapped,score,simplename,art,enhedid
0,,1,0,0,0,0,0,,,,,,
1,(h. c.) andersensvej,0,0,0,744,0,0,,,,,,
2,a. f. beyersvej,0,0,0,0,0,78,,,,,,
3,a. n. hansens allé,0,0,0,0,0,10,,,,,,
4,aabenraa,0,0,0,1404,1314,1044,,,,aabenraa,sogn,113932.0
5,aabenraa købstad,0,0,5139,0,0,0,aabenraa købstad,aabenraa købstad,1.0,aabenraa købstad,,
6,aabenraa landsogn,444,0,650,0,0,0,,,,aabenraa landsogn,sogn,113932.0
7,aaby,1520,1593,2008,2606,0,3075,,,,aaby,sogn,115242.0
8,aadum,554,603,712,979,0,1031,,,,,,
9,aagade,0,0,0,915,1058,1034,,,,,,


In [27]:
# Rows that ended up with the same Sogne label are merged by adding up their
# counts for every census year
year_columns = ['1845', '1850', '1860', '1880', '1885', '1901']
df = df.groupby(['Sogne'],as_index = False).agg({year: 'sum' for year in year_columns})

Unnamed: 0,Sogne,1845,1850,1860,1880,1885,1901
0,,1,0,0,0,0,0
1,(h. c.) andersensvej,0,0,0,744,0,0
2,a. f. beyersvej,0,0,0,0,0,78
3,a. n. hansens allé,0,0,0,0,0,10
4,aabenraa,0,0,0,1404,1314,1044
5,aabenraa købstad,0,0,5139,0,0,0
6,aabenraa landsogn,444,0,650,0,0,0
7,aaby,2803,3002,3418,3992,0,4213
8,aadum,554,603,712,979,0,1031
9,aagade,0,0,0,915,1058,1034


In [60]:
# Attaches yearly counts and DigDag details to every candidate match:
# counts joined on the data name, reference rows joined on the candidate name,
# then counts joined again through the reference id.
# NOTE(review): the last merge needs an 'enhedid' column in df and an
# 'enhedid_y' column produced by the earlier merges. The df built by the
# visible groupby cell only has Sogne and the year columns, so this cell
# presumably ran against a df from cells that are not shown (the execution
# count jumps from 27 to 60) - verify before a Restart & Run All.
z = concatenate.merge(df, left_on='original', right_on='Sogne').merge(dd, left_on='potential_match', right_on='simplename').merge(df, left_on='enhedid_y', right_on='enhedid')
# Per year, add the two joined count columns (suffix _x and suffix _y)
for y in ['1845', '1850', '1860', '1880', '1885', '1901']:
    z[y] = z[y+'_x'] + z[y+'_y']
    
# Restore the un-suffixed names and keep only the columns to export
z['original'] = z['original_x']
z['score'] = z['score_x']
z = z[['original','potential_match', 'score', '1845', '1850', '1860', '1880', '1885', '1901', 'navn', 'art', 'enhedid']]

# Overwrites the candidate file written earlier with the enriched table
z.to_csv(_path+'../out/places_possible_matches_jaro_01.tsv', sep='\t')

In [61]:
# Display the enriched candidate table
z

Unnamed: 0,original,potential_match,score,1845,1850,1860,1880,1885,1901,navn,art,enhedid
0,hofetaten,hoejen,0.84,1516,829,950,909,0,764,Højen Sogn,sogn,113648
1,højdevej,højen,0.84,816,829,950,909,42,808,Højen Sogn,sogn,113648
2,høyensgade,højensogn,0.80,816,829,950,909,138,764,Højen Sogn,sogn,113648
3,hofetaten,ho,0.79,996,286,303,311,0,326,Ho Sogn,sogn,113563
4,hospitalsvej,ho,0.78,296,286,303,311,49,326,Ho Sogn,sogn,113563
5,hofetaten,soeften,0.79,1230,535,593,506,0,559,Søften Sogn,sogn,114779
6,søetaten,soeften,0.80,8829,535,593,506,0,559,Søften Sogn,sogn,114779
7,sofievej,soeften,0.77,530,535,593,801,796,559,Søften Sogn,sogn,114779
8,søetaten,søften,0.80,8829,535,593,506,0,559,Søften Sogn,sogn,114779
9,søfortet lynetten,søften,0.80,530,535,593,514,0,559,Søften Sogn,sogn,114779
