In [45]:
#Importing libraries for data management
import pandas as pd
import numpy as np

#Importing libraries for system management
import os

#Distance computation
from pyjarowinkler import distance
import Levenshtein

In [46]:
I_want_to_run_all_the_preprocessing = False

# Converting the current format (ISO-8859-14) into UTF-8 
iconv -f ISO-8859-14 file_in.csv > file_out.tsv

In [47]:

if I_want_to_run_all_the_preprocessing:
    
    #Copying into a new directory
    ! cp -R ../city_dump/data/LL/ ../city_dump/data/utf8/

    #Getting all files
    d_path = '../city_dump/data/utf8/'
    l = next(os.walk(d_path))[2]

    #Converting the files to utf8
    for file in l:
        f_path = d_path+file
        tmp_path = d_path+'tmp.csv'
        ! iconv -f ISO-8859-14 $f_path > $tmp_path
        !mv $tmp_path $f_path

# Reading DigDag
(and transforming it so I encode as many logical variation of the names as possible)

In [48]:
# This function adds extra rows in the dataframe *dd* for the 
# places where *word* exists and replaces it with *replacement*
def adding_extra_rows(dd, word, replacement):
    dd['special'] = dd['simplename'].apply(lambda x: word in x) 
    dd['simplename2'] = dd.simplename.apply(lambda x: x.replace(word,replacement))
    return pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

In [49]:
#The digdag seems to have duplicates which I do not understand why
dd = pd.read_csv('../city_dump/data/rl_places_digdag_v1.txt', sep = '\t', encoding='utf-16', dtype=str)
dd_org = dd.copy()

#Getting the Købstad and putting it to the original data
dd2 = pd.read_csv('../city_dump/data/koebstad.csv', sep = ';', encoding='utf-8', dtype=str)
dd2['enhedtype'] = dd2['art'] 
dd = pd.concat([dd, dd2[~dd2.isna()][['navn','enhedid','enhedtype','art','simplename']]])
dd['simplename'] = dd['simplename'].astype(str)

#adding a conversion of special characters to latin letters (å -> aa, ø -> oe, æ -> ae) and adding them to the reference list
dd = adding_extra_rows(dd, 'å', 'aa')
dd = adding_extra_rows(dd, 'ø', 'oe')
dd = adding_extra_rows(dd, 'æ', 'ae')

#Preparing the names to append them at the simple name
dd['art'] = dd['art'].apply({'Amt':'amt', 'Sogn':'sogn','Købstadskommune':'købstad', 'Geografisk Herred':'herred', 'Processing':''}.get)

#Adding a duplicated list of their unit type to increase matches
dd['simplename2'] = dd.apply(lambda row: str(row['simplename'])+' '+str(row['art']), axis=1)
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Adding the plural købstæder as well
dd = adding_extra_rows(dd, 'købstad', 'købstæder')

#Adding spaces instead of hyphens (new rows)
dd = adding_extra_rows(dd, '-', ' ')

#Removing spaces (new rows)
dd = adding_extra_rows(dd, ' ', '')

#Adding bysogn (new rows)
dd = adding_extra_rows(dd, 'sogn', 'bysogn')

#adding rows where the last letter's "e" will be removed
dd['special'] = dd['simplename'].str[-1] == 'e'
dd['simplename2'] = dd['simplename'].str[:-1]
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Dropping duplicates from digdag
#print(dd[dd.duplicated('simplename', keep=False)].sort_values('simplename').head(30))
dd.drop_duplicates(['art','simplename'], keep='first', inplace=True)

dd.head(20)

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
0,Kronborg Amt,118765,11,amt,kronborg
1,Præstø Amt,118791,11,amt,præstø
2,Bornholms Amt,118792,11,amt,bornholms
3,Svendborg Amt,118813,11,amt,svendborg
4,Ålborg Amt,118819,11,amt,ålborg
5,Sorø Amt,118785,11,amt,sorø
6,Århus Amt,118846,11,amt,århus
7,Hjørring Amt,118820,11,amt,hjørring
8,Bøvling Amt,118851,11,amt,bøvling
9,Vejle Amt,118849,11,amt,vejle


# Processing functions

In [50]:
#Get the df from the $ separated file (for some reason pandas has problems with random lines)
def get_df(f_path):
    r= []
    columns = []
    with open(f_path) as f:
        first = True
        for line in f:
            line = line.rstrip().split('$')
            if first:
                length = len(line)
                columns = line
                first=False
            else:
                r.append(line[:length])
                
    return pd.DataFrame(data=r, columns = columns)

In [51]:
# Name cleaning function
def name_cleaner(s):
    try:
        o =s[working_column].lower().rstrip().replace('  ', ' ')#.replace(' købstad', '')
        if ' (' in o: return o.split(' (')[0]
        return o
    except:
        return np.nan

# Grouping all 1901 files together

In [52]:
if I_want_to_run_all_the_preprocessing:
    
    l =['ft1901_LL_aalborg.txt',
     'ft1901_LL_aarhus.txt',
     'ft1901_LL_bornholm.txt',
     'ft1901_LL_frederiksborg.txt',
     'ft1901_LL_hjoerring.txt',
     'ft1901_LL_holbaek.txt',
     'ft1901_LL_kbhv.txt',
     'ft1901_LL_maribo.txt',
     'ft1901_LL_odense.txt',
     'ft1901_LL_praestoe.txt',
     'ft1901_LL_randers.txt',
     'ft1901_LL_ribe.txt',
     'ft1901_LL_ringkoebing.txt',
     'ft1901_LL_roskilde.txt',
     'ft1901_LL_skanderborg.txt',
     'ft1901_LL_soroe.txt',
     'ft1901_LL_svendborg.txt',
     'ft1901_LL_thisted.txt',
     'ft1901_LL_vejle.txt',
     'ft1901_LL_viborg.txt']

    _path = '../city_dump/data/utf8/'

    df_list = []
    for f in l:

        print(f)
        #Loading the data
        df_list.append(get_df(_path+f))

    #Concatenating all the files
    df_list = pd.concat(df_list, sort=False)

    #Saving the unique file
    df_list.to_csv(_path+'ft1901_LL.txt', sep='$', index=False)

In [53]:
# List of files to work (with the 1901 collapsed)
working_data = ['ft1845_LL.txt',
 'ft1850_LL.txt',
 'ft1860_LL.txt',
 'ft1880_LL.txt',
 'ft1885_LL.txt',
 'ft1901_LL.txt']

# Basic matching
(only performs the match if the names are identical)

In [54]:
_path = '../city_dump/data/utf8/'

#This is where I save the counts and then group it together
out_list = []

#where I save the missmatches and file year
missmatches = []

for f in working_data:
    
    print(f)
    #Loading the data
    df = get_df(_path+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    #TODO: Fix the conversion thing
    df = df[~df.Herred.isna()]
    print(len(df))
    
    #Preforming the name cleaning for Sogne, Herred, and Amt
    for working_column in ['Sogne','Herred','Amt']:
        df[working_column] = df.apply(lambda x: name_cleaner(x), axis=1)

    # Focusing on Sogn that match the Sogn list in DigDag
    out = df.merge(dd[dd.art=='sogn'][['simplename', 'art', 'enhedid']], left_on='Sogne', right_on='simplename', how='left')

    #keeping the missmatch
    miss = out[out.art.isna()].drop_duplicates('Sogne', keep='first')
    miss['year'] = f[2:6]
    missmatches.append(miss)
    
    #Counting the matches
    out = out[~out.art.isna()].groupby('enhedid').size().reset_index(name='counts')
    
    # setting the year
    out['year'] = f[2:6]
    
    #Keeping the counts on RAM
    out_list.append(out)
    
#Concatenating all the files (the counts and missmatches)
out_list = pd.concat(out_list, sort=False)
missmatches = pd.concat(missmatches, sort=False)

#Saving the unique file in the HD
out_list = out_list.merge(dd_org, on ='enhedid').pivot('enhedid', 'year', 'counts').fillna(0).astype(int)
out_list.to_csv(_path+'../out/counts.txt', sep='$')
out_list.to_csv(_path+'../out/counts.tsv', sep='\t')
print('Done! :)')

ft1845_LL.txt
1489875
ft1850_LL.txt
1391708
ft1860_LL.txt
1715906
ft1880_LL.txt
1967615
ft1885_LL.txt
327936
ft1901_LL.txt
1979025
Done! :)


In [55]:
#Cities that could not be matched on the first round counts per year
missmatches.groupby('year').size().reset_index(name='counts').sort_values('counts', ascending=False)

Unnamed: 0,year,counts
5,1901,612
3,1880,572
4,1885,556
0,1845,257
2,1860,183
1,1850,173


# Computing city similarity for missmatches

In [56]:
#Getting the potential first matches

mm = missmatches.Sogne.unique()
sn = dd.simplename.unique()

distance_jaro = np.zeros((len(sn),len(mm)))
distance_leven = np.zeros((len(sn),len(mm)))

for i in range(len(sn)):
    if i%100 == 0: print(i, 'out of' ,len(sn))
        
    for j in range(len(mm)):
        #If variable is none skip
        if not mm[j]: continue
        
        #This matrix is not simetric because x and y axis are not the same!
        try:
            distance_jaro[i][j] = distance.get_jaro_distance(sn[i],mm[j])
            distance_leven[i][j] = Levenshtein.distance(sn[i],mm[j])/(len(sn[i])+len(mm[j]))
        except:
            print('Could not make it',i,j)
            print(sn[i],mm[j])
        
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
distance_leven = pd.DataFrame(data = distance_leven, columns = mm, index = sn)
distance_leven.head()

0 out of 16440
100 out of 16440
200 out of 16440
300 out of 16440
400 out of 16440
500 out of 16440
600 out of 16440
700 out of 16440
800 out of 16440
900 out of 16440
1000 out of 16440
1100 out of 16440
1200 out of 16440
1300 out of 16440
1400 out of 16440
1500 out of 16440
1600 out of 16440
1700 out of 16440
1800 out of 16440
1900 out of 16440
2000 out of 16440
2100 out of 16440
2200 out of 16440
2300 out of 16440
2400 out of 16440
2500 out of 16440
2600 out of 16440
2700 out of 16440
2800 out of 16440
2900 out of 16440
3000 out of 16440
3100 out of 16440
3200 out of 16440
3300 out of 16440
3400 out of 16440
3500 out of 16440
3600 out of 16440
3700 out of 16440
3800 out of 16440
3900 out of 16440
4000 out of 16440
4100 out of 16440
4200 out of 16440
4300 out of 16440
4400 out of 16440
4500 out of 16440
4600 out of 16440
4700 out of 16440
4800 out of 16440
4900 out of 16440
5000 out of 16440
5100 out of 16440
5200 out of 16440
5300 out of 16440
5400 out of 16440
5500 out of 16440
5600

Unnamed: 0,nørre bramdrup,nørresundby landsogn,slagelse købstad,sankt mikkels landsogn,sankt peders landsogn,stillinge,allinge-sandvig købstæder,rønne købstad,aakirkeby købstad,hasle købstad,...,baunegaardsvej,bengtasvej,bogholder alle,boyesgade,broagergade,birkegade,rugaards landevej,esbjerg købstad,silkeborg landsogn,silkeborg købstad
kronborg,0.5,0.571429,0.625,0.633333,0.586207,0.470588,0.69697,0.52381,0.56,0.571429,...,0.545455,0.555556,0.545455,0.529412,0.368421,0.470588,0.6,0.565217,0.538462,0.52
præstø,0.65,0.692308,0.636364,0.75,0.666667,0.6,0.741935,0.578947,0.608696,0.578947,...,0.6,0.5625,0.65,0.533333,0.588235,0.533333,0.652174,0.571429,0.666667,0.608696
bornholms,0.521739,0.551724,0.6,0.645161,0.633333,0.5,0.676471,0.5,0.576923,0.545455,...,0.478261,0.473684,0.434783,0.388889,0.45,0.388889,0.576923,0.5,0.518519,0.5
svendborg,0.521739,0.517241,0.52,0.548387,0.533333,0.444444,0.617647,0.5,0.576923,0.454545,...,0.521739,0.473684,0.565217,0.5,0.5,0.5,0.538462,0.5,0.481481,0.461538
ålborg,0.6,0.692308,0.636364,0.714286,0.703704,0.466667,0.741935,0.631579,0.695652,0.578947,...,0.65,0.625,0.6,0.533333,0.529412,0.533333,0.695652,0.571429,0.541667,0.521739


In [57]:
#Saving a mapping
distance_jaro.idxmax().to_csv(_path+'../out/mapping.tsv', sep='\t')

# Advanced matching
(This is the same as the basic matching but in this case it matches to the computed best match)

In [58]:
dfmap = pd.read_csv(_path+'../out/mapping.tsv', sep = '\t', dtype=str, names=['original','mapped'])
dfmap.head()

Unnamed: 0,original,mapped
0,nørre bramdrup,nørre asmindrup
1,nørresundby landsogn,nørresundby Non
2,slagelse købstad,slagelse købstad
3,sankt mikkels landsogn,sankt mikkels sogn
4,sankt peders landsogn,sankt peders sogn


In [59]:
_path = '../city_dump/data/utf8/'

#This is where I save the counts and then group it together
out_list = []


for f in working_data:
    
    print(f)
    #Loading the data
    df = get_df(_path+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    #TODO: Fix the conversion thing
    df = df[~df.Herred.isna()]
    print(len(df))
    
    #Preforming the name cleaning for Sogne, Herred, and Amt
    for working_column in ['Sogne','Herred','Amt']:
        df[working_column] = df.apply(lambda x: name_cleaner(x), axis=1)
    
    #Performing the matching for "inexisting places"
    df = df.merge(dfmap, left_on='Sogne', right_on='original', how='left')
    df.loc[~df.mapped.isna(), 'Sogne'] = df.loc[~df.mapped.isna(), 'mapped']


    # Focusing on Sogn that match the Sogn list in DigDag
    out = df.merge(dd[dd.art=='sogn'][['simplename', 'art','enhedid']], left_on='Sogne', right_on='simplename', how='left')
    
    #Counting the matches
    out = out[~out.art.isna()].groupby('enhedid').size().reset_index(name='counts')
    
    # setting the year
    out['year'] = f[2:6]
    
    #Keeping the counts on RAM
    out_list.append(out)

    
#Concatenating all the files (the counts and missmatches)
out_list = pd.concat(out_list, sort=False)


#Saving the unique file in the HD
out_list = out_list.merge(dd_org, on ='enhedid').pivot('simplename', 'year', 'counts').fillna(0).astype(int)
out_list.to_csv(_path+'../out/counts_first_jaro.txt', sep='$')
out_list.to_csv(_path+'../out/counts_first_jaro.tsv', sep='\t')
print('Done! :)')

ft1845_LL.txt
1489875
ft1850_LL.txt
1391708
ft1860_LL.txt
1715906
ft1880_LL.txt
1967615
ft1885_LL.txt
327936
ft1901_LL.txt
1979025
Done! :)


In [60]:
# Creating a list so they can evaluate the matching correctness

concatenate = []
i=0
for col in distance_jaro.columns:
    i = i+1
    if i % 100 == 0: print(i, 'out of', len(distance_jaro.columns))
    aux = distance_jaro.nlargest(5, col)[[col]]
    aux.columns = ['score']
    aux['original'] = col
    concatenate.append(aux)

concatenate = pd.concat(concatenate, sort=False)
concatenate = concatenate.reset_index()
concatenate.columns = ['potential_match','score','original']
concatenate[['original','potential_match','score']][~concatenate.original.isin(concatenate[concatenate.score == 1].original.unique())].to_csv(_path+'../out/possible_matches.tsv', index=False, sep='\t')
concatenate.head(15)

100 out of 1264
200 out of 1264
300 out of 1264
400 out of 1264
500 out of 1264
600 out of 1264
700 out of 1264
800 out of 1264
900 out of 1264
1000 out of 1264
1100 out of 1264
1200 out of 1264


Unnamed: 0,potential_match,score,original
0,nørre asmindrup,0.9,nørre bramdrup
1,nørre rangstrup,0.9,nørre bramdrup
2,nørre tranders,0.89,nørre bramdrup
3,nørre tyrstrup,0.89,nørre bramdrup
4,nørrerangstrup,0.89,nørre bramdrup
5,nørresundby Non,0.94,nørresundby landsogn
6,nørresundbyNon,0.94,nørresundby landsogn
7,nørresundby None,0.93,nørresundby landsogn
8,nørresundbyNone,0.93,nørresundby landsogn
9,nørresundby,0.92,nørresundby landsogn


In [61]:
#0/0

In [62]:
print('Perfect match:', len(concatenate[concatenate.score == 1].original.unique()))
print('To match:', int(len(concatenate[~concatenate.original.isin(concatenate[concatenate.score == 1].original.unique())].sort_values('score', ascending=False))/5))

'''
Perfect match: 1630
To match: 1189
''';

Perfect match: 79
To match: 1185


In [63]:
concatenate[~concatenate.original.isin(concatenate[concatenate.score == 1].original.unique())].sort_values('score', ascending=False)

Unnamed: 0,potential_match,score,original
25,stilling,0.98,stillinge
715,egeslevmagle,0.98,eggeslevmagle
150,østerbølle,0.98,øster bølle
255,skibbinge,0.98,skibinge
145,vesterbølle,0.98,vester bølle
5155,sønder alslev,0.97,sønder alle
195,thorstrup,0.97,torstrup
565,fjelsø,0.97,fjeldsø
146,vesterbøll,0.97,vester bølle
5075,nørre alslev,0.97,nørre alle
