In [40]:
#Importing libraries for data management
import pandas as pd
import numpy as np

#Importing libraries for system management
import os

#Distance computation
from pyjarowinkler import distance
import Levenshtein

In [41]:
I_want_to_run_all_the_preprocessing = False

# Converting the current format (ISO-8859-14) into UTF-8 
iconv -f ISO-8859-14 file_in.csv > file_out.tsv

In [42]:

if I_want_to_run_all_the_preprocessing:
    
    #Copying into a new directory
    ! cp -R ../city_dump/data/LL/ ../city_dump/data/utf8/

    #Getting all files
    d_path = '../city_dump/data/utf8/'
    l = next(os.walk(d_path))[2]

    #Converting the files to utf8
    for file in l:
        f_path = d_path+file
        tmp_path = d_path+'tmp.csv'
        ! iconv -f ISO-8859-14 $f_path > $tmp_path
        !mv $tmp_path $f_path

# Reading DigDag
(and transforming it so I encode as many logical variation of the names as possible)

In [43]:
#The digdag seems to have duplicates which I do not understand why
dd = pd.read_csv('../city_dump/data/rl_places_digdag_v1.txt', sep = '\t', encoding='utf-16', dtype=str)

dd['simplename'] = dd['simplename'].astype(str)

#adding a conversion of special characters to latin letters (å -> aa, å -> oe, æ -> ae) and adding them to the reference list
dd['special'] = dd['simplename'].apply(lambda x: 'ø' in x) | dd['simplename'].apply(lambda x: 'å' in x) | dd['simplename'].apply(lambda x: 'æ' in x)
dd['simplename2'] = dd.simplename.apply(lambda x: x.replace('å','aa').replace('ø','oe').replace('æ','ae'))
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Preparing the names to append them at the simple name
dd['art'] =dd['art'].apply({'Amt':'amt', 'Sogn':'sogn','Købstadskommune':'købstad', 'Geografisk Herred':'herred', 'Processing':''}.get)

#Adding a duplicated list of their unit type to increase matches
dd['simplename2'] = dd.apply(lambda row: row['simplename']+' '+ row['art'], axis=1)
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])


#Dropping duplicates from digdag
#print(dd[dd.duplicated('simplename', keep=False)].sort_values('simplename').head(30))
dd.drop_duplicates(['art','simplename'], keep='first', inplace=True)

# Processing functions

In [44]:
#Get the df from the $ separated file (for some reason pandas has problems with random lines)
def get_df(f_path):
    r= []
    columns = []
    with open(f_path) as f:
        first = True
        for line in f:
            line = line.rstrip().split('$')
            if first:
                length = len(line)
                columns = line
                first=False
            else:
                r.append(line[:length])
                
    return pd.DataFrame(data=r, columns = columns)

In [45]:
# Name cleaning function
def name_cleaner(s):
    try:
        o =s[working_column].lower().rstrip().replace('  ', ' ')#.replace(' købstad', '')
        if ' (' in o: return o.split(' (')[0]
        return o
    except:
        return np.nan

# Grouping all 1901 files together

In [46]:
if I_want_to_run_all_the_preprocessing:
    
    l =['ft1901_LL_aalborg.txt',
     'ft1901_LL_aarhus.txt',
     'ft1901_LL_bornholm.txt',
     'ft1901_LL_frederiksborg.txt',
     'ft1901_LL_hjoerring.txt',
     'ft1901_LL_holbaek.txt',
     'ft1901_LL_kbhv.txt',
     'ft1901_LL_maribo.txt',
     'ft1901_LL_odense.txt',
     'ft1901_LL_praestoe.txt',
     'ft1901_LL_randers.txt',
     'ft1901_LL_ribe.txt',
     'ft1901_LL_ringkoebing.txt',
     'ft1901_LL_roskilde.txt',
     'ft1901_LL_skanderborg.txt',
     'ft1901_LL_soroe.txt',
     'ft1901_LL_svendborg.txt',
     'ft1901_LL_thisted.txt',
     'ft1901_LL_vejle.txt',
     'ft1901_LL_viborg.txt']

    _path = '../city_dump/data/utf8/'

    df_list = []
    for f in l:

        print(f)
        #Loading the data
        df_list.append(get_df(_path+f))

    #Concatenating all the files
    df_list = pd.concat(df_list, sort=False)

    #Saving the unique file
    df_list.to_csv(_path+'ft1901_LL.txt', sep='$', index=False)

In [47]:
# List of files to work (with the 1901 collapsed)
working_data = ['ft1845_LL.txt',
 'ft1850_LL.txt',
 'ft1860_LL.txt',
 'ft1880_LL.txt',
 'ft1885_LL.txt',
 'ft1901_LL.txt']

# Basic matching
(only performs the match if the names are identical)

In [48]:
_path = '../city_dump/data/utf8/'

#This is where I save the counts and then group it together
out_list = []

#where I save the missmatches and file year
missmatches = []

for f in working_data:
    
    print(f)
    #Loading the data
    df = get_df(_path+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    #TODO: Fix the conversion thing
    df = df[~df.Herred.isna()]
    print(len(df))
    
    #Preforming the name cleaning for Sogne, Herred, and Amt
    for working_column in ['Sogne','Herred','Amt']:
        df[working_column] = df.apply(lambda x: name_cleaner(x), axis=1)

    # Focusing on Sogn that match the Sogn list in DigDag
    out = df.merge(dd[dd.art=='Sogn'][['simplename', 'art']], left_on='Sogne', right_on='simplename', how='left')

    #keeping the missmatch
    miss = out[out.art.isna()].drop_duplicates('Sogne', keep='first')
    miss['year'] = f[2:6]
    missmatches.append(miss)
    
    #Counting the matches
    out = out[~out.art.isna()].groupby('simplename').size().reset_index(name='counts')
    
    # setting the year
    out['year'] = f[2:6]
    
    #Keeping the counts on RAM
    out_list.append(out)
    
#Concatenating all the files (the counts and missmatches)
out_list = pd.concat(out_list, sort=False)
missmatches = pd.concat(missmatches, sort=False)

#Saving the unique file in the HD
out_list = out_list.pivot('simplename', 'year', 'counts').fillna(0).astype(int)
out_list.to_csv(_path+'../out/counts.txt', sep='$')
out_list.to_csv(_path+'../out/counts.tsv', sep='\t')
print('Done! :)')

ft1845_LL.txt
1489875
ft1850_LL.txt
1391708
ft1860_LL.txt
1715906
ft1880_LL.txt
1967615
ft1885_LL.txt
327936
ft1901_LL.txt
1979025
Done! :)


In [49]:
#Cities that could not be matched on the first round counts per year
missmatches.groupby('year').size().reset_index(name='counts').sort_values('counts', ascending=False)

Unnamed: 0,year,counts
5,1901,2039
3,1880,2016
0,1845,1775
2,1860,1701
1,1850,1624
4,1885,558


# Computing city similarity for missmatches

In [50]:
#Getting the potential first matches

mm = missmatches.Sogne.unique()
sn = dd.simplename.unique()

distance_jaro = np.zeros((len(sn),len(mm)))
distance_leven = np.zeros((len(sn),len(mm)))

for i in range(len(sn)):
    if i%100 == 0: print(i, 'out of' ,len(sn))
        
    for j in range(len(mm)):
        #If variable is none skip
        if not mm[j]: continue
        
        #This matrix is not simetric because x and y axis are not the same!
        try:
            distance_jaro[i][j] = distance.get_jaro_distance(sn[i],mm[j])
            distance_leven[i][j] = Levenshtein.distance(sn[i],mm[j])/(len(sn[i])+len(mm[j]))
        except:
            print('Could not make it',i,j)
            print(sn[i],mm[j])
        
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
distance_leven = pd.DataFrame(data = distance_leven, columns = mm, index = sn)
distance_leven.head()

0 out of 5800
100 out of 5800
200 out of 5800
300 out of 5800
400 out of 5800
500 out of 5800
600 out of 5800
700 out of 5800
800 out of 5800
900 out of 5800
1000 out of 5800
1100 out of 5800
1200 out of 5800
1300 out of 5800
1400 out of 5800
1500 out of 5800
1600 out of 5800
1700 out of 5800
1800 out of 5800
1900 out of 5800
2000 out of 5800
2100 out of 5800
2200 out of 5800
2300 out of 5800
2400 out of 5800
2500 out of 5800
2600 out of 5800
2700 out of 5800
2800 out of 5800
2900 out of 5800
3000 out of 5800
3100 out of 5800
3200 out of 5800
3300 out of 5800
3400 out of 5800
3500 out of 5800
3600 out of 5800
3700 out of 5800
3800 out of 5800
3900 out of 5800
4000 out of 5800
4100 out of 5800
4200 out of 5800
4300 out of 5800
4400 out of 5800
4500 out of 5800
4600 out of 5800
4700 out of 5800
4800 out of 5800
4900 out of 5800
5000 out of 5800
5100 out of 5800
5200 out of 5800
5300 out of 5800
5400 out of 5800
5500 out of 5800
5600 out of 5800
5700 out of 5800


Unnamed: 0,harte,almind,viuf,seest,sønder vilstrup,eltang,nørre bjert,nørre bramdrup,gudum,tjæreby,...,esbjerg købstad,simmelkær,struer,herborg,grove,voel,silkeborg landsogn,silkeborg købstad,hov,engesvang
kronborg,0.615385,0.571429,0.666667,0.615385,0.565217,0.5,0.421053,0.5,0.615385,0.533333,...,0.565217,0.529412,0.5,0.266667,0.461538,0.583333,0.538462,0.52,0.636364,0.470588
præstø,0.454545,0.5,0.6,0.363636,0.571429,0.5,0.588235,0.65,0.545455,0.461538,...,0.571429,0.6,0.5,0.461538,0.454545,0.6,0.666667,0.608696,0.666667,0.533333
bornholms,0.571429,0.6,0.692308,0.642857,0.541667,0.6,0.5,0.521739,0.571429,0.5625,...,0.5,0.5,0.533333,0.4375,0.5,0.538462,0.518519,0.5,0.583333,0.5
svendborg,0.642857,0.533333,0.615385,0.5,0.5,0.466667,0.45,0.521739,0.571429,0.5,...,0.5,0.444444,0.466667,0.25,0.571429,0.538462,0.481481,0.461538,0.666667,0.444444
ålborg,0.545455,0.416667,0.6,0.545455,0.619048,0.333333,0.529412,0.6,0.545455,0.538462,...,0.571429,0.533333,0.5,0.230769,0.454545,0.5,0.541667,0.521739,0.555556,0.533333


In [51]:
#Saving a mapping
distance_jaro.idxmax().to_csv(_path+'../out/mapping.tsv', sep='\t')

# Advanced matching
(This is the same as the basic matching but in this case it matches to the computed best match)

In [52]:
dfmap = pd.read_csv(_path+'../out/mapping.tsv', sep = '\t', dtype=str, names=['original','mapped'])
dfmap.head()

Unnamed: 0,original,mapped
0,harte,harte
1,almind,almind
2,viuf,viuf
3,seest,seest
4,sønder vilstrup,sønder vilstrup


In [None]:
_path = '../city_dump/data/utf8/'

#This is where I save the counts and then group it together
out_list = []


for f in working_data:
    
    print(f)
    #Loading the data
    df = get_df(_path+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    #TODO: Fix the conversion thing
    df = df[~df.Herred.isna()]
    print(len(df))
    
    #Preforming the name cleaning for Sogne, Herred, and Amt
    for working_column in ['Sogne','Herred','Amt']:
        df[working_column] = df.apply(lambda x: name_cleaner(x), axis=1)
    
    #Performing the matching for "inexisting places"
    df = df.merge(dfmap, left_on='Sogne', right_on='original', how='left')
    df.loc[~df.mapped.isna(), 'Sogne'] = df.loc[~df.mapped.isna(), 'mapped']


    # Focusing on Sogn that match the Sogn list in DigDag
    out = df.merge(dd[dd.art=='Sogn'][['simplename', 'art']], left_on='Sogne', right_on='simplename', how='left')
    
    #Counting the matches
    out = out[~out.art.isna()].groupby('simplename').size().reset_index(name='counts')
    
    # setting the year
    out['year'] = f[2:6]
    
    #Keeping the counts on RAM
    out_list.append(out)
    
#Concatenating all the files (the counts and missmatches)
out_list = pd.concat(out_list, sort=False)

#Saving the unique file in the HD
out_list = out_list.pivot('simplename', 'year', 'counts').fillna(0).astype(int)
out_list.to_csv(_path+'../out/counts_first_jaro.txt', sep='$')
out_list.to_csv(_path+'../out/counts_first_jaro.tsv', sep='\t')
print('Done! :)')

ft1845_LL.txt
1489875


In [None]:
# Creating a list so they can evaluate the matching correctness

concatenate = []
i=0
for col in distance_jaro.columns:
    i = i+1
    if i % 100: print(i)
    aux = distance_jaro.nlargest(5, col)[[col]]
    aux.columns = ['score']
    aux['original'] = col
    concatenate.append(aux)

concatenate = pd.concat(concatenate, sort=False)
concatenate = concatenate.reset_index()
concatenate.columns = ['potential_match','score','original']
concatenate[['original','potential_match','score']].to_csv(_path+'../out/possible_matches.tsv', sep='\t')
concatenate.head(15)

In [None]:
#0/0

In [None]:
print('Perfect match:', len(concatenate[concatenate.score == 1].original.unique()))
print('To match:', len(concatenate[~concatenate.original.isin(concatenate[concatenate.score == 1].original.unique())].sort_values('score', ascending=False)))

In [None]:
concatenate[~concatenate.original.isin(concatenate[concatenate.score == 1].original.unique())].sort_values('score', ascending=False)