# Birthplace (fødested) cleaning
Here I inted to clean the birthplace (fødested) field in the data. Notice that the pre-processing scripts are in city_unification (conversion to UTF8, etc.)

In [111]:
print('start')

start


## Libraries

In [112]:
#Importing libraries for data management
import pandas as pd
import numpy as np

#Importing libraries for system management
import os

#Distance computation
from pyjarowinkler import distance

#Regular expressions
import re

#Importing multiprocessing libraries
import multiprocessing

In [113]:
#Worning directories
in_folder  = '/data/import/nirama/utf8/'
out_folder = '/home/roregu/workspace/out/'
data_folder = '/home/roregu/workspace/data/'

#Column to process
focus_column = 'fødested'

#The Jaro threshold
THRESHOLD = 0.9

#Number of cores (machine = 28)
n_cores = 20

## Reading DigDag
and expanding with variations

In [114]:
# This function adds extra rows in the dataframe *dd* for the 
# places where *word* exists and replaces it with *replacement*
def adding_extra_rows(dd, word, replacement):
    dd['special'] = dd['simplename'].apply(lambda x: word in x) 
    dd['simplename2'] = dd.simplename.apply(lambda x: x.replace(word,replacement))
    return pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

In [115]:
#The digdag seems to have duplicates which I do not understand why
dd = pd.read_csv(data_folder+'rl_places_digdag_v1.txt', sep = '\t', encoding='utf-16', dtype=str)
dd_org = dd.copy()

#Getting the Købstad and putting it to the original data
dd2 = pd.read_csv(data_folder+'koebstad.csv', sep = ';', encoding='utf-8', dtype=str)
dd2['enhedtype'] = '80'#dd2['art'] 
dd = pd.concat([dd, dd2[~dd2.isna()][['navn','enhedid','enhedtype','art','simplename']]])
dd['simplename'] = dd['simplename'].astype(str)
del dd2

#Moving the Købstad to the right column
dd['art'] = np.where(dd['enhedtype'] == 'Købstad', ['Købstadskommune']*len(dd), dd['art'])

#adding a conversion of special characters to latin letters (å -> aa, ø -> oe, æ -> ae) and adding them to the reference list
dd = adding_extra_rows(dd, 'å', 'aa')
dd = adding_extra_rows(dd, 'ø', 'oe')
dd = adding_extra_rows(dd, 'æ', 'ae')

#Preparing the names to append them at the simple name
dd['art'] = dd['art'].apply({'Amt':'amt', 'Sogn':'sogn','Købstadskommune':'købstad', 'Geografisk Herred':'herred', 'Processing':''}.get)

#Adding a duplicated list of their unit type to increase matches
dd['simplename2'] = dd.apply(lambda row: str(row['simplename'])+' '+str(row['art']), axis=1)
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Adding the plural købstæder as well
dd = adding_extra_rows(dd, 'købstad', 'købstæder')

#Adding spaces instead of hyphens (new rows)
dd = adding_extra_rows(dd, '-', ' ')

#Removing spaces (new rows)
dd = adding_extra_rows(dd, ' ', '')

#Adding bysogn and landsogn (new rows)
dd = adding_extra_rows(dd, 'sogn', 'bysogn')
dd = adding_extra_rows(dd, 'sogn', 'landsogn')

#adding rows where the last letter's "e" will be removed
dd['special'] = dd['simplename'].str[-1] == 'e'
dd['simplename2'] = dd['simplename'].str[:-1]
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

#Dropping duplicates from digdag
#print(dd[dd.duplicated('simplename', keep=False)].sort_values('simplename').head(30))
dd.drop_duplicates(['art','simplename'], keep='first', inplace=True)

#Drop 'Non' from the list, this may be one of my artifacts...
dd = dd[(dd.simplename.apply(lambda x: not 'Non' in x))]

#Adding sweeden into the equation
#dd = dd.append({'navn':'Sweden','enhedid':'400000','enhedtype':'90','art':'country','simplename':'sverige'}, ignore_index=True).append({'navn':'Sweden','enhedid':400000,'enhedtype':'90','art':'country','simplename':'sverrig'}, ignore_index=True)

# TODO take into consideration that the same city name can be in varios geographical areas...
# It should be matched with herred to increase accuracy for each record. However, right now we're only cleaning
# not sure how pressing the issue is...
nr_sogn = len(dd_org[dd_org.art=='Sogn'].simplename.unique()) 
dd.head(20)

#TODO: Right now I can map things to amt for example which should not be the case...
#dd =dd[dd.art='sogn']

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
0,Kronborg Amt,118765,11,amt,kronborg
1,Præstø Amt,118791,11,amt,præstø
2,Bornholms Amt,118792,11,amt,bornholms
3,Svendborg Amt,118813,11,amt,svendborg
4,Ålborg Amt,118819,11,amt,ålborg
5,Sorø Amt,118785,11,amt,sorø
6,Århus Amt,118846,11,amt,århus
7,Hjørring Amt,118820,11,amt,hjørring
8,Bøvling Amt,118851,11,amt,bøvling
9,Vejle Amt,118849,11,amt,vejle


### Adding countries and islands
Countries: Barbara's list, but missing countries before the german, and italian unifications

Islands: From mads from Danmark's Stednavne (You'll find the standardized name in the "opslagsform" column)

In [116]:
#Islands on the 400000 range, and type 200
df_islands = pd.read_csv(data_folder+'islands_edit.csv', sep = ';', dtype=str)
df_islands = df_islands[~df_islands.opslagsform.isna()]

#Adding Jylland
df_islands.loc[len(df_islands)] =  ['jylland']*len(df_islands.columns)

#Creating the columns
df_islands['navn'] = df_islands['opslagsform']
df_islands['simplename'] = df_islands['opslagsform'].str.lower()
df_islands['art'] = 'island'
df_islands['enhedtype'] = '200'
df_islands['enhedid'] = np.array(df_islands.index.tolist()) + 400000

dd_org = pd.concat([dd_org[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],df_islands[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']]])

# Removing special characters
df_islands = adding_extra_rows(df_islands, 'å', 'aa')
df_islands = adding_extra_rows(df_islands, 'ø', 'oe')
df_islands = adding_extra_rows(df_islands, 'æ', 'ae')

#Countries on the 500000 range, and type 300
df_countries = pd.read_csv(data_folder+'CountryList.csv', sep = ';', names = ['country'], dtype=str)

#Creating the columns
df_countries['navn'] = df_countries['country']
df_countries['simplename'] = df_countries['country'].str.lower()
df_countries['art'] = 'country'
df_countries['enhedtype'] = '300'
df_countries['enhedid'] = np.array(df_countries.index.tolist()) + 500000

dd_org = pd.concat([dd_org[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],df_countries[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']]])

# Removing special characters
df_countries = adding_extra_rows(df_countries, 'å', 'aa')
df_countries = adding_extra_rows(df_countries, 'ø', 'oe')
df_countries = adding_extra_rows(df_countries, 'æ', 'ae')

#Adding countries and islands to dig dag dataframe
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],df_islands[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],df_countries[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']]])
dd.head()

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
0,Kronborg Amt,118765,11,amt,kronborg
1,Præstø Amt,118791,11,amt,præstø
2,Bornholms Amt,118792,11,amt,bornholms
3,Svendborg Amt,118813,11,amt,svendborg
4,Ålborg Amt,118819,11,amt,ålborg


In [117]:
print('length original:\t', len(dd_org), '\nlength modified:\t', len(dd), '\nunique sogn keys:\t', nr_sogn)

length original:	 2739 
length modified:	 27235 
unique sogn keys:	 1924


## Processing functions

In [118]:
#Get the df from the $ separated file (for some reason pandas has problems with random lines)
def get_df(f_path):
    r= []
    columns = []
    with open(f_path) as f:
        first = True
        for line in f:
            line = line.rstrip().split('$')
            if first:
                length = len(line)
                columns = line
                first=False
            else:
                r.append(line[:length])
                
    return pd.DataFrame(data=r, columns = columns)

In [120]:
# Name cleaning function
def name_cleaner(s, working_column):
    try:
        o = s[working_column].lower().strip().replace('  ', ' ').replace('*', '').replace('?', '').replace('.', '').replace(':', '').replace('"', '')
        
        #If "do" (ditto= same as above) and people write what is same, then I keep what they wrote
        if o[:3] == 'do ':
            o = re.findall(r'do\.?\s\[?\(?(.+)',o)[0].rstrip(']').rstrip(')')
           
        #if it says "her i sogn", I get the value for "Sogn" and put it there
        if ('heri sogn' in o or 'her i sogn' in o or 'her i s' == o or 'i sognet' == o[:8] or 'i sogn' == o or 'født i sognet' == o or 'sognet' == o or 'her' == o or 'i sog' == o or 'her i sog' == o or '[i sognet]' == o or 'her i byen'  == o or 'h i s'  == o or 'i s'  == o or 'heri s'  == o or 'h i sognet' == o) and working_column != 'Sogne':
            return name_cleaner(s, 'Sogne')
                    
        #Get some clarifications
        if o[:4] == 'her ':
            o = re.findall(r'her\.?\s\[?\(?(.+)',o)[0]
            
        # Removing variability not catched by Jaro for Copenhagen
        if o in 'kbhvn*kbhv*kjøbenh*kjøbh*kbhvn*kbhavn*kjøbhvn':
            return 'kobenhavn'

        if o == 'i s]': print('***#*',s[working_column])
        return o
    except:
        return np.nan

In [121]:
# List of files to work (with the 1901 collapsed)
#This I did
working_data = ['ft1845_LL.txt',
 'ft1850_LL.txt',
 'ft1860_LL.txt',
 'ft1880_LL.txt',
 'ft1885_LL.txt',
 'ft1901_LL.txt']

#This Nicolai
working_data = ['ft1787.txt',
'ft1801.txt',
'ft1803.txt',
'ft1834.txt',
'ft1840.txt',
'ft1845.txt',
'ft1850.txt',
'ft1860.txt',
'ft1880.txt',
'ft1885.txt',
'ft1901.txt']

#This Nicolai
working_data = ['ft1845.txt',
'ft1850.txt',
'ft1860.txt',
'ft1880.txt',
'ft1885.txt',
'ft1901.txt']

## Loading data and counting

In [122]:
#This is where I save the counts and then group it together
out_list = []

#where I save the missmatches and file year
missmatches = []
total_records = 0

#for f in working_data:
 
def data_loader(f):
    
    print(f)
    #Loading the data
    df = get_df(in_folder+f)
    
    #Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    df = df[~df[focus_column].isna() & (df[focus_column] !='')]
    print(len(df))
    
    #Preforming the name cleaning, be aware that I replace "her i sogn" in there
    df[focus_column+'_data'] = df.apply(lambda row: name_cleaner(row, focus_column), axis=1)
    
    #Reducing Dimensionality and saving in RAM
    out = df.groupby(focus_column+'_data').size().reset_index(name = 'counts')
    out['year'] = f[2:6]
    return out

with multiprocessing.Pool(processes=n_cores) as pool:
    for result in pool.starmap(data_loader, zip(working_data)):
        out_list.append(result)

out_list = pd.concat(out_list, sort=False)
out_list.sort_values('counts',ascending=False)
print('Done! :D')

ft1845.txt
ft1850.txt
ft1860.txt
ft1880.txt
ft1901.txt
ft1885.txt
318056
1281746
1359777
1700886
1759664
2046684
Done! :D


In [123]:
#Putting it nicely, on a more horitzontal table
out_list = out_list.pivot(focus_column+'_data', 'year', 'counts').fillna(0).astype(int).reset_index().reset_index(drop=True)
out_list = pd.DataFrame(out_list.values, columns=out_list.columns.tolist()) #Getting rid of the wrong index
out_list['total'] = out_list.apply(lambda row: row['1845']+row['1850']+row['1860']+row['1880']+row['1885']+row['1901'],axis=1)
out_list.sort_values('total', ascending=False, inplace=True)
out_list.total.sum() #Around the 14k first places compule 80% of the people
out_list.head(50)

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total
236183,kjøbenhavn,47211,43847,86384,72070,133009,30668,413189
251400,københavn,15088,9384,16227,68006,14513,58965,182183
317066,odense,8325,8266,10376,14396,2971,23120,67454
240937,kobenhavn,9554,7159,11087,8218,624,8675,45317
10150,aalborg,5936,2748,8061,9846,2104,15714,44409
11697,aarhus,4054,4577,1782,3066,1820,23659,38958
169468,helsingør,4905,5804,6789,7661,2970,7963,36092
78543,do,10,29824,439,5187,0,145,35605
189232,horsens,4131,3113,5693,8051,1391,12577,34956
113102,fredericia,3716,2470,5165,5178,922,8074,25525


# Name splitting 
if there is sogn and amt in the same value I keep the amt and save the rest

In [124]:
def do_splitting(row):
    r = row[focus_column+'_data'].split(',')
    
    try:
        s = r[0]
        a = r[1]
    except:
        s = r[0]
        a = r[0]
    
    for i in range(len(r)):
        if 'sogn' in r[i] or 'købstad' in r[i]: 
            if 'sogn' == r[i].strip(' ') or 'købstad' == r[i].strip(' ') : s = r[i-1]
            else: s = r[i]
        elif 'amt' in r[i]: 
            if 'amt' == r[i].strip(' '): a = r[i-1]
            else: a = r[i]
                
    #When the comma does not separate
    if s == a:
    
        #When sogn is written but not separatos is placed between sogn and amt
        if(a[-4:] != 'sogn' or a[-7:] != 'købstad'):
            if 'sogn' in a:
                r = a.split('sogn')
                return r[0], r[1]
            elif 'købstad' in a:
                r = a.split('købstad')
                return r[0], r[1]
    
        #When they write " s " instead of sogn, use it as a sign to separate
        if ' s ' in s:
            s = s.split(' s ')
            a = s[1]
            s = s[0]
    
    return s, a


#commas[[focus_column+'_sogn_data', focus_column+'_amt_data']] = commas[focus_column+'_data'].str.split(",", n=1,expand=True) 
out_list[focus_column+'_org'] = out_list[focus_column+'_data']
out_list[[focus_column+'_data', focus_column+'_amt_data']] = out_list.apply(do_splitting, axis=1, result_type="expand")
out_list

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total,fødested_org,fødested_amt_data
236183,kjøbenhavn,47211,43847,86384,72070,133009,30668,413189,kjøbenhavn,kjøbenhavn
251400,københavn,15088,9384,16227,68006,14513,58965,182183,københavn,københavn
317066,odense,8325,8266,10376,14396,2971,23120,67454,odense,odense
240937,kobenhavn,9554,7159,11087,8218,624,8675,45317,kobenhavn,kobenhavn
10150,aalborg,5936,2748,8061,9846,2104,15714,44409,aalborg,aalborg
...,...,...,...,...,...,...,...,...,...,...
215944,jerslev lille fugle,0,0,0,0,0,1,1,jerslev lille fugle sogn,
215943,jerslev kalundborg,0,0,0,1,0,0,1,jerslev kalundborg,jerslev kalundborg
215941,jerslev i sverige,0,0,1,0,0,0,1,jerslev i sverige,jerslev i sverige
215940,jerslev i jylland,0,0,0,0,0,1,1,jerslev i jylland,jerslev i jylland


In [125]:
#Todo: Check wich sogn are repeated within different amts and treat them differently

## Basic matches

In [126]:
#Notice that I only match for Sogn
out = out_list.merge(dd[['navn','simplename', 'art', 'enhedid']], left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_uniquevalues_01.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

2229


## Computing Jaro distances for the missmatches

In [127]:
def compute_jaro(a, bb, i):
    o=[]
    for b in bb:
        try:
            o.append(distance.get_jaro_distance(a, b))
        except:
            o.append(0)
    return i, o


mm = miss[focus_column+'_data'].unique()
sn = dd_org.simplename.unique() # Using the short version of dig dag to make it much faster (>x10)
distance_jaro = np.zeros((len(sn),len(mm)))


with multiprocessing.Pool(processes=n_cores) as pool:
    for i , result in pool.starmap(compute_jaro, zip(sn, [mm]*len(sn), range(len(sn)))):
        if i%100 == 0: print(i, 'out of' , len(sn))
        distance_jaro[i,:] = result

        
#Index is the reference names found in digdag the columns the original data
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
print(distance_jaro.shape, 'len mm,sn:', len(mm), len(sn)) #(26428, 1237)
del mm, sn
distance_jaro.head()

0 out of 2250
100 out of 2250
200 out of 2250
300 out of 2250
400 out of 2250
500 out of 2250
600 out of 2250
700 out of 2250
800 out of 2250
900 out of 2250
1000 out of 2250
1100 out of 2250
1200 out of 2250
1300 out of 2250
1400 out of 2250
1500 out of 2250
1600 out of 2250
1700 out of 2250
1800 out of 2250
1900 out of 2250
2000 out of 2250
2100 out of 2250
2200 out of 2250
(2250, 328779) len mm,sn: 328779 2250


Unnamed: 0,kjøbenhavn,kobenhavn,do,sverrig,-,slesvig,kiøbenhavn,veile,aarhuus,sverig,...,jerslev i hjørring amt,jerslev pr køge,jerslev pr kallundborg,jerslev pr haslev,jerslev paa mors,jerslev lille fugle,jerslev kalundborg,jerslev i sverige,jerslev i jylland,…sløv holbæk amt
kronborg,0.5,0.61,0.54,0.6,0.0,0.42,0.5,0.0,0.42,0.53,...,0.39,0.46,0.45,0.46,0.4,0.39,0.39,0.39,0.39,0.4
præstø,0.42,0.0,0.0,0.37,0.0,0.44,0.42,0.0,0.54,0.39,...,0.47,0.49,0.47,0.48,0.49,0.48,0.48,0.48,0.48,0.49
bornholms,0.54,0.55,0.54,0.42,0.0,0.0,0.54,0.0,0.59,0.43,...,0.44,0.45,0.44,0.45,0.45,0.44,0.44,0.5,0.45,0.4
svendborg,0.43,0.44,0.0,0.83,0.0,0.63,0.43,0.54,0.0,0.79,...,0.27,0.4,0.38,0.39,0.28,0.27,0.28,0.28,0.28,0.48
ålborg,0.42,0.35,0.0,0.54,0.0,0.54,0.42,0.46,0.44,0.56,...,0.31,0.32,0.31,0.32,0.32,0.31,0.31,0.32,0.32,0.41


## Computing the best match with a minimum threshold

In [128]:
#Getting the highest match for each Sogne and filtering threshold
a = distance_jaro.idxmax().reset_index(name = 'potential_match')
b = distance_jaro.max().reset_index(name='jaro')
dfmap = a.merge(b, on='index').rename(columns={'index':'data_name'})
dfmap.to_csv(out_folder+'mapping_birthplaces.tsv', sep='\t', index=False)
dfmap = dfmap[dfmap.jaro.astype(float) >= THRESHOLD]
dfmap.head()
del a
del b

#Getting the mapped info into DigDag v2 (not really the v2)
dd2 = dfmap.merge(dd[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename', keep='first'), left_on='potential_match', right_on='simplename')
dd2['simplename'] = dd2['data_name']
dd2 = dd2[['navn','enhedid','art','simplename']]
dd2 = pd.concat([dd2[['navn','simplename', 'art', 'enhedid']], dd], sort=False) #putting together
dd2.head()

Unnamed: 0,navn,simplename,art,enhedid,enhedtype
0,Staden København,kjøbenhavn,købstad,120663,
1,Staden København,kobenhavn,købstad,120663,
2,Staden København,kiøbenhavn,købstad,120663,
3,Staden København,kjöbenhavn,købstad,120663,
4,Staden København,kiöbenhavn,købstad,120663,


In [129]:
# Starting again with the merges
out = out_list.merge(dd2, left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_jaro0.9_01.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

2377


In [130]:
miss.head(50)

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total,fødested_org,fødested_amt_data,navn,simplename,art,enhedid,enhedtype
18,do,10,29824,439,5187,0,145,35605,do,do,,,,,
39,-,1562,491,1054,1356,457,13869,18789,-,-,,,,,
44,slesvig,287,3368,1780,7331,844,3683,17293,slesvig,slesvig,,,,,
89,veile,330,1970,2729,3388,665,502,9584,veile,veile,,,,,
96,aarhuus,1475,1433,5318,547,391,180,9344,aarhuus,aarhuus,,,,,
206,fakse,1109,1725,1919,42,8,122,4925,fakse,fakse,,,,,
289,aalborg aalborg amt,0,3726,2,25,0,118,3871,aalborg aalborg amt,aalborg aalborg amt,,,,,
461,nordrup sogn,83,360,870,536,0,789,2638,"nordrup sogn, sorø amt",sorø amt,,,,,
523,møgeltønder (kongerigsk),893,950,546,0,0,0,2389,møgeltønder (kongerigsk),møgeltønder (kongerigsk),,,,,
566,taars,374,94,19,1050,0,753,2290,taars sogn hjørring amt,hjørring amt,,,,,


In [139]:
miss[~miss['fødested_data'].apply(lambda x: 'sogn' in x or ' amt' in x or ' by' in x or ' pr ' in x)].head(50)

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total,fødested_org,fødested_amt_data,navn,simplename,art,enhedid,enhedtype
18,do,10,29824,439,5187,0,145,35605,do,do,,,,,
39,-,1562,491,1054,1356,457,13869,18789,-,-,,,,,
44,slesvig,287,3368,1780,7331,844,3683,17293,slesvig,slesvig,,,,,
89,veile,330,1970,2729,3388,665,502,9584,veile,veile,,,,,
96,aarhuus,1475,1433,5318,547,391,180,9344,aarhuus,aarhuus,,,,,
206,fakse,1109,1725,1919,42,8,122,4925,fakse,fakse,,,,,
523,møgeltønder (kongerigsk),893,950,546,0,0,0,2389,møgeltønder (kongerigsk),møgeltønder (kongerigsk),,,,,
566,taars,374,94,19,1050,0,753,2290,taars sogn hjørring amt,hjørring amt,,,,,
595,saaby,252,842,864,228,4,8,2198,saaby,saaby,,,,,
644,kiel,424,282,680,341,223,115,2065,kiel,kiel,,,,,


In [136]:
miss[miss['fødested_data'].apply(lambda x: ' pr ' in x )]

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total,fødested_org,fødested_amt_data,navn,simplename,art,enhedid,enhedtype
14518,nordby pr ribe,0,0,0,0,0,54,54,nordby pr ribe,nordby pr ribe,,,,,
17189,rødding pr skive,1,0,0,2,0,38,41,rødding pr skive,rødding pr skive,,,,,
18607,rask mark pr rask st,0,0,0,0,0,36,36,rask mark pr rask st,rask mark pr rask st,,,,,
21497,resen pr skive,0,0,0,3,0,26,29,resen pr skive,resen pr skive,,,,,
23101,seest pr kolding,0,0,3,4,0,19,26,seest pr kolding,seest pr kolding,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543979,jersi pr roskilde,0,1,0,0,0,0,1,jersi pr roskilde,jersi pr roskilde,,,,,
544007,jersie pr kjøge,0,0,0,1,0,0,1,jersie pr kjøge,jersie pr kjøge,,,,,
544107,jerslev pr køge,0,0,0,1,0,0,1,jerslev pr køge,jerslev pr køge,,,,,
544108,jerslev pr kallundborg,0,0,0,0,0,1,1,jerslev pr kallundborg,jerslev pr kallundborg,,,,,


In [132]:
miss.tail(50)

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total,fødested_org,fødested_amt_data,navn,simplename,art,enhedid,enhedtype
543976,jersi sogn,0,0,0,0,0,1,1,"jersi sogn, roskilde amt",roskilde amt,,,,,
543977,jersi sogn,0,0,1,0,0,0,1,"jersi sogn, kbhvns a",kbhvns a,,,,,
543979,jersi pr roskilde,0,1,0,0,0,0,1,jersi pr roskilde,jersi pr roskilde,,,,,
543980,jersi by roskilde amt,0,0,0,0,0,1,1,jersi by roskilde amt,jersi by roskilde amt,,,,,
543982,jersgaard glud,0,0,0,0,0,1,1,jersgaard glud sogn vejle amt,vejle amt,,,,,
543984,jersev sogn,0,0,0,1,0,0,1,"jersev sogn, holbech amt",holbech amt,,,,,
543985,jersie i hkeks amt,0,0,0,1,0,0,1,jersie i hkeks amt,jersie i hkeks amt,,,,,
543986,jersie kjøbenhavn amt [roskilde amt],0,0,0,0,0,1,1,jersie kjøbenhavn amt [roskilde amt],jersie kjøbenhavn amt [roskilde amt],,,,,
544000,jersie københavn amt,0,0,0,1,0,0,1,jersie københavn amt,jersie københavn amt,,,,,
544004,jersie s kjøbh [jersie,0,0,0,0,0,1,1,jersie s kjøbh [jersie sogn københavns amt],københavns amt],,,,,


In [133]:
0/0

ZeroDivisionError: division by zero

In [None]:
distance.get_jaro_distance('sverrig', 'sverige')

## Working with the missmatches after jaro

### Some of the missmatches have sogn in the name, so I will do another jaro for "XXXX sogn"

In [140]:
to_split = miss[miss['fødested_data'].apply(lambda x: 'sogn' in x or ' amt' in x or ' by' in x or ' pr ' in x)]

def get_sogn_v2(s):
    
    #Doing the splitting part
    for elm in ['sogn', ' by ', ' pr ']:
        if elm in s:
            return s.split(elm)[0].rstrip()
        
    # if there is nothing to split
    if ' amt' in s:
        return s.split()[0].rstrip()

to_split[focus_column+'_data'] = to_split[focus_column+'_data'].apply(get_sogn_v2)
to_split.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total,fødested_org,fødested_amt_data,navn,simplename,art,enhedid,enhedtype
289,aalborg,0,3726,2,25,0,118,3871,aalborg aalborg amt,aalborg aalborg amt,,,,,
461,nordrup,83,360,870,536,0,789,2638,"nordrup sogn, sorø amt",sorø amt,,,,,
619,kjøng,149,168,197,1529,0,83,2126,"kjøng sogn, odense amt",odense amt,,,,,
682,veilby,86,56,49,1655,0,141,1987,"veilby sogn, odense amt",odense amt,,,,,
902,egtved,8,0,1,67,0,1524,1600,egtved vejle amt,egtved vejle amt,,,,,


In [141]:
def compute_jaro(a, bb, i):
    o=[]
    for b in bb:
        try:
            o.append(distance.get_jaro_distance(a, b))
        except:
            o.append(0)
    return i, o


mm = to_split[focus_column+'_data'].unique()
sn = dd[dd.simplename.str[-5:] == ' sogn'].simplename.unique() # Using the short version of dig dag to make it much faster (>x10)
distance_jaro = np.zeros((len(sn),len(mm)))


with multiprocessing.Pool(processes=n_cores) as pool:
    for i , result in pool.starmap(compute_jaro, zip(sn, [mm]*len(sn), range(len(sn)))):
        if i%100 == 0: print(i, 'out of' , len(sn))
        distance_jaro[i,:] = result

        
#Index is the reference names found in digdag the columns the original data
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
print(distance_jaro.shape, 'len mm,sn:', len(mm), len(sn)) #(26428, 1237)
del mm, sn
distance_jaro.head()

0 out of 2642
100 out of 2642
200 out of 2642
300 out of 2642
400 out of 2642
500 out of 2642
600 out of 2642
700 out of 2642
800 out of 2642
900 out of 2642
1000 out of 2642
1100 out of 2642
1200 out of 2642
1300 out of 2642
1400 out of 2642
1500 out of 2642
1600 out of 2642
1700 out of 2642
1800 out of 2642
1900 out of 2642
2000 out of 2642
2100 out of 2642
2200 out of 2642
2300 out of 2642
2400 out of 2642
2500 out of 2642
2600 out of 2642
(2642, 52176) len mm,sn: 52176 2642


Unnamed: 0,aalborg,nordrup,kjøng,veilby,egtved,aistrup,kjerte,kjærum,hillerød,gjelsted,...,jersil,jersige by og,jersie by og,roeskilde [jersie,jersev,jerskou,jerskov,jersl,jerslev s do amt [jerslev,…sløv
flakkebjerg sogn,0.43,0.0,0.0,0.49,0.41,0.4,0.56,0.41,0.52,0.41,...,0.0,0.61,0.51,0.52,0.41,0.4,0.4,0.0,0.52,0.42
frøslev sogn,0.41,0.41,0.43,0.42,0.33,0.32,0.33,0.42,0.44,0.43,...,0.58,0.49,0.61,0.5,0.67,0.48,0.56,0.62,0.58,0.63
kirke såby sogn,0.4,0.4,0.48,0.32,0.41,0.47,0.51,0.54,0.41,0.46,...,0.56,0.75,0.77,0.53,0.46,0.45,0.45,0.42,0.44,0.0
kirke stillinge sogn,0.4,0.4,0.48,0.31,0.41,0.46,0.58,0.53,0.4,0.51,...,0.54,0.56,0.5,0.57,0.44,0.42,0.42,0.42,0.5,0.0
herlev sogn,0.46,0.41,0.0,0.51,0.48,0.41,0.59,0.42,0.58,0.54,...,0.59,0.63,0.69,0.46,0.68,0.57,0.56,0.62,0.6,0.53


In [None]:
#STOP HERE

In [142]:
#Getting the highest match for each Sogne and filtering threshold
a = distance_jaro.idxmax().reset_index(name = 'potential_match')
b = distance_jaro.max().reset_index(name='jaro')
dfmap = a.merge(b, on='index').rename(columns={'index':'data_name'})
dfmap.to_csv(out_folder+'mapping_birthplaces.tsv', sep='\t', index=False)
dfmap = dfmap[dfmap.jaro.astype(float) >= THRESHOLD]
dfmap.head()
del a
del b

#Getting the mapped info into DigDag v2 (not really the v2)
dd3 = dfmap.merge(dd2[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename', keep='first'), left_on='potential_match', right_on='simplename')
dd3['simplename'] = dd3['data_name']
dd3 = dd3[['navn','enhedid','art','simplename']]
dd3 = pd.concat([dd3[['navn','simplename', 'art', 'enhedid']], dd2], sort=False) #putting together
dd3.head()

Unnamed: 0,navn,simplename,art,enhedid,enhedtype
0,Egtved Sogn,egtved,sogn,113178,
1,Hillerød Sogn,hillerød,sogn,113541,
2,Hillerød Sogn,hillers,sogn,113541,
3,Hillerød Sogn,hilleød,sogn,113541,
4,Hillerød Sogn,hillersø,sogn,113541,


In [None]:
# Starting again with the merges
out = out_list.merge(dd3, left_on=focus_column+'_data', right_on='simplename', how='left')

#Those are the missmatch
miss = out[out.enhedid.isna()]
miss.head()

#Here are the match
match = out[~out.enhedid.isna()]
match.head()

# Aggregating the match by enhedid and year
agg_counts = match.groupby(['navn', 'enhedid', 'art'],as_index = False).agg({'1845':'sum', '1850':'sum','1860':'sum', '1880':'sum','1885':'sum', '1901':'sum'})

# Grouping all enhedid to see how many different versions of the same Sogne we have in the data
diff_counts = match.groupby(['navn','enhedid'])[focus_column+'_data'].apply(set).reset_index(name='extended_digdag') #if parenthesis is confusing add **.apply(lambda x: ', '.join(x))** before the reset index

# Joining DFs and saving
s = agg_counts.merge(diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_jaro0.9_02.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

In [None]:
dd_org[dd_org.simplename.str[:5] == 'køben']

In [None]:
distance.get_jaro_distance('kjøbenhavn', 'københavn')

In [None]:
print(miss.shape)
miss.head(50)

In [None]:

0/0


# Adding DigDag info and saving 

In [None]:
# Getting the digDag info for the matches but also for the non matches

m = concatenate.merge(dd2[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename',keep='first'), left_on='potential_match', right_on='simplename', how='inner')
m = m[['original','potential_match','score','navn','art','enhedid', 'simplename']]

w = dd2.sort_values('art', ascending=False).drop_duplicates('simplename',keep='first')
w['potential_match'] = w['simplename']
w['original'] = w['simplename']
w['score'] = 1

m = pd.concat([w, m], sort=True)


#Putting the things into place
m['original'] = np.where(m.original.isna(), m['simplename'], m['original'])

print(len(concatenate), len(m))
m.head()

In [None]:
#Getting the counts for the original plus the counts for the potential matches in DigDag
#Warning: Potential matches may not exist in the data...
out = m.merge(out_list, left_on='original', right_on=focus_column+'_data', how='left').merge(out_list, left_on='potential_match', right_on=focus_column+'_data', how='left').fillna(0)
for y in ['1845', '1850', '1860', '1880', '1885', '1901']:
    out[y] = out[y+'_x'] + out[y+'_y']
out = out[['original', 'potential_match', 'score', 'navn', 'art', 'enhedid', '1845', '1850', '1860', '1880', '1885', '1901']].sort_values('original')
out.to_csv(_path+'../out/birthplaces_possible_matches_jaro_01.tsv', sep='\t', index=False)

In [None]:
print('Done! :D')

## Generating file to check the validity of the mapping jaro 0.9

In [None]:
import random
import pandas as pd

df = pd.read_csv(out_folder+'birthplaces_FT_jaro0.9_02.tsv', sep = '\t', dtype=str)

max_example = 5

names    = df.navn.tolist()
extendeds = df.extended_digdag.tolist()

out = []
for name, extended in zip(names, extendeds):
    extended = list(eval(extended))
    #m = min(len(extended), max_example)
    samp = random.sample(extended, k= min(len(extended), max_example))
    for s in samp:
        out.append([name, s])

dfout = pd.DataFrame(out, columns= ['DigDag', 'matches (0.9)'])
dfout.to_csv(out_folder+'validity_jaro.tsv', sep='\t', index=False)

! cp $out_folder/validity_jaro.tsv /data/import/roc/validity_jaro.tsv
! cp $out_folder/validity_jaro.tsv /home/roregu/workspace/git_out/downloading_stuff/validity_jaro.tsv

print('Saved!')

In [None]:
pd.read_csv(out_folder+'birthplaces_FT_jaro0.9_02.tsv', sep = '\t', dtype=str)