# Birthplace (fødested) cleaning
Here I intend to clean the birthplace (fødested) field in the data. Note that the pre-processing scripts (conversion to UTF-8, etc.) are in city_unification.

## Libraries

In [1]:
#Importing libraries for data management
import pandas as pd
import numpy as np

#Importing libraries for system management
import os

#Distance computation
from pyjarowinkler import distance
#import Levenshtein

In [2]:
#Working directories
# in_folder:   census files listed in working_data are read from here
# out_folder:  result TSV files are written here
# data_folder: DigDag and købstad reference files are read from here
in_folder  = '/data/import/nirama/utf8/'
out_folder = '/home/roregu/workspace/out/'
data_folder = '/home/roregu/workspace/data/'

#Column to process
focus_column = 'fødested'

#The Jaro threshold: minimum similarity for a fuzzy match to be accepted
THRESHOLD = 0.9

#Number of cores (machine = 28)
# NOTE: n_cores is reassigned to 15 in the cell that builds the top5_hits_par.py command lines
n_cores = 20

## Reading DigDag
and expanding with variations

In [3]:
def adding_extra_rows(dd, word, replacement):
    """Return *dd* extended with spelling variants of its 'simplename' column.

    For every row whose 'simplename' contains *word*, one extra row is appended
    in which *word* is replaced by *replacement* (plain substring replacement,
    not a regular expression).

    Parameters
    ----------
    dd : pandas.DataFrame
        Must contain the columns 'navn', 'enhedid', 'enhedtype', 'art' and
        'simplename'. It is NOT modified (the previous version added the helper
        columns 'special' and 'simplename2' to the caller's frame).
    word : str
        Substring to look for.
    replacement : str
        Text that replaces every occurrence of *word* in the extra rows.

    Returns
    -------
    pandas.DataFrame
        The five columns above: all original rows followed by the variant rows.
        The variant rows keep the index labels of the rows they were derived
        from, so the index of the result can contain duplicates.
    """
    base_cols = ['navn', 'enhedid', 'enhedtype', 'art', 'simplename']
    base = dd[base_cols]

    # na=False: missing / non-string names simply produce no variant row
    has_word = base['simplename'].str.contains(word, regex=False, na=False)

    variants = base[has_word].copy()
    variants['simplename'] = variants['simplename'].str.replace(word, replacement, regex=False)

    return pd.concat([base, variants])

In [4]:
# Load the DigDag reference list (tab separated, UTF-16, every column read as str).
# The file seems to contain duplicates of unknown origin; they are dropped further
# down on the (art, simplename) pair.
dd = pd.read_csv(data_folder+'rl_places_digdag_v1.txt', sep = '\t', encoding='utf-16', dtype=str)
# Untouched copy: used below for nr_sogn and later as the short reference list for the Jaro step
dd_org = dd.copy()

# Reading the købstad list and appending it to the DigDag rows
dd2 = pd.read_csv(data_folder+'koebstad.csv', sep = ';', encoding='utf-8', dtype=str)
# Every købstad row gets the constant enhedtype '80'
dd2['enhedtype'] = '80'#dd2['art'] 
# NOTE(review): dd2[~dd2.isna()] indexes with a boolean DataFrame, which masks cells
# (NaN stays NaN) instead of dropping rows - presumably a row filter was intended; confirm.
dd = pd.concat([dd, dd2[~dd2.isna()][['navn','enhedid','enhedtype','art','simplename']]])
# Force str so that the substring checks below work (a missing name becomes the string 'nan')
dd['simplename'] = dd['simplename'].astype(str)
del dd2

# Rows whose enhedtype is 'Købstad' get art 'Købstadskommune'
# NOTE(review): the købstad rows appended above carry enhedtype '80', so this only
# fires if the DigDag file itself uses 'Købstad' as enhedtype - confirm.
dd['art'] = np.where(dd['enhedtype'] == 'Købstad', ['Købstadskommune']*len(dd), dd['art'])

# Adding latin transliterations of the special characters (å -> aa, ø -> oe, æ -> ae) as extra reference rows.
# Each call also sees the rows added by the previous one, so combined variants are generated too.
dd = adding_extra_rows(dd, 'å', 'aa')
dd = adding_extra_rows(dd, 'ø', 'oe')
dd = adding_extra_rows(dd, 'æ', 'ae')

# Mapping art to the lowercase suffix that is appended to the simple name below.
# dict.get returns None for any art value that is not a key of this mapping.
dd['art'] = dd['art'].apply({'Amt':'amt', 'Sogn':'sogn','Købstadskommune':'købstad', 'Geografisk Herred':'herred', 'Processing':''}.get)

# Duplicating every row as "<simplename> <art>" to increase matches.
# str(None) is 'None', so rows with an unmapped art get the suffix ' None' here;
# those rows are removed again by the 'Non' filter further down.
dd['simplename2'] = dd.apply(lambda row: str(row['simplename'])+' '+str(row['art']), axis=1)
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

# Adding the plural købstæder as well
dd = adding_extra_rows(dd, 'købstad', 'købstæder')

# Adding spaces instead of hyphens (new rows)
dd = adding_extra_rows(dd, '-', ' ')

# Removing spaces (new rows)
dd = adding_extra_rows(dd, ' ', '')

# Adding bysogn and landsogn (new rows)
# NOTE(review): the second call also matches the '...bysogn' rows created by the first
# one and turns them into '...bylandsogn' - confirm whether those variants are wanted.
dd = adding_extra_rows(dd, 'sogn', 'bysogn')
dd = adding_extra_rows(dd, 'sogn', 'landsogn')

# Adding rows where a trailing "e" is removed from the name
dd['special'] = dd['simplename'].str[-1] == 'e'
dd['simplename2'] = dd['simplename'].str[:-1]
dd = pd.concat([dd[['navn', 'enhedid', 'enhedtype', 'art', 'simplename']],dd[dd['special']][['navn', 'enhedid', 'enhedtype', 'art', 'simplename2']].rename(columns={'simplename2':'simplename'})])

# Dropping duplicates from DigDag: one row is kept per (art, simplename) pair
#print(dd[dd.duplicated('simplename', keep=False)].sort_values('simplename').head(30))
dd.drop_duplicates(['art','simplename'], keep='first', inplace=True)

# Dropping every name that contains 'Non' (case sensitive): this removes the
# '... None' artifacts produced when the art suffix was appended above
dd = dd[(dd.simplename.apply(lambda x: not 'Non' in x))]

# Adding Sweden (two spellings) as a country entry
# NOTE(review): DataFrame.append was removed in pandas 2.0 - this line needs pd.concat there.
# NOTE(review): enhedid is the int 400000 here, while the rows read from file hold strings.
dd = dd.append({'navn':'Sweden','enhedid':400000,'enhedtype':'90','art':'country','simplename':'sverige'}, ignore_index=True).append({'navn':'Sweden','enhedid':400000,'enhedtype':'90','art':'country','simplename':'sverrig'}, ignore_index=True)

# TODO take into consideration that the same city name can occur in various geographical areas...
# It should be matched with herred to increase accuracy for each record. However, right now we are only cleaning;
# not sure how pressing the issue is...
# Number of distinct simple names among the original (unexpanded) Sogn rows
nr_sogn = len(dd_org[dd_org.art=='Sogn'].simplename.unique()) 
dd.head(20)

#TODO: Right now names can be mapped to e.g. an amt, which should not be the case...
#dd =dd[dd.art='sogn']

Unnamed: 0,navn,enhedid,enhedtype,art,simplename
0,Kronborg Amt,118765,11,amt,kronborg
1,Præstø Amt,118791,11,amt,præstø
2,Bornholms Amt,118792,11,amt,bornholms
3,Svendborg Amt,118813,11,amt,svendborg
4,Ålborg Amt,118819,11,amt,ålborg
5,Sorø Amt,118785,11,amt,sorø
6,Århus Amt,118846,11,amt,århus
7,Hjørring Amt,118820,11,amt,hjørring
8,Bøvling Amt,118851,11,amt,bøvling
9,Vejle Amt,118849,11,amt,vejle


### Adding countries (Barbara's list, but it is missing the countries that existed before the German and Italian unifications)

In [5]:
#TODO

In [6]:
# Size report: original DigDag rows, rows after the expansion, distinct Sogn names
print(
    'length original:\t', len(dd_org),
    '\nlength modified:\t', len(dd),
    '\nunique sogn keys:\t', nr_sogn,
)

length original:	 2519 
length modified:	 26874 
unique sogn keys:	 1924


## Processing functions

In [7]:
def get_df(f_path):
    """Read a '$'-separated text file into a DataFrame of strings.

    Parsed by hand because pandas had problems with some lines of these files.
    The first line supplies the column names; every later line is stripped of
    trailing whitespace, split on '$' and truncated to the header width.
    An empty file gives an empty DataFrame without columns.
    """
    header = None
    rows = []
    with open(f_path) as handle:
        for raw in handle:
            fields = raw.rstrip().split('$')
            if header is None:
                # first line of the file: column names
                header = fields
                continue
            rows.append(fields[:len(header)])

    return pd.DataFrame(data=rows, columns=header if header is not None else [])

In [8]:
def name_cleaner(s, working_column):
    """Return the normalised place name stored in ``s[working_column]``.

    The value is lower-cased, stripped of trailing whitespace and every run of
    spaces is collapsed to a single space. If the result only says "here in the
    parish" ('heri sogn', 'her i sogn', 'her i s.', 'i sognet'), the cleaned
    value of the 'Sogne' column of the same record is returned instead.

    Parameters
    ----------
    s : mapping-like record (e.g. a DataFrame row) indexable by column name.
    working_column : str
        Name of the column to clean.

    Returns
    -------
    str, or ``np.nan`` when the column is missing or its value is not a string.
    """
    try:
        o = s[working_column].lower().rstrip()
    except (AttributeError, KeyError, TypeError, IndexError):
        # missing column or non-string value (NaN / None): nothing to clean
        return np.nan

    # A single replace('  ', ' ') would leave 'a   b' as 'a  b'; repeat until stable
    while '  ' in o:
        o = o.replace('  ', ' ')

    refers_to_parish = ('heri sogn' in o or 'her i sogn' in o
                        or 'her i s.' == o or 'i sognet' == o)
    # The working_column check stops the recursion after one hop
    if refers_to_parish and working_column != 'Sogne':
        return name_cleaner(s, 'Sogne')
    return o

In [9]:
# Lists of census files to work with (with the 1901 collapsed).
# Only `working_data` is read by the loading loop; the two alternative sets are
# kept under their own names instead of being assigned to `working_data` and
# immediately overwritten.

# "_LL" files (original note: "This I did")
working_data_ll = ['ft1845_LL.txt',
 'ft1850_LL.txt',
 'ft1860_LL.txt',
 'ft1880_LL.txt',
 'ft1885_LL.txt',
 'ft1901_LL.txt']

# All census years (original note: "This Nicolai")
working_data_all_years = ['ft1787.txt',
'ft1801.txt',
'ft1803.txt',
'ft1834.txt',
'ft1840.txt',
'ft1845.txt',
'ft1850.txt',
'ft1860.txt',
'ft1880.txt',
'ft1885.txt',
'ft1901.txt']

# The set actually processed (original note: "This Nicolai").
# The later cells hard-code exactly these six years as column names.
working_data = ['ft1845.txt',
'ft1850.txt',
'ft1860.txt',
'ft1880.txt',
'ft1885.txt',
'ft1901.txt']

## Loading data and counting

In [10]:
# Per-file count tables are collected here and concatenated after the loop
out_list = []

# where I save the mismatches and file year (not filled in this cell)
missmatches = []
total_records = 0

for f in working_data:

    print(f)
    # Loading the data
    df = get_df(in_folder+f)

    # Dropping empty rows due to the "mal-conversion" to utf8... The empty rows do contain data originally
    # TODO: Fix the conversion thing (ask Barbara to provide me the UTF-8...)
    # .copy(): the new column below must be written to an independent frame, not to a
    # slice of the unfiltered one (avoids pandas' SettingWithCopyWarning)
    df = df[~df[focus_column].isna() & (df[focus_column] !='')].copy()
    print(len(df))
    total_records = total_records + len(df)

    # Performing the name cleaning, be aware that "her i sogn" is replaced in there
    df[focus_column+'_data'] = df.apply(lambda row: name_cleaner(row, focus_column), axis=1)

    # Reducing dimensionality: one row per cleaned name with its number of records
    out = df.groupby(focus_column+'_data').size().reset_index(name = 'counts')
    # Characters 2-5 of the file name are the census year ('ft1845.txt' -> '1845')
    out['year'] = f[2:6]
    out_list.append(out)


# From here on out_list is a single long table: (cleaned name, counts, year)
out_list = pd.concat(out_list, sort=False)
print('Done! :D')
del df
total_people = out_list.counts.sum()
# Number of (name, year) rows; a name occurring in several years is counted once per year
total_num_places = len(out_list[focus_column+'_data'])
out_list.sort_values('counts',ascending=False)

ft1845.txt
1281746
ft1850.txt
1359777
ft1860.txt
1700886
ft1880.txt
1759664
ft1885.txt
318056
ft1901.txt
2046684
Done! :D


Unnamed: 0,fødested_data,counts,year
19478,kjøbenhavn,131940,1885
53019,kjøbenhavn,85461,1860
70334,kjøbenhavn,69835,1880
74519,københavn,67981,1880
99006,københavn,58674,1901
...,...,...,...
87026,"staun, farstrup sogn, aalborg amt",1,1860
87025,"staun, brandenburg",1,1860
87023,staun sogn,1,1860
87022,staun i svansøe. s??by sogn,1,1860


In [11]:
# Putting it nicely, in a more horizontal table: one row per cleaned name, one column per year.
# pivot() is called with keywords: positional arguments are no longer accepted by pandas 2.x.
out_list = (out_list
            .pivot(index=focus_column+'_data', columns='year', values='counts')
            .fillna(0)
            .astype(int)
            .reset_index()
            .reset_index(drop=True))
# Rebuilding the frame from its values gets rid of the leftover 'year' columns index name
out_list = pd.DataFrame(out_list.values, columns=out_list.columns.tolist())
# Total over the six census years; astype(int) because the rebuilt columns are object dtype.
# Vectorised replacement of the former row-by-row apply.
out_list['total'] = out_list[['1845', '1850', '1860', '1880', '1885', '1901']].astype(int).sum(axis=1)
out_list.sort_values('total', ascending=False, inplace=True)
out_list.total.sum() #Around the 14k first places account for 80% of the people
out_list.head(50)

Unnamed: 0,fødested_data,1845,1850,1860,1880,1885,1901,total
272800,kjøbenhavn,46621,43550,85461,69835,131940,30187,407594
289033,københavn,14221,9331,16227,67981,14500,58674,180934
358825,odense,8196,6485,9269,12991,2967,23096,63004
18133,aalborg,5747,2691,8044,9797,2103,15702,44084
19805,aarhus,4044,4577,1710,3043,1818,23640,38832
222805,horsens,4105,3091,5681,8026,1390,12564,34857
201057,helsingør,1140,5801,6771,7643,2966,7898,32219
102856,do.,10,21321,6,5163,0,0,26500
140134,fredericia,3678,2438,5155,4212,918,8057,24458
376339,randers,1814,4050,7380,3309,1763,4867,23183


## Basic matches

In [12]:
# Exact lookup of every cleaned birthplace in the expanded DigDag list.
# (The whole of dd is used here, i.e. every art, not only the Sogn rows.)
out = pd.merge(
    out_list,
    dd[['navn','simplename', 'art', 'enhedid']],
    how='left',
    left_on=focus_column+'_data',
    right_on='simplename',
)

# Rows without a DigDag id are the mismatches
miss = out[out['enhedid'].isna()]
miss.head()

# Rows with a DigDag id are the matches
match = out[out['enhedid'].notna()]
match.head()

# Summing the yearly counts per DigDag unit
agg_counts = (
    match
    .groupby(['navn', 'enhedid', 'art'], as_index = False)
    .agg(dict.fromkeys(['1845', '1850', '1860', '1880', '1885', '1901'], 'sum'))
)

# Collecting, per unit, the set of spellings found in the data
# (add .apply(lambda x: ', '.join(x)) before reset_index if the set notation is confusing)
diff_counts = (
    match
    .groupby(['navn','enhedid'])[focus_column+'_data']
    .apply(set)
    .reset_index(name='extended_digdag')
)

# Joining both summaries and saving
s = pd.merge(agg_counts, diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_uniquevalues_01.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

2048


## Computing Jaro distances for the missmatches

In [None]:
#Getting the potential first matches

# mm: distinct unmatched birthplace strings (columns of the matrix)
mm = miss[focus_column+'_data'].unique()
# sn: distinct reference names (rows of the matrix)
sn = dd_org.simplename.unique() # Using the short version of dig dag to make it much faster (>x10)

distance_jaro = np.zeros((len(sn),len(mm)))

def compute_jaro(a,b):
    """Return the pyjarowinkler similarity of *a* and *b*, or 0 when it cannot be computed.

    Any error raised by the library (presumably for empty or non-string input -
    confirm against the pyjarowinkler version in use) is treated as "no similarity".
    """
    try:
        return distance.get_jaro_distance(a, b)
    except Exception:
        return 0

for i in range(len(sn)):
    if i%100 == 0: print(i, 'out of' , len(sn))

    # One matrix row per reference name. Computed over mm (not over miss) so that the
    # row length always equals the number of columns, even if miss held repeated names.
    distance_jaro[i] = [compute_jaro(sn[i], x) for x in mm]


#Index is the reference names found in digdag, the columns the original data
distance_jaro = pd.DataFrame(data = distance_jaro, columns = mm, index = sn)
print(distance_jaro.shape, 'len mm,sn:', len(mm), len(sn)) #(26428, 1237)
distance_jaro.head()
del mm, sn

0 out of 2083
100 out of 2083
200 out of 2083
300 out of 2083
400 out of 2083
500 out of 2083
600 out of 2083
700 out of 2083
800 out of 2083


In [None]:
#distance_jaro.to_csv(out_folder+'jaro_birthplaces.tsv', sep='\t', index=True)
#distance_jaro.to_csv('/home/roregu/workspace/tmp/aux.tsv', sep='\t', index=True)

## Computing the best match with a minimum threshold

In [None]:


# For every data name (a column of distance_jaro): the reference name with the
# highest score (idxmax) and that score (max)
a = distance_jaro.idxmax().reset_index(name = 'potential_match')
b = distance_jaro.max().reset_index(name='jaro')
dfmap = a.merge(b, on='index').rename(columns={'index':'data_name'})
# The complete mapping is written to disk BEFORE the threshold filter is applied
dfmap.to_csv(out_folder+'mapping_birthplaces.tsv', sep='\t', index=False)
# Only best matches with a score of at least THRESHOLD are kept from here on
dfmap = dfmap[dfmap.jaro.astype(float) >= THRESHOLD]
dfmap.head()
del a
del b

# Building "DigDag v2" (not really the v2): the accepted fuzzy matches become extra reference rows.
# Per simplename one dd row is used: the first one after a descending sort on art
# (so e.g. 'sogn' sorts before 'amt').
dd2 = dfmap.merge(dd[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename', keep='first'), left_on='potential_match', right_on='simplename')
# The spelling found in the data becomes the lookup key of the new row
dd2['simplename'] = dd2['data_name']
dd2 = dd2[['navn','enhedid','art','simplename']]
dd2 = pd.concat([dd2[['navn','simplename', 'art', 'enhedid']], dd], sort=False) #putting together
dd2.head()

In [None]:
# Same exact lookup as before, now against the reference list extended with the
# accepted Jaro matches (dd2)
out = pd.merge(
    out_list,
    dd2,
    how='left',
    left_on=focus_column+'_data',
    right_on='simplename',
)

# Rows without a DigDag id are the mismatches
miss = out[out['enhedid'].isna()]
miss.head()

# Rows with a DigDag id are the matches
match = out[out['enhedid'].notna()]
match.head()

# Summing the yearly counts per DigDag unit
agg_counts = (
    match
    .groupby(['navn', 'enhedid', 'art'], as_index = False)
    .agg(dict.fromkeys(['1845', '1850', '1860', '1880', '1885', '1901'], 'sum'))
)

# Collecting, per unit, the set of spellings found in the data
# (add .apply(lambda x: ', '.join(x)) before reset_index if the set notation is confusing)
diff_counts = (
    match
    .groupby(['navn','enhedid'])[focus_column+'_data']
    .apply(set)
    .reset_index(name='extended_digdag')
)

# Joining both summaries and saving
s = pd.merge(agg_counts, diff_counts, on = ['navn','enhedid'])
s.to_csv(out_folder+'birthplaces_FT_jaro0.9_01.tsv', sep='\t', index=False)
print(len(s))
s.head()
del s, agg_counts, diff_counts

## Working with the missmatches after jaro

### Splitting commas and trying to match

In [None]:
# Bug check: 'kjøbenhavn' should not be among the mismatches.
# Remove this cell once the result is empty.
miss.loc[miss[focus_column+'_data'].eq('kjøbenhavn')]

In [None]:
# Unmatched birthplace strings that contain a comma
commas = miss.loc[miss[focus_column+'_data'].map(lambda place: ',' in place)]
print('This many people have commas', commas.total.sum())
commas.head()

In [None]:
# First 50 unmatched names that contain a comma
commas.head(50)

In [None]:
### See what couldn't be matched

In [None]:
# First rows of the names that are still unmatched
miss.head()

In [None]:
# Displays the whole mapping table
# NOTE(review): consider dfmap.head() to keep the output short
dfmap

## Getting counts for other possible matches (broke here)

In [None]:
# It is on top: distance_jaro.to_csv('/home/roregu/workspace/tmp/aux.tsv')

In [None]:
# Raises ZeroDivisionError - presumably on purpose, so that "Run All" stops here
# and does not continue into the unfinished cells below; confirm before removing.
0/0

In [None]:
# Get the best possible matches: the 5 highest scoring reference names per data name

concatenate = []
for i, col in enumerate(distance_jaro.columns, start=1):
    if i % 100 == 0: print(i, 'out of', len(distance_jaro.columns))
    aux = distance_jaro.nlargest(5, col)[[col]]
    aux.columns = ['score']
    aux['original'] = col
    concatenate.append(aux)

concatenate = pd.concat(concatenate, sort=False)
concatenate = concatenate.reset_index()
concatenate.columns = ['potential_match','score','original']

# Data names that already have at least one candidate with score >= THRESHOLD
already_mapped = concatenate[(concatenate.score >= THRESHOLD) & ~concatenate.original.isna()].original.unique()

# Only the candidates of the remaining names are saved.
# The file is written to out_folder (the former `_path` variable is not defined in
# this notebook), which is also where it is read back from on the next line.
concatenate[['original','potential_match','score']][~concatenate.original.isin(already_mapped)].to_csv(out_folder+'birthplaces_possible_matches_jaro_01.tsv', index=False, sep='\t')

concatenate = pd.read_csv(out_folder+'birthplaces_possible_matches_jaro_01.tsv',  sep='\t')
concatenate.head(15)

In [None]:
#TODO: If running this remember to uncomment few cells above so it saves the new files

# Builds the shell command lines that run top5_hits_par.py over consecutive column
# ranges, four commands chained with ';' per printed line
n_cores = 15
jump = 606466 // n_cores  # 606466 = len(distance_jaro.columns)
a = 4   # commands still to be added to the current line
s = ''
for i in range(0, 606466, jump):

    if not a:
        # current line is full: emit it and start a new one
        print(s)
        s, a = '', 4
    s += 'python top5_hits_par.py ' + str(i) + ' ' + str(i + jump) + ' ; '
    a -= 1

print(s) #This is how I make sure that my own processes don't get killed and I do not need to monitor

In [None]:
# Displays the command string for the current values of i and jump
# (after the loop above: the last range that was generated)
'python top5_hits_par.py '+str(i)+' '+str(jump+i)+' ; '

In [None]:
#This is the script I used to "parallelize" the selection of the top 5 matches per potential matches
# It reads its column range from sys.argv and is invoked as
# `python top5_hits_par.py <from> <to>` by the generated command lines.
# NOTE(review): presumably kept here for reference only - executed as a notebook cell
# it would read the kernel's argv and overwrite THRESHOLD / distance_jaro / concatenate.
import pandas as pd
import numpy as np
import sys
import os

_from = int(sys.argv[1])   # first column position of this chunk
_to = int(sys.argv[2])     # one past the last column position of this chunk
_top = 5                   # number of best candidates kept per data name
THRESHOLD = 0.9

_out_file = '/home/roregu/workspace/tmp/concatenate_'+str(_from)+'_'+str(_to)+'.tsv'

# Chunk already processed by an earlier run: nothing to do
if os.path.isfile(_out_file):
    print('done')
    # sys.exit instead of the interactive-only exit() helper
    sys.exit()

# Column 0 holds the reference names (index); the other positions are data names.
# NOTE(review): the notebook comments mention /home/roregu/workspace/tmp/aux.tsv as the
# place where the matrix is saved, while this reads it from the parent folder - confirm the path.
distance_jaro = pd.read_csv('/home/roregu/workspace/aux.tsv', sep = '\t', usecols=[0]+list(np.arange(_from,_to)), index_col=0 )

concatenate = []
for col in distance_jaro.columns:
    aux = distance_jaro.nlargest(_top, col)[[col]]
    aux.columns = ['score']
    aux['original'] = col
    concatenate.append(aux)

concatenate = pd.concat(concatenate, sort=False)
concatenate = concatenate.reset_index()
concatenate.columns = ['potential_match','score','original']

# Only data names without any candidate scoring >= THRESHOLD are written out
concatenate[['original','potential_match','score']][~concatenate.original.isin(concatenate[(concatenate.score >= THRESHOLD) & ~concatenate.original.isna()].original.unique())].to_csv(_out_file, index=False, sep='\t')

print('done_'+str(_from)+'_'+str(_to))

sys.exit()

In [None]:
# Raises ZeroDivisionError - presumably on purpose, so that "Run All" stops here;
# confirm before removing.
0/0

In [None]:
# Reading every file found under the tmp folder and stacking them into one frame
# NOTE(review): every file is read, whatever its name - make sure the folder only
# holds the concatenate_<from>_<to>.tsv chunk files.
concatenate = []
for directory, _, files in os.walk('/home/roregu/workspace/tmp/'):
    for file in files:
        # os.path.join: for sub-directories os.walk yields paths without a trailing
        # separator, so plain string concatenation would build a wrong path
        concatenate.append(pd.read_csv(os.path.join(directory, file), sep = '\t', dtype=str))

concatenate = pd.concat(concatenate, sort=False)
concatenate.head()

In [None]:
# DigDag information for the candidate matches, plus one row for every known name

# Candidate rows: attach unit name / art / id of the proposed reference name
# (one dd2 row per simplename, the first after a descending sort on art)
m = pd.merge(
    concatenate,
    dd2[['navn','simplename', 'art', 'enhedid']].sort_values('art', ascending=False).drop_duplicates('simplename',keep='first'),
    how='inner',
    left_on='potential_match',
    right_on='simplename',
)[['original','potential_match','score','navn','art','enhedid', 'simplename']]

# Known names: every reference spelling is a perfect match (score 1) of itself
w = (
    dd2
    .sort_values('art', ascending=False)
    .drop_duplicates('simplename',keep='first')
    .assign(
        potential_match=lambda d: d['simplename'],
        original=lambda d: d['simplename'],
        score=1,
    )
)

m = pd.concat([w, m], sort=True)


# Fall back to simplename wherever 'original' is missing
m['original'] = np.where(m.original.isna(), m['simplename'], m['original'])

print(len(concatenate), len(m))
m.head()

In [None]:
# Getting the counts for the original spelling plus the counts for the potential match in DigDag
# Warning: Potential matches may not exist in the data... (their counts are then 0 after fillna)
out = (m
       .merge(out_list, left_on='original', right_on=focus_column+'_data', how='left')
       .merge(out_list, left_on='potential_match', right_on=focus_column+'_data', how='left')
       .fillna(0))
# The two merges produce <year>_x (counts of 'original') and <year>_y (counts of 'potential_match')
# NOTE(review): for rows where original == potential_match both merges hit the same
# out_list row, so the summed count is twice the real one - confirm this is intended.
for y in ['1845', '1850', '1860', '1880', '1885', '1901']:
    out[y] = out[y+'_x'] + out[y+'_y']
out = out[['original', 'potential_match', 'score', 'navn', 'art', 'enhedid', '1845', '1850', '1860', '1880', '1885', '1901']].sort_values('original')
# Written to out_folder: the former `_path` variable is not defined in this notebook.
# Note that this overwrites the candidate list saved earlier under the same file name.
out.to_csv(out_folder+'birthplaces_possible_matches_jaro_01.tsv', sep='\t', index=False)

In [None]:
# Completion marker for the cells above
print('Done! :D')

In [None]:
import sys
sys.path.append('/data/import/nirama/cython/')
import jw_matrix


In [None]:
# Checking the validity of jaro 0.9

In [None]:
import ast
import random
import pandas as pd

# Draws up to max_example spellings per DigDag unit from the Jaro 0.9 result file,
# so that the quality of the accepted matches can be checked by hand
df = pd.read_csv(out_folder+'birthplaces_FT_jaro0.9_01.tsv', sep = '\t', dtype=str)

max_example = 5

# Private generator with a fixed seed: the sample is the same on every run and the
# global random state is left untouched
rng = random.Random(0)

names    = df.navn.tolist()
extendeds = df.extended_digdag.tolist()

out = []
for name, extended in zip(names, extendeds):
    # extended_digdag holds the text form of a Python set; literal_eval parses it
    # without executing arbitrary code (unlike eval). Sorted so that the sample does
    # not depend on set iteration order.
    extended = sorted(ast.literal_eval(extended))
    samp = rng.sample(extended, k= min(len(extended), max_example))
    for s in samp:
        out.append([name, s])

dfout = pd.DataFrame(out, columns= ['DigDag', 'matches (0.9)'])
dfout.to_csv(out_folder+'validity_jaro.tsv', sep='\t', index=False)

# Shell: copies the validity sample written above to /data/import/roc/
! cp $out_folder/validity_jaro.tsv /data/import/roc/validity_jaro.tsv

In [None]:
# Shell: lists /data/import/roc/ (the destination of the copy above)
! ls /data/import/roc/

In [None]:
# NOTE(review): presumably a leftover progress marker - confirm and remove
print('D')

In [None]:
# (rows, columns) of the validity sample
dfout.shape

In [None]:
# List of non mapped 0.9

In [None]:
# Best reference name and best score per data name, this time WITHOUT the
# threshold filter (the rows below the threshold are inspected in the next cell)
a = distance_jaro.idxmax().reset_index(name = 'potential_match')
b = distance_jaro.max().reset_index(name='jaro')
dfmap = pd.merge(a, b, on='index').rename(columns={'index':'data_name'})
dfmap.head()

In [None]:
# Data names whose best score stays below the acceptance threshold
# (THRESHOLD instead of a second hard-coded 0.9, so both stay in sync)
dfmap[dfmap.jaro < THRESHOLD]