# Exploring Coincidences with a Cross-Verified database of Notable people

***Student:*** Lais Isabelle ALVES DOS SANTOS

## Libraries

In [41]:
# Libraries
import pandas as pd
from geopy.geocoders import Nominatim

In [45]:
# Instances
geolocator = Nominatim(user_agent="coincidences", timeout=10) 

## Data loading and pre-processing

In [60]:
df = pd.read_csv('../database/cross-verified-database.csv', 
                 encoding='latin-1',
                 delimiter=',',
                 usecols=['name', 'birth', 'death', 'level1_main_occ', 'gender', 'bplo1', 'dplo1', 'bpla1', 'dpla1']
                 )

In [62]:
# Replace some names to clarify the use of the dataset
df.rename(columns={"level1_main_occ": "domain", 
                   "bplo1": "birth_longitude", 
                   "dplo1": "death_longitude", 
                   "bpla1": "birth_latitude", 
                   "dpla1": "death_latitude"}, inplace=True)

# Leave only notable people from 'Discovery/Science'
df = df[df.domain == 'Discovery/Science']

# Drop all the rows with nan values
df = df.dropna()

# Reset indices
df.reset_index(drop=True)

Unnamed: 0,birth,death,gender,domain,name,birth_longitude,death_longitude,birth_latitude,death_latitude
0,1818.0,1894.0,Male,Discovery/Science,Joseph_Dienger,7.667770,8.400000,47.957283,49.016666
1,1907.0,1980.0,Female,Discovery/Science,Sylvia_Bayr-Klimpfinger,16.372499,16.372499,48.208332,48.208332
2,1891.0,1962.0,Male,Discovery/Science,Franz_Zimmermann_(Philologe),13.835800,12.375000,51.194901,51.340000
3,1935.0,2009.0,Male,Discovery/Science,Otto_Arndt_Liebisch,12.375000,9.866694,51.340000,52.499973
4,1929.0,2003.0,Male,Discovery/Science,Hans_ElsÃ¤sser,10.093611,8.710000,48.837223,49.412224
...,...,...,...,...,...,...,...,...,...
99579,1720.0,1800.0,Male,Discovery/Science,Jean-Baptiste_Le_Roy,2.351389,2.351389,48.856945,48.856945
99580,1548.0,1606.0,Male,Discovery/Science,Johann_Jakob_RÃ¼eger,8.633860,8.633860,47.696529,47.696529
99581,1934.0,2004.0,Male,Discovery/Science,Erik_FlÃ¼gel,16.083334,11.004444,47.049999,49.596390
99582,1926.0,2011.0,Male,Discovery/Science,Ulrich_Ricken,13.766666,7.031110,54.049999,49.276901


In [80]:
def compute_country(lat, lon):
    location = geolocator.reverse([lat, lon], language='en')
    return location.raw['address']['country']

In [81]:
countries = [compute_country(lat, lon) for lat, lon in zip(df['birth_latitude'], df['birth_longitude'])]

df['birth_country'] = countries

KeyError: 'country'

In [None]:
# Show some data after pre-processing
df.head()

Unnamed: 0,birth,death,gender,domain,name,birth_longitude,death_longitude,birth_latitude,death_latitude
6,1818.0,1894.0,Male,Discovery/Science,Joseph_Dienger,7.66777,8.4,47.957283,49.016666
7,1907.0,1980.0,Female,Discovery/Science,Sylvia_Bayr-Klimpfinger,16.372499,16.372499,48.208332,48.208332
9,1891.0,1962.0,Male,Discovery/Science,Franz_Zimmermann_(Philologe),13.8358,12.375,51.194901,51.34
17,1935.0,2009.0,Male,Discovery/Science,Otto_Arndt_Liebisch,12.375,9.866694,51.34,52.499973
28,1929.0,2003.0,Male,Discovery/Science,Hans_ElsÃ¤sser,10.093611,8.71,48.837223,49.412224


## Compute Unexpectedness

### Description complexity

In [None]:
# Place of birth feature



### Causal complexity