# Exploring Coincidences with a Cross-Verified Database of Notable People

***Student:*** Lais Isabelle ALVES DOS SANTOS

## Libraries

In [46]:
# Libraries
import numpy as np
import pandas as pd
import pypopulation as pp
import country_converter as coco

from math import log
from geopy import distance as dist
from countryinfo import CountryInfo as ci

## Data loading and pre-processing

In [30]:
# Load only the columns used in this notebook from the cross-verified database
df = pd.read_csv(
    '../database/cross-verified-database.csv',
    encoding='latin-1',
    delimiter=',',
    usecols=[
        'name',
        'birth',
        'death',
        'level1_main_occ',
        'gender',
        'bplo1',
        'dplo1',
        'bpla1',
        'dpla1',
        'area1_of_rattachment',
    ],
)

In [31]:
# Rename the raw columns to clarify the use of the dataset
df = df.rename(columns={"level1_main_occ": "domain",
                        "bplo1": "birth_longitude",
                        "dplo1": "death_longitude",
                        "bpla1": "birth_latitude",
                        "dpla1": "death_latitude",
                        "area1_of_rattachment": "country"})

# Keep only notable people whose domain is 'Leadership'
df = df[df.domain == 'Leadership']

# Remove countries with names out of pattern
df = df[df.country != 'Old_(before_year_1990_AD)_Germany']

# Drop all the rows with nan values
df = df.dropna()

# Reset indices. reset_index returns a new frame, so it must be assigned back;
# otherwise df keeps the sparse labels left over from the filters above.
df = df.reset_index(drop=True)

# Display the cleaned frame
df

Unnamed: 0,birth,death,gender,domain,name,country,birth_longitude,death_longitude,birth_latitude,death_latitude
0,1884.0,1962.0,Male,Leadership,Joseph_C._O'Mahoney,US,-71.033333,-77.094643,42.391666,38.984825
1,1900.0,1972.0,Male,Leadership,Myron_V._George,US,-95.242500,-95.269699,37.568600,37.339199
2,1858.0,1932.0,Male,Leadership,Frank_P._Coburn,US,-91.086700,-91.233055,43.899700,43.813332
3,1907.0,1992.0,Male,Leadership,Meade_Alcorn,US,-98.579498,-72.691109,39.828175,41.983334
4,1844.0,1911.0,Male,Leadership,James_P._Latta,US,-82.315277,-92.462952,40.866943,44.023399
...,...,...,...,...,...,...,...,...,...,...
184288,1918.0,2006.0,Male,Leadership,Charles_Cutler,Australia,148.016663,149.101105,-33.383335,-33.281666
184289,1941.0,2012.0,Male,Leadership,Hans_Angerer_(Beamter),Germany,13.004167,11.575000,47.631390,48.137501
184290,1901.0,1942.0,Male,Leadership,Primo_Longobardo,Italy,9.408330,-30.000000,41.214241,0.000000
184291,1928.0,2018.0,Male,Leadership,John_Hall_Buchanan_Jr.,US,-88.313889,-77.152763,36.301109,39.084000


## Compute Unexpectedness

### Description complexity

In [71]:
def compute_description(lat_p1, lat_p2, long_p1, long_p2, const=20):
    """Return the description complexity, in whole bits, of the distance between two points.

    The distance between (lat_p1, long_p1) and (lat_p2, long_p2) is obtained
    in kilometres from geopy's ``distance.distance``, and the complexity is
    ``log2(1 + distance / const)`` truncated to an integer.

    Parameters
    ----------
    lat_p1, lat_p2 : latitudes of the first and second point.
    long_p1, long_p2 : longitudes of the first and second point.
    const : value dividing the distance before the logarithm (default 20).

    Returns
    -------
    int
        The truncated base-2 logarithm described above.

    The distance and the untruncated logarithm are printed as a side effect.
    """
    separation_km = dist.distance((lat_p1, long_p1), (lat_p2, long_p2)).km
    print('dist: ', separation_km)

    bits = log(1 + separation_km/const, 2)
    print('cdist: ', bits)

    return int(bits)
    

### Generation complexity

In [74]:
def compute_generation(country):
    """Return the generation complexity, in whole bits, associated with a country.

    The country name is converted to an ISO2 code with ``country_converter``,
    its population is looked up with ``pypopulation`` and its area with
    ``CountryInfo``. The result is ``log2(population) + log2(area)`` truncated
    to an integer.

    Parameters
    ----------
    country : country name as stored in the ``country`` column.

    Returns
    -------
    int
        The truncated sum of the two base-2 logarithms.

    Raises
    ------
    ValueError
        If no usable (non-zero) population or area is found for ``country``.

    The two logarithms are printed as a side effect.
    """
    country_iso2 = coco.convert(names=[country], to='ISO2')
    pop = pp.get_population(country_iso2)
    area = ci(country).area()

    # A failed lookup (None) or a zero value would otherwise surface as an
    # opaque TypeError / "math domain error" from log(); name the country instead.
    if not pop or not area:
        raise ValueError(
            f"Cannot compute generation complexity for {country!r}: "
            f"population={pop!r}, area={area!r}"
        )

    pop_bits = log(pop, 2)
    area_bits = log(area, 2)

    print('cpop:', pop_bits)
    print('carea:', area_bits)

    return int(pop_bits + area_bits)

### Unexpectedness

In [77]:
# Array allocated for the unexpectedness values (not filled in by this loop)
unexpectedness = np.zeros(shape=(8,4,4))

for i in range(1):
    print(df['name'].values[i])

    for j in range(3):
        # Skip the person themself and anyone attached to a different country
        if i == j or df['country'].values[i] != df['country'].values[j]:
            continue

        print(df['name'].values[j])

        # Description complexity from the two birth places
        desc = compute_description(
            df['birth_latitude'].values[i],
            df['birth_latitude'].values[j],
            df['birth_longitude'].values[i],
            df['birth_longitude'].values[j],
            const=1,
        )
        # Generation complexity from the shared country
        gene = compute_generation(df['country'].values[i])

        print('desc:', desc)
        print('gene:', gene)
        print('result:', gene-desc)

Joseph_C._O'Mahoney
Myron_V._George
dist:  2128.07606804867
cdist:  11.0560117802654


cpop: 28.290173721908857
carea: 23.198968181339982
desc: 11
gene: 51
result: 40
Frank_P._Coburn
dist:  1635.8159999713694
cdist:  10.676676437354624
cpop: 28.290173721908857
carea: 23.198968181339982
desc: 10
gene: 51
result: 41


Questions:
1. How should the generation complexity be computed when the two people come from different countries? Do we simply sum the generation complexities of the two countries?
2. Should the `const` value, which divides the distance in the description complexity, be taken into account?