# Exploring Coincidences with a Cross-Verified Database of Notable People

***Student:*** Lais Isabelle ALVES DOS SANTOS

## Libraries

In [61]:
# Libraries
import importlib
import pandas as pd
import functions as F

from math import log

In [57]:
# Re-import the local `functions` module (bound to `F`) so that edits made to
# functions.py after the first import take effect in this session.
# Only needs to be run when functions.py has changed.
importlib.reload(F)

<module 'functions' from 'd:\\Familia\\Lais\\Intercâmbio\\Télécom\\2A\\Creneaux D\\IA225\\mini_project\\ai225-mini-project-coincidences\\functions.py'>

## Data loading and pre-processing

In [58]:
# Read the cross-verified database, keeping only the columns this notebook uses.
df = pd.read_csv(
    '../database/cross-verified-database.csv',
    sep=',',
    encoding='latin-1',
    usecols=[
        'name',
        'birth',
        'death',
        'level1_main_occ',
        'gender',
        'bplo1',
        'dplo1',
        'bpla1',
        'dpla1',
        'citizenship_1_b',
        'level3_main_occ',
        'sum_visib_ln_5criteria',
    ],
)

In [59]:
# Give the terse dataset columns readable names for the rest of the notebook.
df.rename(columns={"level1_main_occ": "domain",
                   "bplo1": "birth_longitude",
                   "dplo1": "death_longitude",
                   "bpla1": "birth_latitude",
                   "dpla1": "death_latitude",
                   "citizenship_1_b": "country",
                   "level3_main_occ": "main_area",
                   "sum_visib_ln_5criteria": "popularity"}, inplace=True)

# Keep only notable people from 'Culture' who are actors or musicians and whose
# country is the US or France. The filtered frame is copied so that the column
# assignment below writes to an independent frame rather than to a slice.
selected = (
    (df['domain'] == 'Culture')
    & df['main_area'].isin(['actor', 'music'])
    & df['country'].isin(['US', 'France'])
)
df = df[selected].copy()

# Rescale popularity by 1 / ln(2).
# NOTE(review): the source column name ('sum_visib_ln_5criteria') suggests the
# values are natural logarithms, in which case this converts them to log
# base 2 - confirm against the dataset documentation.
df['popularity'] = df['popularity'] / log(2)

# Eliminate rows whose name contains a special character.
# NOTE: the commas inside the character class are literal, so names containing
# ',' are removed as well as those containing '#', '@' or '&'.
# na=False keeps the mask boolean when a name is missing; such rows are
# removed by dropna() just below.
df = df[~df['name'].str.contains('[#,@,&]', na=False)]

# Drop all the rows with NaN values
df = df.dropna()

# Reset indices. reset_index returns a new frame, so the result must be
# assigned back for `df` to actually get a fresh 0..n-1 index.
df = df.reset_index(drop=True)

# Display the cleaned frame as the cell output
df

Unnamed: 0,birth,death,gender,domain,name,main_area,popularity,country,birth_longitude,death_longitude,birth_latitude,death_latitude
0,1933.0,2014.0,Male,Culture,Buddy_Catlett,music,30.519879,US,-118.195618,-122.332069,33.768322,47.606209
1,1929.0,1999.0,Male,Culture,Buddy_Clark,music,28.321994,US,-87.845558,-118.287781,42.582222,34.155834
2,1937.0,2015.0,Male,Culture,Buddy_Emmons,music,35.617373,US,-86.171387,-86.774445,41.667500,36.162224
3,1919.0,2010.0,Male,Culture,Buddy_Hughes,music,22.213124,US,-92.346161,-88.063057,42.492435,41.871113
4,1924.0,2000.0,Male,Culture,Buddy_Jones,music,24.211248,US,-93.592224,-117.208817,33.667778,32.943661
...,...,...,...,...,...,...,...,...,...,...,...,...
13672,1955.0,2008.0,Male,Culture,Buck_Adams,actor,40.137432,US,-118.600281,-118.535828,34.257221,34.228329
13673,1927.0,2017.0,Male,Culture,Buck_Hill_(musician),music,30.237106,US,-77.036667,-76.885399,38.895000,38.994671
13674,1903.0,1955.0,Male,Culture,Buck_Washington,music,28.754349,US,-85.751389,-74.000000,38.256111,40.700001
13675,1935.0,2020.0,Male,Culture,Jean-Laurent_Cochet,actor,35.263958,France,2.436111,2.344447,48.883610,48.892151


## Compute Unexpectedness

In [60]:
# Well-known reference person for each occupation area
music_category, actor_category = "michael jackson", "paul walker"

# Hard-coded hit counts associated with the two reference people
# (presumably search-engine result counts for the names above - confirm)
hints_music = 912_000_000
hints_actor = 370_000_000

# Description complexity of each reference person: log base 2 of the hit count
desc_music, desc_actor = (log(count, 2) for count in (hints_music, hints_actor))

In [None]:
thresh = 75         # a pair is reported only when its unexpectedness exceeds this
version = 'v10'     # label passed to F.write_file with every reported pair

# Extract every person's constants and description complexity once per row.
# The pair loop below reuses them instead of calling F.define_constants and
# F.simplicity_person again for each (i, j) combination.
# NOTE(review): assumes both helpers depend only on their arguments - confirm.
people = []
for row in range(len(df)):
    constants = tuple(F.define_constants(row, df))
    *_, area, hits = constants
    people.append((constants, F.simplicity_person(area, hits, desc_music, desc_actor)))

# Compare every unordered pair (i, j) with i < j
for i, (constants1, desc_p1) in enumerate(people):
    (name1, lat1_b, lat1_d, long1_b, long1_d,
     country1_b, country1_d, area1, hits1) = constants1

    # Generation terms depend only on person i's countries (person j must share
    # them to be compared), so they are computed lazily, at most once per i.
    gene_birth = gene_death = None

    for j in range(i + 1, len(people)):
        constants2, desc_p2 = people[j]
        (name2, lat2_b, lat2_d, long2_b, long2_d,
         country2_b, country2_d, area2, hits2) = constants2

        # Only compare people sharing both country of birth and country of death
        if country1_b != country2_b or country1_d != country2_d:
            continue

        if gene_birth is None:
            gene_birth = F.compute_generation(country1_b)
            gene_death = F.compute_generation(country1_d)

        # Description complexity of the two birth places and the two death places
        desc_birth = F.compute_description(lat1_b, lat2_b, long1_b, long2_b)
        desc_death = F.compute_description(lat1_d, lat2_d, long1_d, long2_d)

        # Unexpectedness: generation terms minus every description term
        unex = gene_birth + gene_death - desc_birth - desc_death - desc_p1 - desc_p2

        # Print and save results if interesting
        if unex <= thresh:
            continue

        # Look each city up once and reuse it for both the saved record and
        # the printed summary.
        city1_b = F.get_city_name(lat1_b, long1_b)
        city2_b = F.get_city_name(lat2_b, long2_b)
        city1_d = F.get_city_name(lat1_d, long1_d)
        city2_d = F.get_city_name(lat2_d, long2_d)

        informations = [
            [name1, name2],
            [country1_b, city1_b, city2_b],
            [country1_d, city1_d, city2_d],
            [area1, area2],
            [unex]
        ]

        print('Names: {} and {}\n'
              'Born in {}, in the villages {} and {}, respectively.\n'
              'Died in {}, in the villages {} and {}, respectively.\n'
              'They were {} and {}, respectively.\n'
              'Unexpectedness = {}\n\n'.format(
                name1, name2,
                country1_b, city1_b, city2_b,
                country1_d, city1_d, city2_d,
                area1, area2,
                unex
        ))

        F.write_file(informations, version)