In [1]:
import pandas as pd

df_person = pd.read_excel("Search_collection_id_9327_schema_Person.xlsx")
df_address = pd.read_excel("Search_collection_id_9327_schema_Address.xlsx")

print("Person columns:", df_person.columns.tolist())
print("Address columns:", df_address.columns.tolist())

print("\nSample Person rows:")
print(df_person.head())

print("\nSample Address rows:")
print(df_address.head())

Person columns: ['ID', 'url', 'collection', 'Name', 'E-Mail', 'Last name', 'Phone', 'Birth date', 'Nationality', 'Address', 'Bureau van Dijk ID', 'Classification', 'Country', 'Country of birth', 'Country of origin', 'Created at', 'D-U-N-S', 'Death date', 'Description', 'Dissolution date', 'Education', 'Ethnicity', 'First name', 'Gender', 'ICIJ ID', 'ID Number', 'INN', 'Incorporation date', 'Jurisdiction', 'Keywords', 'LEI', 'Legal form', 'Matronymic', 'Middle name', 'Modified on', 'Name suffix', 'Notes', 'OGRN', 'OKPO', 'OpenCorporates URL', 'Other name', 'Passport number', 'Patronymic', 'Place of birth', 'Political association', 'Position', 'Previous name', 'Program', 'Publishing source', 'Publishing source URL', 'Registration number', 'Religion', 'Retrieved on', 'SWIFT/BIC', 'Second name', 'Sector', 'Source link', 'Status', 'Summary', 'Tax Number', 'Tax status', 'Title', 'Topics', 'V.A.T. Identifier', 'Weak alias', 'Website', 'Wikidata ID', 'Wikipedia Article']
Address columns: ['ID'

In [2]:
# Number of rows
print("People:", len(df_person))
print("Addresses:", len(df_address))

# Sample addresses from both
print("\nPerson address samples:")
print(df_person['Address'].dropna().head(10))

print("\nAddress full address samples:")
print(df_address['Full address'].dropna().head(10))

# Try fuzzy matching overlap
person_addresses = set(df_person['Address'].dropna().unique())
full_addresses = set(df_address['Full address'].dropna().unique())

matches = person_addresses.intersection(full_addresses)
print(f"\nExact matches found: {len(matches)}")
print(list(matches)[:20])


People: 4472
Addresses: 1332

Person address samples:
1                                               IRAK
2                    Cluj-Napoca, jud. CLUJ, ROMANIA
4                     Focsani, jud. VRANCEA, ROMANIA
5                   Jilavele, jud. IALOMITA, ROMANIA
6                  Aiudul de Sus, jud. ALBA, ROMANIA
7                  Piatra Neamt, jud. NEAMT, ROMANIA
8                 Baia Mare, jud. MARAMURES, ROMANIA
9                  Beica de Jos, jud. MURES, ROMANIA
10                      Braila, jud. BRAILA, ROMANIA
11    Drobeta-Turnu Severin, jud. MEHEDINTI, ROMANIA
Name: Address, dtype: object

Address full address samples:
0            Pacureti, jud. PRAHOVA, ROMANIA
1               Adunati, jud. ARGES, ROMANIA
2                Niuved, jud. BIHOR, ROMANIA
3            Bunea Mare, jud. TIMIS, ROMANIA
4                Uiasca, jud. ARGES, ROMANIA
5             Sacel, jud. MARAMURES, ROMANIA
6           Urechesti, jud. VRANCEA, ROMANIA
7             Nejlovelu, jud. ARGES, ROMA

In [10]:
import pandas as pd

# Load
df_person = pd.read_excel("Search_collection_id_9327_schema_Person.xlsx")
df_address = pd.read_excel("Search_collection_id_9327_schema_Address.xlsx")

# Drop NaNs so we don’t join on empty strings
df_person = df_person.dropna(subset=["Address"])
df_address = df_address.dropna(subset=["Full address"])

# Merge on the shared address text
df_merged = df_person.merge(
    df_address,
    left_on="Address",
    right_on="Full address",
    suffixes=("_person", "_addr")
)

print(f"Merged dataset has {len(df_merged)} rows")

# Check merged sample
print(df_merged[["Name_person", "Address_person", "Latitude", "Longitude"]].head())

Merged dataset has 3564 rows
                  Name_person                     Address_person  Latitude  \
0  KASHAN SALHLADIN ABDUSALAM                               IRAK       NaN   
1                  RAPAS IOAN    Cluj-Napoca, jud. CLUJ, ROMANIA       NaN   
2   FRATILA CRISTINEL CIPRIAN     Focsani, jud. VRANCEA, ROMANIA       NaN   
3      STANCIU NEDEA PETRISOR   Jilavele, jud. IALOMITA, ROMANIA       NaN   
4               DRAGOI STEFAN  Aiudul de Sus, jud. ALBA, ROMANIA       NaN   

   Longitude  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  


In [11]:
print("Unique person addresses:", df_person["Address"].nunique())
print("Unique address full addresses:", df_address["Full address"].nunique())
print("Unique matches:", df_person["Address"].isin(df_address["Full address"]).sum())


Unique person addresses: 1189
Unique address full addresses: 1332
Unique matches: 3564


In [12]:
# Check how many people per address
df_counts = df_merged.groupby("Full address")["ID_person"].nunique().reset_index()
df_counts = df_counts.sort_values("ID_person", ascending=False)

print(df_counts.head(10))


                        Full address  ID_person
912     Sector 3, BUCURESTI, ROMANIA        209
911     Sector 2, BUCURESTI, ROMANIA        131
914     Sector 5, BUCURESTI, ROMANIA        130
910     Sector 1, BUCURESTI, ROMANIA         84
913     Sector 4, BUCURESTI, ROMANIA         74
1052  Timisoara, jud. TIMIS, ROMANIA         73
915     Sector 6, BUCURESTI, ROMANIA         70
501         Iasi, jud. IASI, ROMANIA         65
145     Brasov, jud. BRASOV, ROMANIA         61
142     Braila, jud. BRAILA, ROMANIA         52
