In [1]:
import pandas as pd

df_person = pd.read_excel("Search_collection_id_9327_schema_Person.xlsx")
df_address = pd.read_excel("Search_collection_id_9327_schema_Address.xlsx")

print("Person columns:", df_person.columns.tolist())
print("Address columns:", df_address.columns.tolist())

print("\nSample Person rows:")
print(df_person.head())

print("\nSample Address rows:")
print(df_address.head())

Person columns: ['ID', 'url', 'collection', 'Name', 'E-Mail', 'Last name', 'Phone', 'Birth date', 'Nationality', 'Address', 'Bureau van Dijk ID', 'Classification', 'Country', 'Country of birth', 'Country of origin', 'Created at', 'D-U-N-S', 'Death date', 'Description', 'Dissolution date', 'Education', 'Ethnicity', 'First name', 'Gender', 'ICIJ ID', 'ID Number', 'INN', 'Incorporation date', 'Jurisdiction', 'Keywords', 'LEI', 'Legal form', 'Matronymic', 'Middle name', 'Modified on', 'Name suffix', 'Notes', 'OGRN', 'OKPO', 'OpenCorporates URL', 'Other name', 'Passport number', 'Patronymic', 'Place of birth', 'Political association', 'Position', 'Previous name', 'Program', 'Publishing source', 'Publishing source URL', 'Registration number', 'Religion', 'Retrieved on', 'SWIFT/BIC', 'Second name', 'Sector', 'Source link', 'Status', 'Summary', 'Tax Number', 'Tax status', 'Title', 'Topics', 'V.A.T. Identifier', 'Weak alias', 'Website', 'Wikidata ID', 'Wikipedia Article']
Address columns: ['ID'

In [2]:
# Number of rows
print("People:", len(df_person))
print("Addresses:", len(df_address))

# Sample addresses from both
print("\nPerson address samples:")
print(df_person['Address'].dropna().head(10))

print("\nAddress full address samples:")
print(df_address['Full address'].dropna().head(10))

# Try fuzzy matching overlap
person_addresses = set(df_person['Address'].dropna().unique())
full_addresses = set(df_address['Full address'].dropna().unique())

matches = person_addresses.intersection(full_addresses)
print(f"\nExact matches found: {len(matches)}")
print(list(matches)[:20])


People: 4472
Addresses: 1332

Person address samples:
1                                               IRAK
2                    Cluj-Napoca, jud. CLUJ, ROMANIA
4                     Focsani, jud. VRANCEA, ROMANIA
5                   Jilavele, jud. IALOMITA, ROMANIA
6                  Aiudul de Sus, jud. ALBA, ROMANIA
7                  Piatra Neamt, jud. NEAMT, ROMANIA
8                 Baia Mare, jud. MARAMURES, ROMANIA
9                  Beica de Jos, jud. MURES, ROMANIA
10                      Braila, jud. BRAILA, ROMANIA
11    Drobeta-Turnu Severin, jud. MEHEDINTI, ROMANIA
Name: Address, dtype: object

Address full address samples:
0            Pacureti, jud. PRAHOVA, ROMANIA
1               Adunati, jud. ARGES, ROMANIA
2                Niuved, jud. BIHOR, ROMANIA
3            Bunea Mare, jud. TIMIS, ROMANIA
4                Uiasca, jud. ARGES, ROMANIA
5             Sacel, jud. MARAMURES, ROMANIA
6           Urechesti, jud. VRANCEA, ROMANIA
7             Nejlovelu, jud. ARGES, ROMA

In [10]:
import pandas as pd

# Load
df_person = pd.read_excel("Search_collection_id_9327_schema_Person.xlsx")
df_address = pd.read_excel("Search_collection_id_9327_schema_Address.xlsx")

# Drop NaNs so we don’t join on empty strings
df_person = df_person.dropna(subset=["Address"])
df_address = df_address.dropna(subset=["Full address"])

# Merge on the shared address text
df_merged = df_person.merge(
    df_address,
    left_on="Address",
    right_on="Full address",
    suffixes=("_person", "_addr")
)

print(f"Merged dataset has {len(df_merged)} rows")

# Check merged sample
print(df_merged[["Name_person", "Address_person", "Latitude", "Longitude"]].head())

Merged dataset has 3564 rows
                  Name_person                     Address_person  Latitude  \
0  KASHAN SALHLADIN ABDUSALAM                               IRAK       NaN   
1                  RAPAS IOAN    Cluj-Napoca, jud. CLUJ, ROMANIA       NaN   
2   FRATILA CRISTINEL CIPRIAN     Focsani, jud. VRANCEA, ROMANIA       NaN   
3      STANCIU NEDEA PETRISOR   Jilavele, jud. IALOMITA, ROMANIA       NaN   
4               DRAGOI STEFAN  Aiudul de Sus, jud. ALBA, ROMANIA       NaN   

   Longitude  
0        NaN  
1        NaN  
2        NaN  
3        NaN  
4        NaN  


In [11]:
print("Unique person addresses:", df_person["Address"].nunique())
print("Unique address full addresses:", df_address["Full address"].nunique())
print("Unique matches:", df_person["Address"].isin(df_address["Full address"]).sum())


Unique person addresses: 1189
Unique address full addresses: 1332
Unique matches: 3564


In [12]:
# Check how many people per address
df_counts = df_merged.groupby("Full address")["ID_person"].nunique().reset_index()
df_counts = df_counts.sort_values("ID_person", ascending=False)

print(df_counts.head(10))


                        Full address  ID_person
912     Sector 3, BUCURESTI, ROMANIA        209
911     Sector 2, BUCURESTI, ROMANIA        131
914     Sector 5, BUCURESTI, ROMANIA        130
910     Sector 1, BUCURESTI, ROMANIA         84
913     Sector 4, BUCURESTI, ROMANIA         74
1052  Timisoara, jud. TIMIS, ROMANIA         73
915     Sector 6, BUCURESTI, ROMANIA         70
501         Iasi, jud. IASI, ROMANIA         65
145     Brasov, jud. BRASOV, ROMANIA         61
142     Braila, jud. BRAILA, ROMANIA         52


In [14]:
import pandas as pd
import googlemaps
import time
from tqdm import tqdm  # progress bar

# Load merged dataframe
# df_merged = pd.read_csv("merged_file.csv")

gmaps = googlemaps.Client(key=open("google_api_key.txt").read().strip())

# Function to geocode a single address
def geocode_address(address):
    try:
        result = gmaps.geocode(address)
        if result:
            loc = result[0]['geometry']['location']
            return loc['lat'], loc['lng']
    except Exception as e:
        print(f"Error geocoding {address}: {e}")
    return None, None

# Ensure latitude/longitude columns exist
if 'Latitude' not in df_merged.columns:
    df_merged['Latitude'] = None
    df_merged['Longitude'] = None

# Geocode with progress bar
for idx, row in tqdm(df_merged.iterrows(), total=len(df_merged), desc="Geocoding addresses"):
    if pd.isna(row['Latitude']):
        lat, lng = geocode_address(row['Address_person'])
        df_merged.at[idx, 'Latitude'] = lat
        df_merged.at[idx, 'Longitude'] = lng
        time.sleep(0.1)  # short delay to avoid rate limits

# Save results
df_merged.to_csv("merged_geocoded.csv", index=False)



eocoding addresses: 100%|████████████████████████████████████████████████████████████████████████████████████████| 3564/3564 [10:17<00:00,  5.77it/s]

In [16]:
import pandas as pd

# Load the geocoded data
df_geo = pd.read_csv("merged_geocoded.csv")

# Quick look at the first few rows
print(df_geo.head())

# Check columns
print(df_geo.columns)

# Optionally, check for missing coordinates
print(df_geo[['Full address', 'Latitude', 'Longitude']].isna().sum())


                                           ID_person  \
0  8ad106dcb7087ba60910a3b74da6618cf6bb9c39.6d653...   
1  8cfeadcdfeb2e9eac13ef34200cf2928f2dc9e87.376c3...   
2  8dbc045c1178c56a1c31bd9aa019c3939bfc8580.e5551...   
3  8df1fa193f627871ac714c98f5608ff346c578b4.83e8e...   
4  8ef7abdb03d474081514c201cc54318b1a61f4ce.3203a...   

                                          url_person  \
0  https://aleph.occrp.org/entities/8ad106dcb7087...   
1  https://aleph.occrp.org/entities/8cfeadcdfeb2e...   
2  https://aleph.occrp.org/entities/8dbc045c1178c...   
3  https://aleph.occrp.org/entities/8df1fa193f627...   
4  https://aleph.occrp.org/entities/8ef7abdb03d47...   

                       collection_person                 Name_person  E-Mail  \
0  Romania — People wanted by the police  KASHAN SALHLADIN ABDUSALAM     NaN   
1  Romania — People wanted by the police                  RAPAS IOAN     NaN   
2  Romania — People wanted by the police   FRATILA CRISTINEL CIPRIAN     NaN   
3  Rom

In [17]:
df_map = df_geo[['Name_person', 'Full address', 'Latitude', 'Longitude']].copy()

# Drop any remaining rows with missing coordinates
df_map = df_map.dropna(subset=['Latitude', 'Longitude'])

print(df_map.head())
print(f"Total points to plot: {len(df_map)}")


                  Name_person                       Full address   Latitude  \
0  KASHAN SALHLADIN ABDUSALAM                               IRAK  33.223191   
1                  RAPAS IOAN    Cluj-Napoca, jud. CLUJ, ROMANIA  46.771210   
2   FRATILA CRISTINEL CIPRIAN     Focsani, jud. VRANCEA, ROMANIA  45.696475   
3      STANCIU NEDEA PETRISOR   Jilavele, jud. IALOMITA, ROMANIA  44.772525   
4               DRAGOI STEFAN  Aiudul de Sus, jud. ALBA, ROMANIA  46.318713   

   Longitude  
0  43.679291  
1  23.623635  
2  27.184043  
3  26.526677  
4  23.684511  
Total points to plot: 3562


In [18]:
#plot the data

import gmplot
import pandas as pd

# Use the cleaned dataset
# df_map should have columns: 'Name_person', 'Full address', 'Latitude', 'Longitude'
# Ensure no missing coordinates
df_map = df_map.dropna(subset=['Latitude', 'Longitude'])

# Center map on Romania (approximate centroid)
lat_center = df_map['Latitude'].mean()
lng_center = df_map['Longitude'].mean()

# Initialize GoogleMapPlotter with your API key
gmap = gmplot.GoogleMapPlotter(
    lat_center,
    lng_center,
    6,  # zoom level
    apikey=open("google_api_key.txt").read().strip()
)

# Add markers
for idx, row in df_map.iterrows():
    gmap.marker(row['Latitude'], row['Longitude'], title=row['Name_person'])

# Save to HTML
gmap.draw("romania_people_wanted_map.html")
print("Saved interactive map as romania_people_wanted_map.html")


Saved interactive map as romania_people_wanted_map.html
