Address geocoding

In [1]:
import pandas as pd
import requests
import time
from typing import List, Dict, Any, Optional, Tuple
import pandas as pd

def geocode_address(address: str, timeout: float = 10.0) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
    """
    Geocode an address using Nominatim and return specific metadata.
    
    Args:
    address (str): The address string to geocode.
    timeout (float): Seconds to wait for the HTTP request before giving up.
    
    Returns:
    Tuple[Optional[Dict[str, Any]], Optional[str]]: A tuple containing the processed result (or None) and an error message (or None).
    Exactly one of the two elements is None. Request failures, undecodable responses,
    unusable addresses and empty result lists are all reported through the error message
    rather than raised.
    """
    # Missing cells read from a CSV arrive as float NaN rather than str; sending
    # "nan" (or an empty query) to the service would waste a request.
    if not isinstance(address, str) or not address.strip():
        return None, "Empty or invalid address"

    base_url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": address,
        "format": "json",
        "limit": 1,
        "addressdetails": 1,
        "extratags": 1,
    }
    
    try:
        # NOTE(review): "YourAppName" looks like a placeholder User-Agent - confirm
        # it identifies the real application before running large batches.
        response = requests.get(
            base_url,
            params=params,
            headers={"User-Agent": "YourAppName"},
            timeout=timeout,  # without a timeout a stalled connection blocks forever
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions where the
        # decode error is not a RequestException subclass.
        return None, f"Error occurred while fetching data: {str(e)}"
    
    # Only a non-empty list whose first element is a mapping is usable.
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        return None, "No data found for this address"

    result = data[0]
    # `or {}` also covers keys that are present but null.
    extratags = result.get('extratags') or {}
    address_details = result.get('address') or {}
    processed_result = {
        "Search_Address": address,
        "OSM_ID": f"{result.get('osm_type', 'way')} {result.get('osm_id')}",
        "Name": result.get('name', 'No Name'),
        "Type": f"{result.get('type', 'Unknown')}:{result.get('class', 'Unknown')}",
        "Last_Updated": result.get('timestamp', 'Unknown'),
        "Search_Rank": result.get('place_rank', 'Unknown'),
        "Address_Rank": f"{result.get('place_rank', 'Unknown')} ({result.get('type', 'Unknown')})",
        "Importance": result.get('importance', 0),
        # NOTE(review): the request above asks for no polygon output, so this
        # probably always yields 'Point' - confirm against the API response.
        "Coverage": "Polygon" if 'polygon' in result else 'Point',
        "Latitude": result.get('lat', 'Unknown'),
        "Longitude": result.get('lon', 'Unknown'),
        "Height": extratags.get('height', 'Unknown'),
        "Place_Id": result.get('place_id', 'Unknown'),
        # Fall through city -> town -> village; an empty village value must not
        # leak through in place of 'Unknown'.
        "City": (
            address_details.get('city')
            or address_details.get('town')
            or address_details.get('village')
            or 'Unknown'
        ),
        "Street": address_details.get('road', 'Unknown'),
        "Postcode": address_details.get('postcode', 'Unknown'),
        "Housenumber": address_details.get('house_number', 'Unknown'),
        # 'height' already has its own column, so it is left out of the tag summary.
        "Extra_Tags": ", ".join(f"{k} ({v})" for k, v in extratags.items() if k != 'height'),
    }
    return processed_result, None

def geocode_addresses(addresses: List[str], delay_seconds: float = 1.0) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Geocode a list of addresses using Nominatim and return two DataFrames:
    one with successful results and another with failed addresses.
    
    Args:
    addresses (List[str]): A list of address strings to geocode.
    delay_seconds (float): Pause after each lookup, in seconds. Values <= 0 disable the pause.
    
    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: A tuple containing two DataFrames:
        1. DataFrame with successful geocoding results
        2. DataFrame with failed addresses and error messages
           (always has the columns 'Search_Address' and 'Error', even when empty)
    """
    successful_results = []
    failed_results = []

    for address in addresses:
        result, error_message = geocode_address(address)
        if result:
            successful_results.append(result)
        else:
            failed_results.append({"Search_Address": address, "Error": error_message})
        
        # Be nice to the Nominatim server by waiting between requests
        if delay_seconds > 0:
            time.sleep(delay_seconds)

    # Pin the failure columns so an all-success run still writes a CSV with a header
    # instead of a column-less frame.
    failed_df = pd.DataFrame(failed_results, columns=["Search_Address", "Error"])
    return pd.DataFrame(successful_results), failed_df

In [2]:
# Read the 'Full Address' column of the hackathon CSV straight into a plain list.
addresses = pd.read_csv(
    r"C:\Users\Jerem\OneDrive\Desktop\hackathon\addresses.csv"
)['Full Address'].to_list()
addresses

['58-01 Grand Avenue, Queens NY 11378',
 '1870 Pelham Parkway South, Bronx NY 10461',
 '2011 Mott Ave, Far Rockaway NY 11691',
 '215 East 99th Street, New York NY 10029',
 '23-25 31 Street, Queens NY 11105',
 '1680 Ocean Ave, Brooklyn NY 11230',
 '33-70 Prince Street, Queens NY 11354',
 '2021 Pitkin Avenue, Brooklyn NY 11207',
 '88-24 Merrick Blvd, Jamaica NY 11432',
 '90-11 149 str, Jamaica NY 11435',
 '148-25 89 Ave, Jamaica NY 11435',
 '166-05 89 Ave, Jamaica NY 11432',
 '403 Hewes Street, Brooklyn NY 11211',
 '409 Hewes Street, Brooklyn NY 11211',
 '1882 Grand Concourse, Bronx NY 10457',
 '990 Aldus Street, Bronx NY 10459',
 '996 Aldus Street, Bronx NY 10459',
 '1025 Aldus Street, Bronx NY 10459',
 '1032 Aldus Street, Bronx NY 10459',
 '1010 Bryant Avenue, Bronx NY 10459',
 '1016 Bryant Avenue, Bronx NY 10459',
 '57-67 West 175 Street, Bronx NY 10453',
 '1664 Macombs Road, Bronx NY 10453',
 '1674 MAcombs Road, Bronx NY 10453',
 '1325 Grand Concourse, Bronx NY 10452',
 '115 Marcy Pl

In [3]:
# Example usage: geocode every loaded address, persist both outcome tables, then report.
if __name__ == "__main__":
    address_list = addresses
    successful_df, failed_df = geocode_addresses(address_list)

    # Write each outcome table to its own CSV, without the index column.
    for frame, csv_name in (
        (successful_df, 'successful_geocoded_addresses.csv'),
        (failed_df, 'failed_geocoded_addresses.csv'),
    ):
        frame.to_csv(csv_name, index=False)

    print("Geocoding completed.")
    print(f"Successful results saved to 'successful_geocoded_addresses.csv'. Count: {len(successful_df)}")
    print(f"Failed addresses saved to 'failed_geocoded_addresses.csv'. Count: {len(failed_df)}")

    # Dump both tables under their headings.
    for heading, frame in (
        ("\nSuccessful Results:", successful_df),
        ("\nFailed Addresses:", failed_df),
    ):
        print(heading)
        print(frame)

Geocoding completed.
Successful results saved to 'successful_geocoded_addresses.csv'. Count: 26238
Failed addresses saved to 'failed_geocoded_addresses.csv'. Count: 4247

Successful Results:
                                  Search_Address           OSM_ID  \
0            58-01 Grand Avenue, Queens NY 11378    way 280166984   
1      1870 Pelham Parkway South, Bronx NY 10461    way 282171870   
2           2011 Mott Ave, Far Rockaway NY 11691    way 219959090   
3        215 East 99th Street, New York NY 10029     way 46694769   
4              1680 Ocean Ave, Brooklyn NY 11230    way 420339909   
...                                          ...              ...   
26233         80 South Street, New York NY 10038  node 4704311989   
26234        163 Front Street, New York NY 10038    way 278076376   
26235    15 Lisbon Place, Staten Island NY 10306  node 2545347758   
26236          450 Zerega Ave, New York NY 10473    way 281882939   
26237       347-349 Rider Avenue, Bronx NY 10451  

In [None]:
import requests
import time
from typing import List, Dict, Any, Optional
import pandas as pd

def geocode_address(address: str, timeout: float = 10.0) -> Optional[Dict[str, Any]]:
    """
    Geocode an address using Nominatim and return detailed information.
    
    Args:
    address (str): The address string to geocode.
    timeout (float): Seconds to wait for the HTTP request before giving up.
    
    Returns:
    Optional[Dict[str, Any]]: A dictionary containing detailed metadata if found, None otherwise.
    None is also returned for an empty or non-string address and for a response that is
    not a non-empty JSON list.
    
    Raises:
    requests.RequestException: If the request fails, times out, or returns an error status.
    ValueError: If the response body is not valid JSON.
    """
    # Missing cells read from a CSV arrive as float NaN rather than str; do not
    # spend a request on them.
    if not isinstance(address, str) or not address.strip():
        return None

    base_url = "https://nominatim.openstreetmap.org/search"
    params = {
        "q": address,
        "format": "json",
        "limit": 1,
        "addressdetails": 1,
        "extratags": 1,
        "namedetails": 1,
        "polygon_geojson": 1
    }
    
    response = requests.get(
        base_url,
        params=params,
        headers={"User-Agent": "YourAppName"},
        timeout=timeout,  # without a timeout a stalled connection blocks forever
    )
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
    data = response.json()
    
    # An error payload may be a JSON object rather than a list; indexing it with [0]
    # would raise KeyError, so treat anything but a non-empty list of mappings as "not found".
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        return None

    result = data[0]
    # `or {}` also covers keys that are present but null.
    address_details = result.get('address') or {}
    extratags = result.get('extratags') or {}
    geojson = result.get('geojson') or {}

    processed_result = {
        "OSM_ID": f"{result.get('osm_type', 'way')} {result.get('osm_id')}",
        "Name": result.get('name', 'No Name'),
        "Type": f"{result.get('type')}:{result.get('class')}",
        "Last_Updated": result.get('lastupdate', 'Unknown'),
        "Importance": result.get('importance', 0),
        "Coverage": "Polygon" if geojson.get('type') == 'Polygon' else 'Point',
        "Centre_Point": f"{result.get('lat')}, {result.get('lon')}",
        "Place_Id": result.get('place_id'),
        "Computed_Postcode": address_details.get('postcode', 'Unknown'),
        "Address_Tags": ", ".join(f"{k}: {v}" for k, v in address_details.items()),
        "Extra_Tags": ", ".join(f"{k}: {v}" for k, v in extratags.items()),
    }
    
    # Process hierarchical address information
    total_parts = len(address_details)
    address_parts = []
    for position, (key, value) in enumerate(address_details.items()):
        # Look for a returned place whose 'type' equals this address key; with
        # limit=1 that can only ever be the single result itself.
        match = next(
            (item for item in data if isinstance(item, dict) and item.get('type') == key),
            None,
        )
        osm_type = match.get('osm_type', '') if match else ''
        osm_id = match.get('osm_id', '') if match else ''
        address_parts.append({
            "Local_name": value,
            "Type": key,
            "OSM": f"{osm_type} {osm_id}" if osm_type and osm_id else '',
            # First address key gets the highest rank, last key gets rank 1.
            "Address_rank": total_parts - position
        })
    processed_result["Address_Hierarchy"] = address_parts
    
    return processed_result

In [6]:
# Reload the address column and keep only the first two entries for a quick trial run.
addresses = pd.read_csv(
    r"C:\Users\Jerem\OneDrive\Desktop\hackathon\addresses.csv"
)['Full Address'].to_list()[:2]
addresses

['58-01 Grand Avenue, Queens NY 11378',
 '1870 Pelham Parkway South, Bronx NY 10461']

In [7]:
def geocode_addresses(addresses: List[str]) -> List[Dict[str, Any]]:
    """
    Look up every address with Nominatim, one request at a time.
    
    Args:
    addresses (List[str]): The address strings to geocode, in order.
    
    Returns:
    List[Dict[str, Any]]: One entry per input address, in input order: the detail
    dictionary from geocode_address, or {"query": ..., "error": "Not found"} when the
    lookup returned nothing.
    """
    collected = []

    for query in addresses:
        details = geocode_address(query)
        # A falsy lookup result is recorded as a "Not found" placeholder for that query.
        collected.append(details if details else {"query": query, "error": "Not found"})

        # Pause one second after every lookup to stay polite to the Nominatim server.
        time.sleep(1)

    return collected

# Example usage:
if __name__ == "__main__":
    address_list = [
        "58-01 Grand Avenue, Queens, NY 11378",
        "1600 Amphitheatre Parkway, Mountain View, CA",
        "1 Infinite Loop, Cupertino, CA"
    ]

    geocoded_results = geocode_addresses(address_list)

    # Failed lookups come back as {"query", "error"} records without an
    # 'Address_Hierarchy' key. json_normalize raises KeyError when any record lacks
    # the record_path, so separate them before flattening.
    found_results = [entry for entry in geocoded_results if "Address_Hierarchy" in entry]
    failed_results = [entry for entry in geocoded_results if "Address_Hierarchy" not in entry]

    # Convert results to DataFrame: one row per address-hierarchy element, with the
    # place-level fields repeated on every row.
    results_df = pd.json_normalize(found_results, 
                                   record_path=['Address_Hierarchy'], 
                                   meta=['OSM_ID', 'Name', 'Type', 'Last_Updated', 'Importance',
                                         'Coverage', 'Centre_Point', 'Place_Id', 'Computed_Postcode',
                                         'Address_Tags', 'Extra_Tags'],
                                   record_prefix='Hierarchy_')

    # Save results to CSV
    results_df.to_csv('detailed_geocoded_addresses.csv', index=False)

    print("Detailed geocoding completed. Results saved to 'detailed_geocoded_addresses.csv'.")

    # Only report failures when there are some, so a clean run prints exactly what it did before.
    if failed_results:
        failed_queries = ", ".join(str(entry.get("query")) for entry in failed_results)
        print(f"{len(failed_results)} address(es) could not be geocoded: {failed_queries}")

Detailed geocoding completed. Results saved to 'detailed_geocoded_addresses.csv'.


In [None]:
# Collect latitude/longitude for every address and write them to a CSV file.
import csv  # local to this cell: no other visible cell imports csv


def _extract_lat_lon(geocoded):
    """Return (lat, lon) from whatever geocode_address returned, or (None, None).

    Handles both shapes defined in this notebook: the (result, error) tuple whose
    result carries 'Latitude'/'Longitude', and the detail dictionary whose
    'Centre_Point' is formatted as "lat, lon".
    """
    if isinstance(geocoded, tuple):
        # (result, error) form: only the result part can hold coordinates.
        geocoded = geocoded[0] if geocoded else None
    if not isinstance(geocoded, dict):
        return None, None

    if "Latitude" in geocoded or "Longitude" in geocoded:
        lat, lon = geocoded.get("Latitude"), geocoded.get("Longitude")
    else:
        lat, _, lon = str(geocoded.get("Centre_Point", "")).partition(", ")

    # 'Unknown' and 'None' are the fill-in values the geocoders emit for missing coordinates.
    placeholders = (None, "", "Unknown", "None")
    if lat in placeholders or lon in placeholders:
        return None, None
    return lat, lon


# Start from an empty list so re-running the cell never appends to stale rows.
results = []

for address in addresses:
    lat, lon = _extract_lat_lon(geocode_address(address))
    if lat and lon:
        results.append([address, lat, lon])
    else:
        results.append([address, "Not found", "Not found"])
    
    # Be nice to the Nominatim server by waiting between requests
    time.sleep(1)

# Save results to CSV
with open('geocoded_addresses.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Address', 'Latitude', 'Longitude'])
    writer.writerows(results)

print("Geocoding completed. Results saved to 'geocoded_addresses.csv'.")