In [None]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install pandas pygeocodio


In [None]:
import json
import logging
import os

import pandas as pd
from geocodio import GeocodioClient, GeocodioError

In [None]:
# Route geocoding progress and error messages to a log file
LOG_FILE = 'geocoding_log.log'
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO, filename=LOG_FILE)

In [None]:
# Load the input addresses from a CSV file
input_csv = "input_addresses.csv"  # Replace with your actual CSV file path
# Read ZIP codes as text: numeric parsing would strip leading zeros ("02134" -> 2134).
df = pd.read_csv(input_csv, dtype={'zip': str})
df.head()

In [None]:
# Fail fast (and log) on the first required column that the input CSV lacks
required_columns = ['FID', 'ID', 'address', 'city', 'state', 'zip']
first_missing = next(
    (name for name in required_columns if name not in df.columns),
    None,
)
if first_missing is not None:
    logging.error(f"Missing column: {first_missing}")
    raise ValueError(f"Input CSV must contain the '{first_missing}' column")

print("All required columns are present.")

In [None]:
# Combining address, city, state, and zip into a full address string
def _clean_part(value):
    """Return one address component as a trimmed string, or '' when it is missing."""
    if pd.isna(value):
        return ''
    return str(value).strip()


def _format_zip(value):
    """Render a ZIP/postal code as text, undoing numeric parsing artifacts.

    Floats such as 2134.0 lose the trailing ".0", and all-digit codes shorter
    than five characters are left-padded with zeros (assumes US ZIP codes --
    non-numeric postal codes are passed through unchanged).
    """
    if pd.isna(value):
        return ''
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    text = str(value).strip()
    if text.isdigit() and len(text) < 5:
        text = text.zfill(5)
    return text


def _compose_full_address(street, city, state, postal_code):
    """Build 'street, city, state zip', skipping any component that is missing."""
    state_zip = ' '.join(
        part for part in (_clean_part(state), _format_zip(postal_code)) if part
    )
    return ', '.join(
        part for part in (_clean_part(street), _clean_part(city), state_zip) if part
    )


# Built row by row so that one missing component no longer turns the whole
# address into NaN (string concatenation with NaN propagates NaN).
df['full_address'] = [
    _compose_full_address(street, city, state, postal_code)
    for street, city, state, postal_code in zip(
        df['address'], df['city'], df['state'], df['zip']
    )
]
df.head()

In [None]:
# Initializing the Geocodio client with API key
# The key is read from the GEOCODIO_API_KEY environment variable so the secret
# is never stored in (or committed with) the notebook.
api_key = os.environ.get("GEOCODIO_API_KEY", "").strip()
if not api_key:
    logging.error("GEOCODIO_API_KEY environment variable is not set")
    raise ValueError(
        "Geocodio API key not found: set the GEOCODIO_API_KEY environment variable"
    )
client = GeocodioClient(api_key)

In [None]:
# Function to perform batch geocoding with error handling and logging
def batch_geocode(addresses, start_index=0, batch_size=10000):
    """Geocode ``addresses`` in consecutive batches using the module-level ``client``.

    Args:
        addresses: Sequence of addresses; it is sliced, so it must support slicing.
        start_index: Position of the first address to send. Addresses before it
            are skipped, so the returned list is offset by this amount relative
            to ``addresses``.
        batch_size: Maximum number of addresses sent per request.

    Returns:
        A flat list holding the per-address entries of every batch, in order.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``start_index`` is negative.
        GeocodioError: Re-raised after logging when a batch request fails.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if start_index < 0:
        raise ValueError("start_index must not be negative")

    geocode_results = []
    for i in range(start_index, len(addresses), batch_size):
        batch = addresses[i:i + batch_size]
        batch_number = i // batch_size + 1
        try:
            logging.info(f"Processing batch {batch_number} (records {i + 1} to {i + len(batch)})")
            # Perform geocoding with census fields
            geocode_batch_results = client.geocode(batch, fields=["census2000", "census2010", "census2020"])
            logging.info(f"Successfully geocoded batch {batch_number} (records {i + 1} to {i + len(batch)})")
        except GeocodioError as e:
            logging.error(f"Error during geocoding batch {batch_number} (records {i + 1} to {i + len(batch)}): {str(e)}")
            # Everything before index i succeeded; record where a retry can resume.
            logging.error(f"Geocoding stopped; resume with start_index={i}")
            raise

        # A batch call may hand back a list-like collection (one entry per
        # address) or a raw payload dict wrapping that list under 'results';
        # indexing a list with 'results' would raise TypeError, so accept both.
        if isinstance(geocode_batch_results, dict):
            batch_entries = geocode_batch_results.get('results', [])
        else:
            batch_entries = list(geocode_batch_results)

        if len(batch_entries) != len(batch):
            # Downstream code pairs results with input rows by position.
            logging.warning(
                f"Batch {batch_number} returned {len(batch_entries)} results for {len(batch)} addresses"
            )
        geocode_results.extend(batch_entries)
    return geocode_results

In [None]:
# Collect the combined address strings as a plain Python list
addresses = df.loc[:, 'full_address'].tolist()

# Send every address through the batch geocoder
geocode_results = batch_geocode(addresses=addresses)

In [None]:
# Creating a custom result structure that includes original fields and geocoded data
def _to_native(value):
    """Convert a DataFrame cell to a JSON-friendly Python value.

    Missing values become None and NumPy scalars (e.g. int64) are unwrapped,
    because the json module cannot serialize NumPy integer types.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value.item() if hasattr(value, 'item') else value


def _best_match(result):
    """Return the top candidate dict for one geocoded input, or {} if there is none.

    Accepts an already-flat candidate (has 'formatted_address'/'location'), an
    entry carrying a 'results' candidate list, or an entry that nests that list
    under 'response'.
    """
    if not isinstance(result, dict):
        return {}
    if 'formatted_address' in result or 'location' in result:
        return result
    response = result.get('response', result)
    candidates = response.get('results') if isinstance(response, dict) else None
    return candidates[0] if candidates else {}


def _census_for(fields, year):
    """Look up census data for ``year`` in a candidate's appended fields.

    Checks the flat 'census<year>' key first, then fields['census'][year].
    NOTE(review): the nested layout is what the Geocodio docs appear to
    describe for census appends -- confirm against a live response.
    """
    flat = fields.get(f'census{year}')
    if flat:
        return flat
    nested = fields.get('census')
    if isinstance(nested, dict):
        return nested.get(year, {})
    return {}


# Results are matched to input rows by position, so a length mismatch would
# silently attach geocodes to the wrong records.
if len(geocode_results) != len(df):
    logging.error(
        f"Result count {len(geocode_results)} does not match input row count {len(df)}"
    )
    raise ValueError("Number of geocoding results does not match number of input rows")

processed_results = []
for j, result in enumerate(geocode_results):
    source_row = df.iloc[j]
    match = _best_match(result)
    if not match:
        logging.warning(f"No geocoding match for record {j + 1}")
    location = match.get('location') or {}
    fields = match.get('fields') or {}
    record = {
        'FID': _to_native(source_row['FID']),
        'ID': _to_native(source_row['ID']),
        'address': _to_native(source_row['address']),
        'city': _to_native(source_row['city']),
        'state': _to_native(source_row['state']),
        'zip': _to_native(source_row['zip']),
        'geocoded_address': match.get('formatted_address'),
        'latitude': location.get('lat'),
        'longitude': location.get('lng'),
        'accuracy': match.get('accuracy'),
        'census2000': _census_for(fields, '2000'),
        'census2010': _census_for(fields, '2010'),
        'census2020': _census_for(fields, '2020')
    }
    processed_results.append(record)

In [None]:
# Saving the geocoded results to a JSON file
def _json_default(value):
    """Fallback for json.dump: unwrap NumPy scalars (e.g. int64) into Python values.

    Values read out of DataFrame rows can be NumPy scalars, which the json
    module rejects with TypeError unless converted.
    """
    if hasattr(value, 'item'):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


output_json = "output_geocoded_results.json"
with open(output_json, 'w', encoding='utf-8') as json_file:
    json.dump(processed_results, json_file, indent=4, default=_json_default)

print(f"Geocoding complete. Results saved to {output_json}")

In [None]:
# Preview the first few processed records
PREVIEW_COUNT = 5
processed_results[:PREVIEW_COUNT]
