In [10]:
import geopandas as gpd
import matplotlib.pyplot as plt

In [11]:
# Load the GeoJSON file into the notebook-level GeoDataFrame `gdf`.
# The path is relative, so it is resolved against the kernel's working directory.
gdf = gpd.read_file('data.geojson')

# Basic information about the dataset
# gdf.info() writes its report directly to stdout, so it is called bare
# rather than wrapped in print().
print("Dataset Info:")
gdf.info()



Dataset Info:
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 4 entries, 0 to 3
Columns: 201 entries, OBJECTID to geometry
dtypes: float64(104), geometry(1), int32(96)
memory usage: 4.9 KB


In [None]:
# Column-role lists. None of them is referenced by the code visible in this
# part of the notebook, so their exact use should be confirmed.
# presumably the column(s) that identify each row — TODO confirm
identity_columns = ["ZipCode"]
# presumably the column(s) to discard before analysis — TODO confirm
drop = ["OBJECTID"]
# left empty here; presumably the column(s) of interest to analyse — TODO confirm
target = []

In [9]:
# Summary statistics of numerical columns
# Relies on `gdf` already existing in the kernel: the cell that reads
# 'data.geojson' must have been run before this one.
print("\nSummary Statistics:")
# Bare last expression so the notebook renders the describe() table.
gdf.describe()




Summary Statistics:


Unnamed: 0,OBJECTID,ZipCode,Total_population,Male_Total_Population,Female_Total_Population,Percent_Male_Total_population__,Percent_Female_Total_population,Total_population_AGE_Under_5_ye,Total_population_AGE_5_to_9_yea,Total_population_AGE_10_to_14_y,...,Percent_Female_Total_populat_25,Percent_Female_Total_populat_26,Percent_Female_Total_populat_27,Percent_Female_Total_populat_28,Percent_Female_Total_populat_29,Percent_Female_Total_populat_30,Pop_by_Sq_Mile,SquareMiles,Shape__Area,Shape__Length
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,2.5,85282.5,46585.0,24717.25,21867.75,52.588816,47.411184,1957.75,1770.0,1849.75,...,83.7,76.225,19.225,17.475,13.4,5.525,4389.274718,10.25,287539800.0,103118.55607
std,1.290994,1.290994,20640.118168,12014.679865,8924.902478,3.466592,3.466592,740.94731,665.803274,618.674591,...,4.988654,4.727491,7.397916,6.865069,4.910533,2.229163,1344.618598,2.217356,67796690.0,17899.151212
min,1.0,85281.0,19150.0,10129.0,9021.0,48.512211,43.052621,916.0,931.0,946.0,...,79.4,70.8,9.6,8.7,7.3,3.4,2393.75,8.0,218772800.0,79789.789707
25%,1.75,85281.75,39091.75,19174.0,19917.75,51.130097,46.093442,1753.75,1414.75,1764.25,...,79.925,74.25,15.975,14.175,11.05,3.775,4200.346591,8.75,242186200.0,93812.698427
50%,2.5,85282.5,49283.5,24830.5,24453.0,52.447838,47.552162,2131.5,1848.5,2052.0,...,82.65,75.9,20.1,18.2,13.65,5.35,4942.328283,10.0,279537700.0,107037.343828
75%,3.25,85283.25,56776.75,30373.75,26403.0,53.906558,48.869903,2335.5,2203.75,2137.5,...,86.425,77.875,23.35,21.5,16.0,7.1,5131.25641,11.5,324891200.0,116343.20147
max,4.0,85284.0,68623.0,39079.0,29544.0,56.947379,51.487789,2652.0,2452.0,2349.0,...,90.1,82.3,27.1,24.8,19.0,8.0,5278.692308,13.0,372311000.0,118609.746916


In [24]:
import geopandas as gpd
import pandas as pd
import numpy as np

def analyze_geojson(geojson_path, key_column):
    """Load a GeoJSON file and summarise its numeric attribute columns.

    The file is read with ``gpd.read_file``, indexed by ``key_column``, and the
    ``geometry`` column (if any) is split off so that only attribute columns
    are analysed.

    Args:
        geojson_path: Path passed straight to ``gpd.read_file``.
        key_column: Name of the column used as the index; extreme values are
            reported against its labels.

    Returns:
        A ``(analysis, geometry)`` tuple.

        ``analysis`` is a dict containing:
          * ``'max'``, ``'min'``, ``'mean'``, ``'sum'``, ``'median'``, ``'std'``:
            one Series per statistic, indexed by numeric column name.
          * ``'count'`` and ``'unique_counts'``: Series over *all* remaining
            columns (numeric or not).
          * ``'<col>_highest'`` / ``'<col>_lowest'``: a formatted string naming
            the index label holding the largest / smallest value of each
            numeric column.

        ``geometry`` is the separated ``geometry`` column, or ``None`` when the
        file has no such column.

    Raises:
        ValueError: If ``key_column`` is not a column of the loaded data, or if
            no numeric columns remain after removing the key and geometry.
    """
    # Load the GeoJSON data into a GeoDataFrame
    gdf = gpd.read_file(geojson_path)

    # Print columns for debugging
    print("Columns in the GeoDataFrame:")
    print(gdf.columns)
    print(f"Key column: {key_column}")

    # Check if the key column exists in the GeoDataFrame
    if key_column not in gdf.columns:
        raise ValueError(f"Column '{key_column}' not found in the GeoDataFrame.")

    # Set the key column as the index if it's not already
    if gdf.index.name != key_column:
        gdf = gdf.set_index(key_column)

    # Separate geometry column if it exists
    if 'geometry' in gdf.columns:
        geometry = gdf['geometry']
        gdf = gdf.drop(columns=['geometry'])
    else:
        geometry = None
        print("Warning: No 'geometry' column found in the GeoDataFrame.")

    # Identify numeric columns
    numeric_columns = gdf.select_dtypes(include=[np.number]).columns.tolist()

    if not numeric_columns:
        raise ValueError("No numeric columns found in the GeoDataFrame.")

    # Restrict to the numeric columns *before* reducing. Reducing the whole
    # frame first would try to average/sum text columns, which raises
    # TypeError on current pandas and is wasted work otherwise.
    numeric = gdf[numeric_columns]

    # Perform basic analysis
    analysis = {}

    # Statistics for numeric columns
    for stat in ['max', 'min', 'mean', 'sum', 'median', 'std']:
        analysis[stat] = getattr(numeric, stat)()

    # Count of all columns
    analysis['count'] = gdf.count()

    # Additional analyses
    analysis['unique_counts'] = gdf.nunique()

    # Find the key (e.g., ZipCode) with highest and lowest values for each numeric column
    for col in numeric_columns:
        values = numeric[col]
        max_key = values.idxmax()
        min_key = values.idxmin()
        analysis[f'{col}_highest'] = f"{key_column}: {max_key} value: ({values.max()})"
        analysis[f'{col}_lowest'] = f"{key_column}: {min_key} value: ({values.min()})"

    return analysis, geometry

def print_analysis(analysis_results, key_column):
    """Print every entry of an analysis dict, one titled section per entry.

    Args:
        analysis_results: Mapping of label -> result. ``pd.Series`` results are
            printed in full via ``to_string()`` under a capitalised heading
            that mentions ``key_column``; any other value is printed as-is
            under its plain label.
        key_column: Name of the key column, used only in Series headings.
    """
    for label, value in analysis_results.items():
        is_series = isinstance(value, pd.Series)
        heading = (
            f"{label.capitalize()} values based on '{key_column}':"
            if is_series
            else f"{label}:"
        )
        body = value.to_string() if is_series else value
        print(heading)
        print(body)
        # Blank separator between sections (newline plus print's own newline).
        print("\n")

def main():
    """Run the analysis on ``data.geojson`` keyed by ``ZipCode`` and print it.

    Calls ``analyze_geojson`` and ``print_analysis``, then reports whether a
    geometry column was returned. Any exception raised along the way is caught
    and printed as a one-line message instead of propagating.
    """
    source_file = "data.geojson"
    index_field = 'ZipCode'

    try:
        results, shapes = analyze_geojson(source_file, index_field)
        print_analysis(results, index_field)

        # Optional: the geometry can be used for further spatial analysis if needed
        if shapes is None:
            print("No geometry column found in the GeoDataFrame.")
        else:
            print(f"Geometry column is available for spatial analysis. Shape: {shapes.shape}")

    except Exception as err:
        # Best-effort entry point: report the failure rather than raise.
        print(f"An error occurred: {err!s}")

# Invoke main() only when this code runs as the top-level program
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Columns in the GeoDataFrame:
Index(['OBJECTID', 'ZipCode', 'Total_population', 'Male_Total_Population',
       'Female_Total_Population', 'Percent_Male_Total_population__',
       'Percent_Female_Total_population', 'Total_population_AGE_Under_5_ye',
       'Total_population_AGE_5_to_9_yea', 'Total_population_AGE_10_to_14_y',
       ...
       'Percent_Female_Total_populat_26', 'Percent_Female_Total_populat_27',
       'Percent_Female_Total_populat_28', 'Percent_Female_Total_populat_29',
       'Percent_Female_Total_populat_30', 'Pop_by_Sq_Mile', 'SquareMiles',
       'Shape__Area', 'Shape__Length', 'geometry'],
      dtype='object', length=201)
Key column: ZipCode
Max values based on 'ZipCode':
OBJECTID                           4.000000e+00
Total_population                   6.862300e+04
Male_Total_Population              3.907900e+04
Female_Total_Population            2.954400e+04
Percent_Male_Total_population__    5.694738e+01
Percent_Female_Total_population    5.148779e+01
Total_po