In [None]:
import os
import json
import glob

# set the main folder path
main_folder = "/Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/twland" 

# merge all the data into one list
all_data = []

# loop through the main folder and its subfolders
for root, dirs, files in os.walk(main_folder):
    # find all the json files
    json_files = glob.glob(os.path.join(root, "*.json"))
    
    # process each json file
    for json_file in json_files:
        print(f"processing: {json_file}")
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                
                # if the data is a list, extend all_data
                if isinstance(data, list):
                    all_data.extend(data)
                # if the data is a dictionary, add to all_data
                elif isinstance(data, dict):
                    all_data.append(data)
        except Exception as e:
            print(f"processing file {json_file} error: {str(e)}")

# output the number of merged data
print(f"total merged data: {len(all_data)}")

# write the merged data to a new file
output_file = "data/combined_data.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=2)

print(f"saved the merged data to {output_file}")

processing: /Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/twland/連江縣/連江縣南竿鄉清水段1058.json
processing: /Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/twland/連江縣/連江縣南竿鄉清水段1363.json
processing: /Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/twland/連江縣/連江縣南竿鄉清水段483.json
processing: /Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/twland/新竹縣/新竹縣新豐鄉忠信段11.json
processing: /Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/twland/新竹縣/新竹縣新豐鄉忠信段0018-0000.json
processing: /Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications w

In [5]:
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# read the json file
with open("/Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/combined_data.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

logging.info(f"original data contains {len(data)} collections")

# count the data structure
collection_count = 0
feature_count = 0
no_features_collections = 0
empty_features_collections = 0
collection_types = {}

# analyze the data structure
for i, collection in enumerate(data):
    collection_count += 1
    collection_type = type(collection).__name__
    collection_types[collection_type] = collection_types.get(collection_type, 0) + 1
    
    if isinstance(collection, dict):
        if 'features' not in collection:
            no_features_collections += 1
            logging.warning(f"collection {i} does not contain 'features' field")
        elif len(collection['features']) == 0:
            empty_features_collections += 1
            logging.warning(f"collection {i} 'features' field is an empty list")
        else:
            feature_count += len(collection['features'])
            logging.info(f"collection {i} contains {len(collection['features'])} features")
    elif isinstance(collection, list):
        # if the collection itself is a list
        feature_count += len(collection)
        logging.info(f"collection {i} is a list, contains {len(collection)} items")

# output the analysis results
logging.info(f"total collections: {collection_count}")
logging.info(f"total features: {feature_count}")
logging.info(f"collections without 'features' field: {no_features_collections}")
logging.info(f"collections with 'features' field but empty: {empty_features_collections}")
logging.info(f"collection types: {collection_types}")

# create an empty list to store data
rows = []

# process each collection
for i, collection in enumerate(data):
    # check the collection type
    if isinstance(collection, dict) and 'features' in collection:
        # this is the standard GeoJSON format
        for j, feature in enumerate(collection['features']):
            # 創建基本資料結構
            row = {
                'collection_index': i,
                'feature_index': j,
                'collection_type': 'FeatureCollection'
            }
            
            # add properties
            if 'properties' in feature:
                for key, value in feature['properties'].items():
                    row[key] = value
            
            # add geometry information
            if 'geometry' in feature:
                geometry = feature['geometry']
                row['geometry_type'] = geometry.get('type', None)
                
                # save coordinates
                if 'coordinates' in geometry:
                    row['coordinates'] = json.dumps(geometry['coordinates'])
                    
                    # extract the first coordinate point
                    if geometry.get('type') == 'MultiPolygon' and len(geometry['coordinates']) > 0:
                        try:
                            first_point = geometry['coordinates'][0][0][0]
                            row['x_coord'] = first_point[0]
                            row['y_coord'] = first_point[1]
                        except Exception as e:
                            logging.warning(f"error extracting coordinate points: {str(e)}")
            
            rows.append(row)
    else:
        # non-standard format, save as a row of data
        row = {
            'collection_index': i,
            'collection_type': type(collection).__name__,
            'raw_data': json.dumps(collection)[:1000]  
        }
        rows.append(row)

# create a DataFrame
solar_panel_polygon = pd.DataFrame(rows)

# output the statistics information
print(f"total parsed {len(solar_panel_polygon)} rows of data")
print(f"DataFrame contains {len(solar_panel_polygon.columns)} columns")

# output the column statistics
column_counts = solar_panel_polygon.count()
print("not empty values in each column:")
print(column_counts)

2025-04-13 10:36:37,473 - INFO - original data contains 8543 collections
2025-04-13 10:36:37,473 - INFO - collection 0 contains 1 features
2025-04-13 10:36:37,473 - INFO - collection 1 contains 1 features
2025-04-13 10:36:37,473 - INFO - collection 2 contains 1 features
2025-04-13 10:36:37,474 - INFO - collection 3 contains 1 features
2025-04-13 10:36:37,474 - INFO - collection 4 contains 1 features
2025-04-13 10:36:37,474 - INFO - collection 5 contains 1 features
2025-04-13 10:36:37,474 - INFO - collection 6 contains 1 features
2025-04-13 10:36:37,474 - INFO - collection 7 contains 1 features
2025-04-13 10:36:37,475 - INFO - collection 8 contains 1 features
2025-04-13 10:36:37,475 - INFO - collection 9 contains 1 features
2025-04-13 10:36:37,475 - INFO - collection 10 contains 1 features
2025-04-13 10:36:37,475 - INFO - collection 11 contains 1 features
2025-04-13 10:36:37,476 - INFO - collection 12 contains 1 features
2025-04-13 10:36:37,476 - INFO - collection 13 contains 1 features

total parsed 4962 rows of data
DataFrame contains 24 columns
not empty values in each column:
collection_index    4962
feature_index       4962
collection_type     4962
縣市                  4962
鄉鎮                  4962
地段                  4962
段號                  4962
地號                  4962
id                  4962
ymax                4962
ymin                4962
xmax                4962
xmin                4962
xcenter             4962
ycenter             4962
area_id             4962
section_id          4962
land_id             4962
query_log           4962
query               4962
geometry_type       4962
coordinates         4962
x_coord             4962
y_coord             4962
dtype: int64


In [6]:
solar_panel_polygon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4962 entries, 0 to 4961
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   collection_index  4962 non-null   int64  
 1   feature_index     4962 non-null   int64  
 2   collection_type   4962 non-null   object 
 3   縣市                4962 non-null   object 
 4   鄉鎮                4962 non-null   object 
 5   地段                4962 non-null   object 
 6   段號                4962 non-null   int64  
 7   地號                4962 non-null   int64  
 8   id                4962 non-null   object 
 9   ymax              4962 non-null   float64
 10  ymin              4962 non-null   float64
 11  xmax              4962 non-null   float64
 12  xmin              4962 non-null   float64
 13  xcenter           4962 non-null   float64
 14  ycenter           4962 non-null   float64
 15  area_id           4962 non-null   object 
 16  section_id        4962 non-null   object 


In [7]:
column_mapping = {
    '縣市': 'county_city',
    '鄉鎮': 'township',
    '地段': 'land_section',
    '段號': 'section_number',
    '地號': 'land_number',
    'id': 'id',
    'ymax': 'ymax',
    'ymin': 'ymin',
    'xmax': 'xmax',
    'xmin': 'xmin',
    'xcenter': 'xcenter',
    'ycenter': 'ycenter',
    'area_id': 'area_id',
    'section_id': 'section_id',
    'land_id': 'land_id',
    'query_log': 'query_log',
    'query': 'query',
    'collection_index': 'collection_index',
    'feature_index': 'feature_index',
    'collection_type': 'collection_type',
    'geometry_type': 'geometry_type',
    'coordinates': 'coordinates',
    'x_coord': 'x_coord',
    'y_coord': 'y_coord'
}

solar_panel_polygon = solar_panel_polygon.rename(columns=column_mapping)

solar_panel_polygon

Unnamed: 0,collection_index,feature_index,collection_type,county_city,township,land_section,section_number,land_number,id,ymax,...,ycenter,area_id,section_id,land_id,query_log,query,geometry_type,coordinates,x_coord,y_coord
0,0,0,FeatureCollection,連江縣,南竿鄉,清水段,10580000,10580000,ZA0004,26.155779,...,26.155204,ZA,0004,10580000,[],連江縣南竿鄉清水段1058號,MultiPolygon,"[[[[119.93394180035, 26.155184155318], [119.93...",119.933942,26.155184
1,1,0,FeatureCollection,連江縣,南竿鄉,清水段,13630000,13630000,ZA0004,26.158588,...,26.156859,ZA,0004,13630000,[],連江縣南竿鄉清水段1363號,MultiPolygon,"[[[[119.93465484494, 26.156188787317], [119.93...",119.934655,26.156189
2,2,0,FeatureCollection,連江縣,南竿鄉,清水段,4830000,4830000,ZA0004,26.156571,...,26.155837,ZA,0004,4830000,[],連江縣南竿鄉清水段483號,MultiPolygon,"[[[[119.93461107241, 26.155779060379], [119.93...",119.934611,26.155779
3,3,0,FeatureCollection,新竹縣,新豐鄉,忠信段,110000,110000,JD0423,24.876591,...,24.876207,JD,0423,110000,[],新竹縣新豐鄉忠信段11號,MultiPolygon,"[[[[121.00331397731, 24.875823720232], [121.00...",121.003314,24.875824
4,4,0,FeatureCollection,新竹縣,新豐鄉,忠信段,180000,180000,JD0423,24.878840,...,24.877716,JD,0423,180000,[],新竹縣新豐鄉忠信段0018-0000號,MultiPolygon,"[[[[121.00199753238, 24.878166346873], [121.00...",121.001998,24.878166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4957,8535,0,FeatureCollection,嘉義縣,義竹鄉,龍蛟潭段,6490015,6490015,QB0320,23.372604,...,23.372485,QB,0320,6490015,[],嘉義縣義竹鄉龍蛟潭段649-15號,MultiPolygon,"[[[[120.206637838, 23.372439240895], [120.2069...",120.206638,23.372439
4958,8536,0,FeatureCollection,嘉義縣,義竹鄉,安溪寮段,3350000,3350000,QB0323,23.350283,...,23.350108,QB,0323,3350000,[],嘉義縣義竹鄉安溪寮段335號,MultiPolygon,"[[[[120.20319931114, 23.350276611205], [120.20...",120.203199,23.350277
4959,8538,0,FeatureCollection,嘉義縣,義竹鄉,龍蛟潭段,6600004,6600004,QB0320,23.373531,...,23.373230,QB,0320,6600004,[],嘉義縣義竹鄉龍蛟潭段660-4號,MultiPolygon,"[[[[120.20223165085, 23.372929093429], [120.20...",120.202232,23.372929
4960,8541,0,FeatureCollection,嘉義縣,義竹鄉,西後寮段,3160004,3160004,QB0343,23.377132,...,23.377011,QB,0343,3160004,[],嘉義縣義竹鄉西後寮段316-4號,MultiPolygon,"[[[[120.22093879513, 23.377106345196], [120.22...",120.220939,23.377106


In [9]:
def df_to_geojson(df, output_path):
    features = []
    
    # Only process MultiPolygon type data
    valid_rows = df[df['geometry_type'] == 'MultiPolygon']
    
    for idx, row in valid_rows.iterrows():
        try:
            coords = json.loads(row['coordinates'])
            
            # Create properties dictionary, excluding coordinates and geometry type fields
            properties = {}
            for col in df.columns:
                if col not in ['coordinates', 'geometry_type']:
                    # Convert according to data type
                    val = row[col]
                    if pd.notnull(val):  # Exclude NaN values
                        if pd.api.types.is_numeric_dtype(df[col].dtype):
                            properties[col] = float(val) if 'float' in str(df[col].dtype) else int(val)
                        else:
                            properties[col] = str(val)
            
            # Create GeoJSON feature
            features.append({
                "type": "Feature",
                "geometry": {
                    "type": "MultiPolygon",
                    "coordinates": coords
                },
                "properties": properties
            })
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
    
    # Create complete GeoJSON
    geojson = {
        "type": "FeatureCollection",
        "features": features
    }
    
    # Save file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(geojson, f, ensure_ascii=False, indent=2)
    
    print(f"Successfully converted {len(features)} polygons and saved to {output_path}")
    return geojson

# Specify output file path
output_path = "data/solar_panel_polygons.geojson"

# Convert and save
geojson_data = df_to_geojson(solar_panel_polygon, output_path)

# Display data summary
print(f"GeoJSON file size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")
print(f"Number of polygon features: {len(geojson_data['features'])}")

  if pd.notnull(val):  # Exclude NaN values


Successfully converted 4962 polygons and saved to data/solar_panel_polygons.geojson
GeoJSON file size: 10.27 MB
Number of polygon features: 4962


In [17]:
import geopandas as gpd

# 1. use big5 encoding to read shapefile
pop = gpd.read_file(
    "/Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/tainan_pop/tainan_pop.SHP", 
    encoding='big5'
)

pop.set_crs(epsg=3826, inplace=True)
print(pop.crs)

pop = pop.to_crs('EPSG:4326')
print(pop.crs)

pop['pop_density'] = pop['P_CNT'] / (pop['AREA']/1000000)

output_path = "/Users/tsernian/Documents/CASA/CASA0025_Building Spatial Applications with Big Data/CASA00025_Building-Spatial-Applications-solar-panel/data/tainan_pop_wgs84.geojson"
pop.to_file(output_path, driver='GeoJSON')



EPSG:3826
EPSG:4326
