In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon, box
from shapely.ops import unary_union
import matplotlib.pyplot as plt
from pyproj import Transformer
import json
import re
from unidecode import unidecode

In [26]:
wi = pd.read_csv('../data/scaled_PCA_weighted.csv')

In [28]:
wi = wi[wi['year']==2019]

In [21]:
ward_code = pd.read_csv('./large_files/ward_code.csv')

def remove_administrative_terms(s):
    administrative_terms = ['\\(Xã\\)','\\(Thịtrấn\\)','Quận', 'Thành Phố', 'Huyện', 'Thị Xã', 'Tỉnh', 'Phường', 'Xã', 'Thị Trấn']
    for term in administrative_terms:
        s = re.sub(f'(?i){term}', '', s, flags=re.UNICODE)
    return s.strip()

def merge_words(s):
    return s.replace(' ','')

dfs = []
for year in ['2016','2019']:
    relevant_year = [col for col in ward_code.columns if col.endswith(year) and (col.startswith('ward') or col.startswith('distname') or col.startswith('provname'))]
    df = ward_code[relevant_year].copy()
    df.loc[:,'year'] = year
    if year == '2019':
        df.columns = ['provname','distname','id','wardname','year']
    else:
        df.columns = ['provname','distname','id','wardname','change','year']    
    dfs.append(df)

ward_code = pd.concat(dfs, ignore_index=True,axis=0)
ward_code.drop_duplicates(inplace=True)
ward_code.dropna(subset=['distname','provname','wardname'],inplace=True)

# remove admin terms, accents and spaces
ward_code['distname'] = ward_code['distname'].apply(remove_administrative_terms).apply(unidecode).apply(merge_words)
ward_code['provname'] = ward_code['provname'].apply(remove_administrative_terms).apply(unidecode).apply(merge_words)
ward_code['wardname'] = ward_code['wardname'].apply(remove_administrative_terms).apply(unidecode).apply(merge_words)

ward_code.drop_duplicates(subset=['provname', 'distname', 'wardname', 'year'], inplace=True)
ward_code['year'] = ward_code['year'].astype(int)
ward_code.drop(columns=['change'], inplace=True)

ward_code = ward_code[ward_code['year'] == 2019]

  ward_code = pd.read_csv('./large_files/ward_code.csv')


In [37]:
wi = wi.merge(ward_code, right_on=['id'], left_on=['maxa'])

In [34]:
import geopandas as gpd
import json
import pandas as pd
from shapely.geometry import shape, Polygon, MultiPolygon
from shapely.ops import unary_union
from pyproj import Transformer

# Load the data from a JSON file
with open('large_files/gadm41_VNM_3.json', encoding='utf-8') as file:
    gadm = json.load(file)

# Normalize the JSON data into a pandas DataFrame
gps = pd.json_normalize(gadm['features'])

# Function to convert nested lists to MultiPolygon
def convert_to_multipolygon(coords, geom_type):
    if geom_type == "MultiPolygon":
        return MultiPolygon([Polygon(poly[0]) for poly in coords])
    elif geom_type == "Polygon":
        return Polygon(coords[0])

# Convert the 'geometry.coordinates' column to geometry using the 'geometry.type'
gps['geometry'] = gps.apply(lambda row: convert_to_multipolygon(row['geometry.coordinates'], row['geometry.type']), axis=1)

# Create a GeoDataFrame
gdf = gpd.GeoDataFrame(gps, geometry='geometry')
gdf.crs = "EPSG:4326"  # Set the coordinate reference system

# Initialize the transformer to convert from EPSG:4326 to WGS 84
transformer = Transformer.from_crs("EPSG:4326", "EPSG:4326", always_xy=True)

# Calculate centroids and extract info
results = []
for idx, row in gdf.iterrows():
    # Calculate the centroid
    centroid = row['geometry'].centroid
    lon, lat = transformer.transform(centroid.x, centroid.y)

    # Append the information to the results list
    results.append({
        'province': row['properties.NAME_1'],
        'district': row['properties.NAME_2'],
        'ward': row['properties.NAME_3'],
        'centroid_longitude': lon,
        'centroid_latitude': lat
    })

# Convert results to DataFrame
centroids_df = pd.DataFrame(results)

  province district       ward  centroid_longitude  centroid_latitude
0  AnGiang    AnPhú      AnPhú          105.092356          10.794695
1  AnGiang    AnPhú    ĐaPhước          105.115281          10.745450
2  AnGiang    AnPhú    KhánhAn          105.106444          10.944850
3  AnGiang    AnPhú  KhánhBình          105.069513          10.926929
4  AnGiang    AnPhú   LongBình          105.090380          10.948529


In [36]:
centroids_df['province'] = centroids_df['province'].apply(remove_administrative_terms).apply(unidecode)
centroids_df['district'] = centroids_df['district'].apply(remove_administrative_terms).apply(unidecode)
centroids_df['ward'] = centroids_df['ward'].apply(remove_administrative_terms).apply(unidecode)

In [38]:
wi

Unnamed: 0,matinh,mahuyen,maxa,year_x,mean_pos_WI,provname,distname,id,wardname,year_y
0,1,1,1,2019,2.934070,HaNoi,BaDinh,1.0,PhucXa,2019
1,1,1,4,2019,2.963528,HaNoi,BaDinh,4.0,TrucBach,2019
2,1,1,6,2019,3.031043,HaNoi,BaDinh,6.0,VinhPhuc,2019
3,1,1,7,2019,3.036249,HaNoi,BaDinh,7.0,CongVi,2019
4,1,1,8,2019,3.025272,HaNoi,BaDinh,8.0,LieuGiai,2019
...,...,...,...,...,...,...,...,...,...,...
11097,96,973,32239,2019,1.746483,CaMau,NgocHien,32239.0,VienAnDong,2019
11098,96,973,32242,2019,2.173314,CaMau,NgocHien,32242.0,VienAn,2019
11099,96,973,32244,2019,2.145584,CaMau,NgocHien,32244.0,RachGoc,2019
11100,96,973,32245,2019,1.728974,CaMau,NgocHien,32245.0,TanAn,2019


In [40]:
final_df = wi.merge(centroids_df, how='left', left_on=['provname','distname','wardname'],right_on=['province', 'district','ward'])

In [41]:
cols = ['provname']
final_df

Unnamed: 0,matinh,mahuyen,maxa,year_x,mean_pos_WI,provname,distname,id,wardname,year_y,province,district,ward,centroid_longitude,centroid_latitude
0,1,1,1,2019,2.934070,HaNoi,BaDinh,1.0,PhucXa,2019,HaNoi,BaDinh,PhucXa,105.849320,21.046953
1,1,1,4,2019,2.963528,HaNoi,BaDinh,4.0,TrucBach,2019,HaNoi,BaDinh,TrucBach,105.841240,21.045697
2,1,1,6,2019,3.031043,HaNoi,BaDinh,6.0,VinhPhuc,2019,HaNoi,BaDinh,VinhPhuc,105.809641,21.042191
3,1,1,7,2019,3.036249,HaNoi,BaDinh,7.0,CongVi,2019,HaNoi,BaDinh,CongVi,105.810314,21.035237
4,1,1,8,2019,3.025272,HaNoi,BaDinh,8.0,LieuGiai,2019,HaNoi,BaDinh,LieuGiai,105.817236,21.038225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11149,96,973,32239,2019,1.746483,CaMau,NgocHien,32239.0,VienAnDong,2019,CaMau,NgocHien,VienAnDong,104.940556,8.645703
11150,96,973,32242,2019,2.173314,CaMau,NgocHien,32242.0,VienAn,2019,CaMau,NgocHien,VienAn,104.846790,8.654407
11151,96,973,32244,2019,2.145584,CaMau,NgocHien,32244.0,RachGoc,2019,CaMau,NgocHien,RachGoc,105.008386,8.637345
11152,96,973,32245,2019,1.728974,CaMau,NgocHien,32245.0,TanAn,2019,CaMau,NgocHien,TanAn,105.086099,8.654519
