In [101]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point
from scipy.spatial import cKDTree
from pyproj import Proj, transform
from pyproj import Transformer
from tqdm import tqdm

def transform_5179_to_4326(x, y):
    # Transformer 객체 생성
    transformer = Transformer.from_crs("epsg:5179", "epsg:4326", always_xy=True)
    # 좌표 변환
    lon, lat = transformer.transform(x, y)
    return lat, lon

def transform_2097_to_4326(x, y):
    # Transformer 객체 생성
    transformer = Transformer.from_crs("epsg:2097", "epsg:4326", always_xy=True)
    # 좌표 변환
    lon, lat = transformer.transform(x, y)
    return lat, lon

def transform_2097_to_5179(x, y):
    # Transformer 객체 생성
    transformer = Transformer.from_crs("epsg:2097", "epsg:5179", always_xy=True)
    # 좌표 변환
    lon, lat = transformer.transform(x, y)
    return lat, lon

def transform_3857_to_4326(x, y):
    # Transformer 객체 생성
    transformer = Transformer.from_crs("epsg:3857", "epsg:4326", always_xy=True)
    # 좌표 변환
    lon, lat = transformer.transform(x, y)
    return lat, lon



# 1. 격자 구조 인구밀도 데이터 -> 중심점 계산
def calculate_centroids(grid_df):
    grid_df['centroid'] = grid_df.geometry.centroid
    return grid_df

# 2. 좌표 체계 맞추기
def align_crs(grid_df, shops_gdf):
    # 격자 데이터 CRS 설정 (EPSG:5181 사용)
    if grid_df.crs is None:
        grid_df = grid_df.set_crs(epsg=5181, allow_override=True)
    
    # 상점 데이터 CRS 설정 (EPSG:4326 사용)
    if shops_gdf.crs is None:
        shops_gdf = shops_gdf.set_crs(epsg=5181, allow_override=True)
    
    # 상점 데이터를 격자 데이터의 CRS로 변환
    shops_gdf = shops_gdf.to_crs(grid_df.crs)
    
    return grid_df, shops_gdf


# 3. 격자 중심점과 상점 위치 간의 거리 계산
def calculate_accessibility(grid_df, shops_gdf, buffer_distance):
    grid_df['accessibility'] = 0
    
    # grid_df 중심점과 shops_gdf의 좌표를 numpy 배열로 변환
    centroids = np.array([[point.x, point.y] for point in grid_df.centroid])
    shop_coords = np.array([[point.x, point.y] for point in shops_gdf.geometry])
    
    print(f"Centroids shape: {centroids.shape}")
    print(f"Shop coordinates shape: {shop_coords.shape}")
    
    # array가 비어있는지 확인
    if centroids.shape[0] == 0 or shop_coords.shape[0] == 0:
        raise ValueError("Centroid or shop coordinates array is empty")
    
    # KD-Tree를 사용하여 상점 위치에 대한 거리 계산
    tree = cKDTree(shop_coords)
    
    # 각각의 인구 밀도 중심점에 대해, 근접한 상점 개수 계산
    for centroid, index in zip(centroids, grid_df.index):
        indices = tree.query_ball_point(centroid, buffer_distance)
        grid_df.at[index, 'accessibility'] = len(indices)
    
    return grid_df

def calculate_ppr_for_shops(grid_df, shops_gdf, buffer_distance):
    shops_gdf['PPR'] = 0
    
    # grid_df 중심점과 shops_gdf의 좌표를 numpy 배열로 변환
    centroids = np.array([[point.x, point.y] for point in grid_df['centroid']])
    shop_coords = np.array([[point.x, point.y] for point in shops_gdf.geometry])
    
    # KD-Tree를 사용하여 인구 위치에 대한 거리 계산
    tree = cKDTree(centroids)
    
    # 각각의 상점 위치에 대해, 근접한 격자의 인구 밀도 합계 계산
    for shop_idx, shop in shops_gdf.iterrows():
        indices = tree.query_ball_point([shop.geometry.x, shop.geometry.y], buffer_distance)
        ppr_value = grid_df.iloc[indices]['val'].sum()
        shops_gdf.at[shop_idx, 'PPR'] = 1/ppr_value
    
    return shops_gdf

def calculate_accessibility_for_grids(grid_df, shops_gdf, buffer_distance):
    grid_df['accessibility'] = 0
    
    # grid_df 중심점과 shops_gdf의 좌표를 numpy 배열로 변환
    centroids = np.array([[point.x, point.y] for point in grid_df.centroid])
    shop_coords = np.array([[point.x, point.y] for point in shops_gdf.geometry])
    shop_ppr_values = shops_gdf['PPR'].values
    
    # KD-Tree를 사용하여 상점 위치에 대한 거리 계산
    tree = cKDTree(shop_coords)
    
    # 각각의 인구 밀도(중심점)에 대해, buffer distance 내에 있는 상점들의 PPR 합계 계산
    for grid_idx, centroid in enumerate(centroids):
        indices = tree.query_ball_point(centroid, buffer_distance)
        accessibility_value = shop_ppr_values[indices].sum()
        grid_df.at[grid_idx, 'accessibility'] = accessibility_value
    
    return grid_df

In [102]:
import os

current_path = os.getcwd()
data_path = os.path.join(current_path, 'data')

# 데이터 로드 및 처리 예시
grid_file = os.path.join(data_path, '(B100)국토통계_인구정보-총 인구 수(전체)-(격자) 1KM_서울특별시_202404/nlsp_020001001.shp')  # 격자 데이터 파일 경로
shops_file = os.path.join(data_path,'서울시 대규모점포 인허가 정보.shp')  # 상점 데이터 파일 경로

#  분석에 사용할 버퍼 거리 (단위: 미터)
# walk_speed = 133.1
# buffer_distance = 5 * walk_speed / 100 * 60 # 미터 단위
buffer_distance = 1000

# 격자 및 상점 데이터 불러오기
grid_df = gpd.read_file(grid_file)
shops_gdf = gpd.read_file(shops_file, encoding='ISO-8859-1')

# 좌표계 변환
grid_df = grid_df.to_crs(epsg=5179)
shops_gdf = shops_gdf.to_crs(epsg=5179)


# column 명 변환
x_name = shops_gdf.columns[23]
y_name = shops_gdf.columns[24]
shops_gdf = shops_gdf.rename(columns={x_name: 'x', y_name: 'y'})
shops_gdf['geometry'] = shops_gdf.apply(lambda x: Point((float(x.x), float(x.y))), axis=1)

In [103]:
# 결측치 제거
shops_gdf = shops_gdf.dropna(subset=['x', 'y'])
grid_df = grid_df.dropna(subset=['lbl'])
grid_df = grid_df[grid_df['val'] != 0]
grid_df = grid_df[grid_df.geometry.notnull()]

grid_df.shape
# grid_df

(578, 4)

In [4]:
tqdm.pandas()

shops_gdf[['x', 'y']] = shops_gdf['geometry'].progress_apply(lambda geom: pd.Series(transform_2097_to_5179(geom.centroid.x, geom.centroid.y)))
shops_gdf['geometry'] = shops_gdf.apply(lambda x: Point((float(x.y), float(x.x))), axis=1)
# shops_gdf

100%|██████████| 922/922 [00:08<00:00, 113.75it/s]


In [5]:
# 격자 중심점 계산
grid_df = calculate_centroids(grid_df)


In [6]:
# 접근성 분석
grid_df = calculate_accessibility(grid_df, shops_gdf, buffer_distance)

# 결과 출력
print(grid_df.head())

Centroids shape: (578, 2)
Shop coordinates shape: (922, 2)
          gid       lbl      val  \
0  ë¤ì¬6453   8975.00   8975.0   
1  ë¤ì¬5651  12141.00  12141.0   
2  ë¤ì¬6157  28042.00  28042.0   
3  ë¤ì¬5858  22857.00  22857.0   
4  ë¤ì¬5347  12053.00  12053.0   

                                            geometry                centroid  \
0  POLYGON ((964000 1953000, 964000 1954000, 9650...  POINT (964500 1953500)   
1  POLYGON ((956000 1951000, 956000 1952000, 9570...  POINT (956500 1951500)   
2  POLYGON ((961000 1957000, 961000 1958000, 9620...  POINT (961500 1957500)   
3  POLYGON ((958000 1958000, 958000 1959000, 9590...  POINT (958500 1958500)   
4  POLYGON ((953000 1947000, 953000 1948000, 9540...  POINT (953500 1947500)   

   accessibility  
0              0  
1             36  
2              7  
3              4  
4              5  


In [7]:
# 4. 상점별 PPR 값 계산 (2SFCA 기법 - 1단계)
shops_gdf = calculate_ppr_for_shops(grid_df, shops_gdf, buffer_distance)

print(shops_gdf['PPR'])
print(shops_gdf.shape)

  shops_gdf.at[shop_idx, 'PPR'] = 1/ppr_value


0       0.000032
1       0.000105
2       0.000010
3       0.000013
4       0.000017
          ...   
999     0.000011
1000    0.000026
1001    0.000010
1002    0.000012
1003    0.000009
Name: PPR, Length: 922, dtype: float64
(922, 28)


In [8]:
# 5. 격자별 접근성 지수 계산 (2SFCA 기법 - 2단계)
# 격자별 접근성 지수 계산
grid_df = calculate_accessibility_for_grids(grid_df, shops_gdf, buffer_distance)

  grid_df.at[grid_idx, 'accessibility'] = accessibility_value


In [9]:
for idx, row in grid_df.iterrows():
    if row['accessibility'] != 0:
        print(f"격자 {row['lbl']}의 접근성 지수: {row['accessibility']}")

격자 12141.00의 접근성 지수: 0.0009882039831569907
격자 28042.00의 접근성 지수: 9.008501014204781e-05
격자 22857.00의 접근성 지수: 4.4161785932100954e-05
격자 12053.00의 접근성 지수: 0.00013487776194810546
격자 19321.00의 접근성 지수: 4.482898096140687e-05
격자 903.00의 접근성 지수: 0.00023432108589890172
격자 28864.00의 접근성 지수: 2.927283797253774e-05
격자 6816.00의 접근성 지수: 1.911826559094559e-05
격자 17218.00의 접근성 지수: 1.3097576948264571e-05
격자 12216.00의 접근성 지수: 7.856403409089225e-05
격자 9484.00의 접근성 지수: 0.0003051918397181634
격자 10977.00의 접근성 지수: 0.00022592977572662772
격자 24048.00의 접근성 지수: 5.8323463635154944e-05
격자 13988.00의 접근성 지수: 6.285843829446038e-05
격자 24014.00의 접근성 지수: 0.0001620543993950481
격자 35205.00의 접근성 지수: 0.00010496573549508264
격자 12726.00의 접근성 지수: 3.974508742433678e-05
격자 24287.00의 접근성 지수: 3.038622881490332e-05
격자 29129.00의 접근성 지수: 0.00021886337776891844
격자 22698.00의 접근성 지수: 7.072832587418955e-05
격자 14337.00의 접근성 지수: 2.9370267633324795e-05
격자 16077.00의 접근성 지수: 0.00010980687253298759
격자 16621.00의 접근성 지수: 0.0001190754029819499
격자 13

In [10]:
grid_df.dropna(subset=['geometry'], inplace=True)
grid_df.shape

(578, 6)

### 서울시 동 경계 데이터

In [11]:
geojson_file = 'EMD_Seoul.geojson'
gdf = gpd.read_file(geojson_file)
gdf.head()

Unnamed: 0,BASE_DATE,ADM_DR_CD,ADM_DR_NM,OBJECTID,geometry
0,20200630,1101053,사직동,1,"MULTIPOLYGON (((126.97399 37.57823, 126.974 37..."
1,20200630,1101054,삼청동,2,"MULTIPOLYGON (((126.97714 37.59768, 126.9773 3..."
2,20200630,1101055,부암동,3,"MULTIPOLYGON (((126.96173 37.60714, 126.96182 ..."
3,20200630,1101056,평창동,4,"MULTIPOLYGON (((126.97509 37.63118, 126.97488 ..."
4,20200630,1101057,무악동,5,"MULTIPOLYGON (((126.95975 37.58001, 126.96006 ..."


In [12]:
# 좌표계 변환 
grid_df, gdf = align_crs(grid_df, gdf)

In [13]:
# 포인트가 속한 행정동 경계 찾기
def find_admin_dong(point, gdf):
    for idx, row in gdf.iterrows():
        if row['geometry'].contains(point):  # point가 폴리곤 내부에 있는지 확인
            return row['ADM_DR_NM']  # 행정동 이름 반환
    return None  # 행정동 경계에 포함되지 않으면 None을 반환

grid_df['District_Name'] = grid_df.apply(lambda row: find_admin_dong(row['centroid'], gdf), axis=1)

print(grid_df['District_Name'].head(10))

0    면목7동
1     장충동
2    월계1동
3     송중동
4    한강로동
5    월곡2동
6     옥수동
7     강일동
8     창5동
9     광장동
Name: District_Name, dtype: object


In [14]:
grid_df.head(15)

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name
0,ë¤ì¬6453,8975.0,8975.0,"POLYGON ((964000 1953000, 964000 1954000, 9650...",POINT (964500 1953500),0.0,면목7동
1,ë¤ì¬5651,12141.0,12141.0,"POLYGON ((956000 1951000, 956000 1952000, 9570...",POINT (956500 1951500),0.000988,장충동
2,ë¤ì¬6157,28042.0,28042.0,"POLYGON ((961000 1957000, 961000 1958000, 9620...",POINT (961500 1957500),9e-05,월계1동
3,ë¤ì¬5858,22857.0,22857.0,"POLYGON ((958000 1958000, 958000 1959000, 9590...",POINT (958500 1958500),4.4e-05,송중동
4,ë¤ì¬5347,12053.0,12053.0,"POLYGON ((953000 1947000, 953000 1948000, 9540...",POINT (953500 1947500),0.000135,한강로동
5,ë¤ì¬5955,19321.0,19321.0,"POLYGON ((959000 1955000, 959000 1956000, 9600...",POINT (959500 1955500),4.5e-05,월곡2동
6,ë¤ì¬5748,903.0,903.0,"POLYGON ((957000 1948000, 957000 1949000, 9580...",POINT (957500 1948500),0.000234,옥수동
7,ë¤ì¬7152,15448.0,15448.0,"POLYGON ((971000 1952000, 971000 1953000, 9720...",POINT (971500 1952500),0.0,강일동
8,ë¤ì¬5961,28864.0,28864.0,"POLYGON ((959000 1961000, 959000 1962000, 9600...",POINT (959500 1961500),2.9e-05,창5동
9,ë¤ì¬6549,6816.0,6816.0,"POLYGON ((965000 1949000, 965000 1950000, 9660...",POINT (965500 1949500),1.9e-05,광장동


* 행정 구역 내에 속하지 않는 경계 지역 (29개)는 분석에서 제외.

In [15]:
for idx, row in grid_df.iterrows():
    if pd.isna(row['District_Name']):
        print(f"Missing geometry is {row['centroid']}")
        print(f"Missing district name for grid {idx}")

Missing geometry is POINT (950500 1962500)
Missing district name for grid 50
Missing geometry is POINT (966500 1940500)
Missing district name for grid 54
Missing geometry is POINT (968500 1941500)
Missing district name for grid 101
Missing geometry is POINT (960500 1965500)
Missing district name for grid 119
Missing geometry is POINT (944500 1942500)
Missing district name for grid 120
Missing geometry is POINT (971500 1949500)
Missing district name for grid 146
Missing geometry is POINT (942500 1942500)
Missing district name for grid 162
Missing geometry is POINT (939500 1941500)
Missing district name for grid 175
Missing geometry is POINT (947500 1958500)
Missing district name for grid 191
Missing geometry is POINT (939500 1942500)
Missing district name for grid 269
Missing geometry is POINT (949500 1961500)
Missing district name for grid 277
Missing geometry is POINT (956500 1939500)
Missing district name for grid 300
Missing geometry is POINT (959500 1966500)
Missing district name f

In [16]:
pd.isna(grid_df['District_Name']).sum()

29

In [17]:
grid_df = grid_df.dropna(subset=['District_Name'])
grid_df.shape

(549, 7)

### 서울 상권분석 (소비 매출) 데이터

In [18]:
sales_path = os.path.join(data_path, '서울시 상권분석서비스(소득소비-행정동).csv')
sales_data = pd.read_csv(sales_path, encoding='cp949')
# sales_data['Sales'] = None

# 2024-1분기 데이터만 사용
sales_data = sales_data[sales_data['기준_년분기_코드'] == 20241]

In [19]:
sales_data.head(5)

Unnamed: 0,기준_년분기_코드,행정동_코드,행정동_코드_명,월_평균_소득_금액,소득_구간_코드,지출_총금액,식료품_지출_총금액,의류_신발_지출_총금액,생활용품_지출_총금액,의료비_지출_총금액,교통_지출_총금액,교육_지출_총금액,유흥_지출_총금액,여가_문화_지출_총금액,기타_지출_총금액,음식_지출_총금액
1700,20241,11110680,창신2동,2112817,5,378781000,114626000,2615000,3824000,54984000,1872000,1625000,6167000,8797000,7498000,176773000
1701,20241,11110670,창신1동,2393308,5,885535000,150365000,56693000,1192000,227077000,754000,29247000,35303000,92980000,49143000,242781000
1702,20241,11170700,보광동,2545653,6,378073000,117293000,7390000,6950000,49139000,3287000,14383000,18357000,13263000,13958000,134053000
1703,20241,11140650,신당5동,3027979,7,916706000,164793000,22144000,13424000,135532000,38454000,4674000,55450000,107647000,17265000,357323000
1704,20241,11170510,후암동,3013677,7,732920000,308335000,12720000,10397000,36686000,11974000,30917000,7239000,22807000,28991000,262854000


In [20]:
for idx, row in grid_df.iterrows():
    district_name = row['District_Name']
    if district_name in sales_data['행정동_코드_명'].values:
        grid_df.at[idx, 'Sales'] = sales_data[sales_data['행정동_코드_명']== district_name]['소득_구간_코드'].iloc[0]
    
    else:
        grid_df.at[idx, 'Sales'] = 0

In [21]:
grid_df['Sales'].head()
print((grid_df['Sales'] == 0).sum()) # 모든 행정 구역 동 소득 구간 정보 확인.

13


In [22]:
grid_df

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name,Sales
0,ë¤ì¬6453,8975.00,8975.0,"POLYGON ((964000 1953000, 964000 1954000, 9650...",POINT (964500 1953500),0.000000,면목7동,6.0
1,ë¤ì¬5651,12141.00,12141.0,"POLYGON ((956000 1951000, 956000 1952000, 9570...",POINT (956500 1951500),0.000988,장충동,6.0
2,ë¤ì¬6157,28042.00,28042.0,"POLYGON ((961000 1957000, 961000 1958000, 9620...",POINT (961500 1957500),0.000090,월계1동,6.0
3,ë¤ì¬5858,22857.00,22857.0,"POLYGON ((958000 1958000, 958000 1959000, 9590...",POINT (958500 1958500),0.000044,송중동,6.0
4,ë¤ì¬5347,12053.00,12053.0,"POLYGON ((953000 1947000, 953000 1948000, 9540...",POINT (953500 1947500),0.000135,한강로동,9.0
...,...,...,...,...,...,...,...,...
604,ë¤ì¬4549,12779.00,12779.0,"POLYGON ((945000 1949000, 945000 1950000, 9460...",POINT (945500 1949500),0.000000,목5동,9.0
605,ë¤ì¬5142,40229.00,40229.0,"POLYGON ((951000 1942000, 951000 1943000, 9520...",POINT (951500 1942500),0.000000,청룡동,6.0
606,ë¤ì¬5548,11214.00,11214.0,"POLYGON ((955000 1948000, 955000 1949000, 9560...",POINT (955500 1948500),0.000000,이태원1동,7.0
607,ë¤ì¬5950,12661.00,12661.0,"POLYGON ((959000 1950000, 959000 1951000, 9600...",POINT (959500 1950500),0.000000,사근동,7.0


* 행정 구역이 종로 1,2,3,4 동 으로 묶여 있는 곳 존재. 해당 경우, sales data 존재하지 않음.

In [23]:
for _, row in grid_df.iterrows():
    if row['Sales'] == 0:
        grid_df = grid_df.drop(index=_, axis=0)
        print(f"Missing sales data for {row['District_Name']}")
        
grid_df.shape

Missing sales data for 상계3·4동
Missing sales data for 종로5·6가동
Missing sales data for 종로1·2·3·4가동
Missing sales data for 상계3·4동
Missing sales data for 종로1·2·3·4가동
Missing sales data for 상계6·7동
Missing sales data for 상계3·4동
Missing sales data for 면목3·8동
Missing sales data for 상계3·4동
Missing sales data for 중계2·3동
Missing sales data for 상계3·4동
Missing sales data for 종로1·2·3·4가동
Missing sales data for 금호2·3가동


(536, 8)

### 서울 평균 연령 데이터

In [24]:
age_path = os.path.join(data_path, '평균연령_20240726021229.csv')
age_data = pd.read_csv(age_path, encoding='utf-8')
age_data.tail()

Unnamed: 0,동별(1),동별(2),동별(3),2024 2/4,2024 2/4.1,2024 2/4.2
448,합계,강동구,천호2동,46.2,45.7,46.6
449,합계,강동구,길동,46.2,45.5,46.9
450,합계,강동구,강일동,43.1,41.7,44.3
451,합계,강동구,상일1동,40.2,39.8,40.5
452,합계,강동구,상일2동,40.7,39.6,41.6


In [25]:
for idx, row in grid_df.iterrows():
    district_name = row['District_Name']
    if district_name in age_data['동별(3)'].values:
        grid_df.at[idx, 'Average_age'] = age_data[age_data['동별(3)']== district_name]['2024 2/4'].iloc[0]
    
    else:
        grid_df.at[idx, 'Average_age'] = 0

In [26]:
grid_df['Average_age'].head()
print((grid_df['Average_age'] == 0).sum()) # 모든 행정 구역 동 평균 연령 정보 확인.

3


In [27]:
for _, row in grid_df.iterrows():
    if row['Average_age'] == 0:
        grid_df = grid_df.drop(index=_, axis=0)
        print(f"Missing Average_age data for {row['District_Name']}")
        
grid_df.shape

Missing Average_age data for 상일동
Missing Average_age data for 상일동
Missing Average_age data for 일원2동


(533, 9)

### Nearest large store 계산
* 인구 밀도 데이터의 각 위치에 대해 가장 가까운 상권 찾기! 

In [28]:
shops_gdf.head()
grid_df.head(5)

# 좌표계 5179(미터 단위 평면 좌표계) -> 4326(위도/경도) 좌표계로 변환
shops_gdf = shops_gdf.to_crs(epsg=4326)
grid_df = grid_df.to_crs(epsg=4326)
grid_df = calculate_centroids(grid_df)
# grid_df

# shops_gdf['geometry'] = shops_gdf.apply(lambda x: Point((float(x.y), float(x.x))), axis=1)
shops_gdf.head()


  grid_df['centroid'] = grid_df.geometry.centroid


Unnamed: 0,ê°ë°©ìì,ê´ë¦¬ë²í,ì¸íê°ì,ì¸íê°_1,ìììí,ììì_1,ìì¸ìì,ìì¸ì_1,íìì¼ì,í´ììì,...,ì¬ìì¥ë,ìµì¢ìì,ë°ì´í°ê,ë°ì´í_1,ìíêµ¬ë,x,y,ì í¬êµ¬ë,geometry,PPR
0,3170000,2.013317e+18,2013-09-06,,1,ìì/ì ì,1,ì ììì,,,...,ë§ë¦¬ì¤ìì¸ë  1ê´,2023/02/28 20:56:47.000,U,2022/12/03 00:03:00.000,ì¼íì¼í°,1942285.0,945553.391408,,POINT (126.8842 37.47819),3.2e-05
1,3170000,2.012317e+18,2012-09-13,,1,ìì/ì ì,1,ì ììì,,,...,ë§ë¦¬ì¤ìì¸ë  3ê´,2023/02/28 21:05:56.000,U,2022/12/03 00:03:00.000,ì¼íì¼í°,1942315.0,945370.100165,,POINT (126.88213 37.47846),0.000105
2,3140000,2.012314e+18,2012-04-05,,1,ìì/ì ì,1,ì ììì,,,...,ë¡¯ë°ìí¼ ëª©ë2ì ,2023/03/09 09:25:54.000,U,2022/12/02 23:01:00.000,êµ¬ë¶ìì,1948525.0,944344.700766,,POINT (126.87006 37.53436),1e-05
3,3180000,2.001318e+18,1999-05-31,,1,ìì/ì ì,1,ì ììì,,,...,ìë±í¬ì íµìê°,2023/03/09 15:28:54.000,U,2022/12/02 23:01:00.000,ê·¸ ë°ì ëê·ëª¨ì í¬,1947194.0,946191.860377,,POINT (126.89106 37.52248),1.3e-05
4,3210000,2.012321e+18,2012-04-24,,1,ìì/ì ì,1,ì ììì,,,...,ë¡¯ë°ìí¼ ë°©ë°°2ì ,2023/03/13 17:40:32.000,U,2022/12/02 23:05:00.000,ëíë§í¸,1943681.0,954598.673411,,POINT (126.98641 37.49127),1.7e-05


* 오류 : shops_gdf의 gemoetry 변환 시, 좌표계가 초기화 됨! -->

In [29]:
# shops_gdf['geometry'] = shops_gdf.apply(lambda x: Point((float(x.y), float(x.x))), axis=1)
# grid_df['centroid'] = grid_df.apply(lambda x: Point((float(x.centroid.y), float(x.centroid.x))), axis=1)

# shops_gdf.head()

In [30]:
from geopy.distance import geodesic

# 새로운 컬럼을 추가할 빈 데이터프레임 생성
grid_df['Nearest_dist'] = float('inf')  # 초기값을 무한대로 설정

# 인구 밀도 데이터의 각 위치에 대해 가장 가까운 상권 찾기
for _, row in tqdm(grid_df.iterrows(), total=grid_df.shape[0]):
 
    point = row['centroid']
    point_coords = (point.y, point.x) 
    
    # 각 상권 위치와 인구 밀도 위치 간의 거리 계산
    for idx, store_row in shops_gdf.iterrows():
        store_point = store_row['geometry']
        # Assuming 'geometry' is a shapely Point, extract latitude and longitude
        store_point_coords = (store_point.y, store_point.x)  # (latitude, longitude)
        distance = geodesic(point_coords, store_point_coords).kilometers
        
        # 가장 가까운 상권까지의 거리 업데이트
        if distance < grid_df.loc[_, 'Nearest_dist']:
            grid_df.loc[_, 'Nearest_dist'] = distance
            


100%|██████████| 533/533 [01:11<00:00,  7.44it/s]


In [31]:
grid_df.head(10)

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name,Sales,Average_age,Nearest_dist
0,ë¤ì¬6453,8975.0,8975.0,"POLYGON ((127.0923 37.57567, 127.09225 37.5846...",POINT (127.09794 37.5802),0.0,면목7동,6.0,47.6,1.204582
1,ë¤ì¬5651,12141.0,12141.0,"POLYGON ((127.00183 37.5573, 127.00177 37.5663...",POINT (127.00746 37.56183),0.000988,장충동,6.0,43.3,0.418711
2,ë¤ì¬6157,28042.0,28042.0,"POLYGON ((127.05812 37.61161, 127.05806 37.620...",POINT (127.06376 37.61613),9e-05,월계1동,6.0,46.0,0.251338
3,ë¤ì¬5858,22857.0,22857.0,"POLYGON ((127.02407 37.62049, 127.02401 37.629...",POINT (127.02971 37.62502),4.4e-05,송중동,6.0,47.6,0.472307
4,ë¤ì¬5347,12053.0,12053.0,"POLYGON ((126.96812 37.5211, 126.96805 37.5301...",POINT (126.97374 37.52563),0.000135,한강로동,9.0,43.5,0.734863
5,ë¤ì¬5955,19321.0,19321.0,"POLYGON ((127.03557 37.59349, 127.03551 37.602...",POINT (127.0412 37.59802),4.5e-05,월곡2동,7.0,45.5,0.530838
6,ë¤ì¬5748,903.0,903.0,"POLYGON ((127.01332 37.53031, 127.01327 37.539...",POINT (127.01895 37.53484),0.000234,옥수동,8.0,43.9,0.644729
7,ë¤ì¬7152,15448.0,15448.0,"POLYGON ((127.17162 37.56691, 127.17158 37.575...",POINT (127.17726 37.57143),0.0,강일동,7.0,43.1,2.161967
8,ë¤ì¬5961,28864.0,28864.0,"POLYGON ((127.03523 37.64757, 127.03518 37.656...",POINT (127.04087 37.6521),2.9e-05,창5동,7.0,46.5,0.314802
9,ë¤ì¬6549,6816.0,6816.0,"POLYGON ((127.10382 37.53966, 127.10377 37.548...",POINT (127.10946 37.54419),1.9e-05,광장동,9.0,40.9,0.87021


### Average distance to stores 
* 가장 가까운 상권 3개와의 거리 평균

In [32]:
# 새로운 컬럼을 추가할 빈 데이터프레임 생성
grid_df['Average_dist'] = float('inf')  # 초기값을 무한대로 설정

for _, row in tqdm(grid_df.iterrows(), total=grid_df.shape[0]):
 
    point = row['centroid']
    point_coords = (point.y, point.x) # (latitude, longitude) 
    
    distances = []
    # 각 상권 위치와 인구 밀도 위치 간의 거리 계산
    for idx, store_row in shops_gdf.iterrows():
        store_point = store_row['geometry']

        store_point_coords = (store_point.y, store_point.x)  # (latitude, longitude)
        distance = geodesic(point_coords, store_point_coords).kilometers
        distances.append(distance)
        
        # 가까운 순서대로 상권 거리 정렬
    distances.sort()
    avg_distance = sum(distances[:3])/3
    grid_df.loc[_, 'Average_dist'] = avg_distance

100%|██████████| 533/533 [01:07<00:00,  7.90it/s]


In [36]:
grid_df.head(10)

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name,Sales,Average_age,Nearest_dist,Average_dist,Store_num
0,ë¤ì¬6453,8975.0,8975.0,"POLYGON ((127.0923 37.57567, 127.09225 37.5846...",POINT (127.09794 37.5802),0.0,면목7동,6.0,47.6,1.204582,1.310634,0
1,ë¤ì¬5651,12141.0,12141.0,"POLYGON ((127.00183 37.5573, 127.00177 37.5663...",POINT (127.00746 37.56183),0.000988,장충동,6.0,43.3,0.418711,0.418711,0
2,ë¤ì¬6157,28042.0,28042.0,"POLYGON ((127.05812 37.61161, 127.05806 37.620...",POINT (127.06376 37.61613),9e-05,월계1동,6.0,46.0,0.251338,0.650095,0
3,ë¤ì¬5858,22857.0,22857.0,"POLYGON ((127.02407 37.62049, 127.02401 37.629...",POINT (127.02971 37.62502),4.4e-05,송중동,6.0,47.6,0.472307,0.563379,0
4,ë¤ì¬5347,12053.0,12053.0,"POLYGON ((126.96812 37.5211, 126.96805 37.5301...",POINT (126.97374 37.52563),0.000135,한강로동,9.0,43.5,0.734863,0.781898,0
5,ë¤ì¬5955,19321.0,19321.0,"POLYGON ((127.03557 37.59349, 127.03551 37.602...",POINT (127.0412 37.59802),4.5e-05,월곡2동,7.0,45.5,0.530838,0.731857,0
6,ë¤ì¬5748,903.0,903.0,"POLYGON ((127.01332 37.53031, 127.01327 37.539...",POINT (127.01895 37.53484),0.000234,옥수동,8.0,43.9,0.644729,0.753232,0
7,ë¤ì¬7152,15448.0,15448.0,"POLYGON ((127.17162 37.56691, 127.17158 37.575...",POINT (127.17726 37.57143),0.0,강일동,7.0,43.1,2.161967,2.614594,0
8,ë¤ì¬5961,28864.0,28864.0,"POLYGON ((127.03523 37.64757, 127.03518 37.656...",POINT (127.04087 37.6521),2.9e-05,창5동,7.0,46.5,0.314802,0.751128,0
9,ë¤ì¬6549,6816.0,6816.0,"POLYGON ((127.10382 37.53966, 127.10377 37.548...",POINT (127.10946 37.54419),1.9e-05,광장동,9.0,40.9,0.87021,1.107812,0


### Count of stores within 1 km
* 서울시 내 근방 1Km 내에 있는 편의점 개수 
* (서울 시 내 편의점의 개수는 6964개)

### 

In [34]:
store_data = pd.read_csv(os.path.join(data_path, '서울시편의점정보.csv'))
store_data['X'], store_data['Y'] = transform_3857_to_4326(store_data['X'], store_data['Y'])

In [37]:
store_data.tail()

Unnamed: 0,OBJT_ID,FCLTY_TY,FCLTY_CD,FCLTY_NM,ADRES,RN_ADRES,TELNO,CTPRVN_CD,SGG_CD,EMD_CD,X,Y,DATA_YR
6960,64320,편의점,509010,CJ편의점,서울특별시 동작구 신대방동 425,서울특별시 동작구 보라매로5길 20,,11.0,11590.0,11590109.0,37.493,126.924318,2022.0
6961,64321,편의점,509010,GS25,서울특별시 성북구 종암동 70-14,서울특별시 성북구 종암로19길 24,,11.0,11290.0,11290135.0,37.598175,127.033286,2022.0
6962,64322,편의점,509010,GS25,서울특별시 송파구 잠실동 10,서울특별시 송파구 올림픽로 25,,11.0,11710.0,11710101.0,37.516188,127.075928,2022.0
6963,64323,편의점,509010,CU,서울특별시 강남구 세곡동 587,서울특별시 강남구 헌릉로569길 9,,11.0,11680.0,11680111.0,37.466785,127.100762,2022.0
6964,64324,편의점,509010,위드미도림금빛점,서울특별시 영등포구 도림동 234-1,서울특별시 영등포구 도림로113길 17,,11.0,11560.0,11560118.0,37.508813,126.897895,2022.0


In [38]:
grid_df['Store_num'] = int(0)  # 초기값 0개

for _, row in tqdm(grid_df.iterrows(), total=grid_df.shape[0]):
 
    point = row['centroid']
    point_coords = (point.y, point.x) # (latitude, longitude) 
    
    store_num = 0
    # 각 편의점 위치와 인구 밀도 위치 간의 거리 계산
    for idx, store_row in store_data.iterrows():
        store_point_coords = (store_row['X'], store_row['Y'])  # (latitude, longitude)
        distance = geodesic(point_coords, store_point_coords).kilometers
        if distance <=1:
            store_num += 1
    
    grid_df.loc[_, 'Store_num'] = store_num

100%|██████████| 533/533 [07:37<00:00,  1.16it/s]


In [40]:
grid_df.head()

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name,Sales,Average_age,Nearest_dist,Average_dist,Store_num
0,ë¤ì¬6453,8975.0,8975.0,"POLYGON ((127.0923 37.57567, 127.09225 37.5846...",POINT (127.09794 37.5802),0.0,면목7동,6.0,47.6,1.204582,1.310634,21
1,ë¤ì¬5651,12141.0,12141.0,"POLYGON ((127.00183 37.5573, 127.00177 37.5663...",POINT (127.00746 37.56183),0.000988,장충동,6.0,43.3,0.418711,0.418711,84
2,ë¤ì¬6157,28042.0,28042.0,"POLYGON ((127.05812 37.61161, 127.05806 37.620...",POINT (127.06376 37.61613),9e-05,월계1동,6.0,46.0,0.251338,0.650095,43
3,ë¤ì¬5858,22857.0,22857.0,"POLYGON ((127.02407 37.62049, 127.02401 37.629...",POINT (127.02971 37.62502),4.4e-05,송중동,6.0,47.6,0.472307,0.563379,50
4,ë¤ì¬5347,12053.0,12053.0,"POLYGON ((126.96812 37.5211, 126.96805 37.5301...",POINT (126.97374 37.52563),0.000135,한강로동,9.0,43.5,0.734863,0.781898,27


### Accessibility via public transport
* 버스정류장 및 지하철의 접근성(반경 500M 내)

In [43]:
bus_station = pd.read_csv(os.path.join(data_path, '2024년1~4월1일기준_서울시버스정류소위치정보.csv'), encoding='cp949')
bus_station.tail()

Unnamed: 0,STDR_DE,NODE_ID,STTN_NO,STTN_NM,CRDNT_X,CRDNT_Y,STTN_TY,Unnamed: 7
50460,20240401,124000334,25995,우성아파트,127.139338,37.550386,0,
50461,20240401,124000333,25996,우성아파트,127.140046,37.550643,0,
50462,20240401,124000332,25997,조일약국,127.123596,37.53363,0,
50463,20240401,124000331,25998,성내시장,127.125497,37.536155,0,
50464,20240401,124000330,25999,천호우체국.로데오거리,127.127337,37.540343,0,


In [44]:
grid_df['Bus_station_num'] = int(0)  # 초기값 0개

for _, row in tqdm(grid_df.iterrows(), total=grid_df.shape[0]):
 
    point = row['centroid']
    point_coords = (point.y, point.x) # (latitude, longitude) 
    
    bus_num = 0
    # 각 편의점 위치와 인구 밀도 위치 간의 거리 계산
    for idx, bus_row in bus_station.iterrows():
        bus_point_coords = (bus_row['CRDNT_Y'], bus_row['CRDNT_X'])  # (latitude, longitude)
        distance = geodesic(point_coords, bus_point_coords).kilometers
        if distance <=0.5:
            bus_num += 1
    
    grid_df.loc[_, 'Bus_station_num'] = bus_num

100%|██████████| 533/533 [55:18<00:00,  6.23s/it]


In [64]:
# grid_df.to_csv('1st_data.csv', index=False)
grid_df.head()

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name,Sales,Average_age,Nearest_dist,Average_dist,Store_num,Bus_station_num,Subway_station_num,total_sum,total_sum_category
0,ë¤ì¬6453,8975.0,0.199884,"POLYGON ((127.0923 37.57567, 127.09225 37.5846...",POINT (127.09794 37.5802),0.0,면목7동,0.25,0.643275,0.295605,0.30769,0.166667,0.057143,0.0,1.920264,Low
1,ë¤ì¬5651,12141.0,0.270458,"POLYGON ((127.00183 37.5573, 127.00177 37.5663...",POINT (127.00746 37.56183),0.651324,장충동,0.25,0.391813,0.099731,0.08378,0.666667,0.328571,1.0,3.742344,Very High
2,ë¤ì¬6157,28042.0,0.624908,"POLYGON ((127.05812 37.61161, 127.05806 37.620...",POINT (127.06376 37.61613),0.059375,월계1동,0.25,0.549708,0.058014,0.141867,0.34127,0.657143,0.25,2.932285,High
3,ë¤ì¬5858,22857.0,0.509329,"POLYGON ((127.02407 37.62049, 127.02401 37.629...",POINT (127.02971 37.62502),0.029107,송중동,0.25,0.643275,0.11309,0.120098,0.396825,0.542857,0.25,2.854581,High
4,ë¤ì¬5347,12053.0,0.268496,"POLYGON ((126.96812 37.5211, 126.96805 37.5301...",POINT (126.97374 37.52563),0.088898,한강로동,1.0,0.403509,0.17853,0.174955,0.214286,0.057143,0.25,2.635817,Medium


In [46]:
subway_station = pd.read_csv(os.path.join(data_path, '서울교통공사_1_8호선 역사 좌표(위경도) 정보_20231031.csv'), encoding='cp949')
subway_station.tail()

Unnamed: 0,연번,호선,고유역번호(외부역코드),역명,위도,경도,작성일자
271,272,8,2823,남한산성입구,37.451568,127.159845,1996-10-31
272,273,8,2824,단대오거리,37.445057,127.156735,1996-12-28
273,274,8,2825,신흥,37.440952,127.14759,1996-12-28
274,275,8,2826,수진,37.437575,127.140936,1996-12-28
275,276,8,2827,모란,37.433888,127.129921,1996-11-30


In [48]:
grid_df['Subway_station_num'] = int(0)  # 초기값 0개

for _, row in tqdm(grid_df.iterrows(), total=grid_df.shape[0]):
 
    point = row['centroid']
    point_coords = (point.y, point.x) # (latitude, longitude) 
    
    sub_num = 0
    # 각 편의점 위치와 인구 밀도 위치 간의 거리 계산
    for idx, sub_row in subway_station.iterrows():
        sub_point_coords = (sub_row['위도'], sub_row['경도'])  # (latitude, longitude)
        distance = geodesic(point_coords, sub_point_coords).kilometers
        if distance <=0.5:
            sub_num += 1
    
    grid_df.loc[_, 'Subway_station_num'] = sub_num

100%|██████████| 533/533 [00:18<00:00, 28.69it/s]


In [50]:
grid_df.to_csv('Food_desert_raw_data.csv')

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name,Sales,Average_age,Nearest_dist,Average_dist,Store_num,Bus_station_num,Subway_station_num
0,ë¤ì¬6453,8975.0,8975.0,"POLYGON ((127.0923 37.57567, 127.09225 37.5846...",POINT (127.09794 37.5802),0.0,면목7동,6.0,47.6,1.204582,1.310634,21,16,0
1,ë¤ì¬5651,12141.0,12141.0,"POLYGON ((127.00183 37.5573, 127.00177 37.5663...",POINT (127.00746 37.56183),0.000988,장충동,6.0,43.3,0.418711,0.418711,84,92,4
2,ë¤ì¬6157,28042.0,28042.0,"POLYGON ((127.05812 37.61161, 127.05806 37.620...",POINT (127.06376 37.61613),9e-05,월계1동,6.0,46.0,0.251338,0.650095,43,184,1
3,ë¤ì¬5858,22857.0,22857.0,"POLYGON ((127.02407 37.62049, 127.02401 37.629...",POINT (127.02971 37.62502),4.4e-05,송중동,6.0,47.6,0.472307,0.563379,50,152,1
4,ë¤ì¬5347,12053.0,12053.0,"POLYGON ((126.96812 37.5211, 126.96805 37.5301...",POINT (126.97374 37.52563),0.000135,한강로동,9.0,43.5,0.734863,0.781898,27,16,1


### 열 속성 값 정규화
* 0~1 사이 값으로 scale 조정

In [58]:
grid_df['Average_age'] = grid_df['Average_age'].astype(float)

In [60]:
from sklearn.preprocessing import MinMaxScaler

# Instantiate the scaler
scaler = MinMaxScaler()

# Assuming grid_df is your DataFrame and you want to scale all numerical columns
negative_columns = ['Average_age', 'Nearest_dist', 'Average_dist']
positive_columns = ['accessibility', 'Sales', 'Store_num', 'Bus_station_num', 'Subway_station_num']
# Fit and transform the data
for col in positive_columns:
    for idx, row in grid_df.iterrows():
        if row[col] != 0:
            grid_df.at[idx, col] = 1/row[col]


grid_df[negative_columns] = scaler.fit_transform(grid_df[negative_columns])
grid_df[positive_columns] = scaler.fit_transform(grid_df[positive_columns])

grid_df.head()

Unnamed: 0,gid,lbl,val,geometry,centroid,accessibility,District_Name,Sales,Average_age,Nearest_dist,Average_dist,Store_num,Bus_station_num,Subway_station_num
0,ë¤ì¬6453,8975.0,0.199884,"POLYGON ((127.0923 37.57567, 127.09225 37.5846...",POINT (127.09794 37.5802),0.0,면목7동,0.25,0.643275,0.295605,0.30769,0.166667,0.057143,0.0
1,ë¤ì¬5651,12141.0,0.270458,"POLYGON ((127.00183 37.5573, 127.00177 37.5663...",POINT (127.00746 37.56183),0.651324,장충동,0.25,0.391813,0.099731,0.08378,0.666667,0.328571,1.0
2,ë¤ì¬6157,28042.0,0.624908,"POLYGON ((127.05812 37.61161, 127.05806 37.620...",POINT (127.06376 37.61613),0.059375,월계1동,0.25,0.549708,0.058014,0.141867,0.34127,0.657143,0.25
3,ë¤ì¬5858,22857.0,0.509329,"POLYGON ((127.02407 37.62049, 127.02401 37.629...",POINT (127.02971 37.62502),0.029107,송중동,0.25,0.643275,0.11309,0.120098,0.396825,0.542857,0.25
4,ë¤ì¬5347,12053.0,0.268496,"POLYGON ((126.96812 37.5211, 126.96805 37.5301...",POINT (126.97374 37.52563),0.088898,한강로동,1.0,0.403509,0.17853,0.174955,0.214286,0.057143,0.25


In [98]:
total_columns = ['Average_age', 'Nearest_dist', 'Average_dist', 'accessibility', 'Sales', 'Store_num', 'Bus_station_num', 'Subway_station_num']
grid_df['total_sum'] = grid_df[total_columns].sum(axis=1)
grid_df['total_sum_category'] = pd.cut(grid_df['total_sum'], bins=4, labels=['Low', 'Medium', 'High', 'Very High'])

In [99]:
grid_df.head()

Unnamed: 0,gid,lbl,val,geometry,accessibility,District_Name,Sales,Average_age,Nearest_dist,Average_dist,Store_num,Bus_station_num,Subway_station_num,total_sum,total_sum_category
0,ë¤ì¬6453,8975.0,0.199884,"POLYGON ((127.0923 37.57567, 127.09225 37.5846...",0.0,면목7동,0.25,0.643275,0.295605,0.30769,6.0,17.5,0.0,25.196454,Low
1,ë¤ì¬5651,12141.0,0.270458,"POLYGON ((127.00183 37.5573, 127.00177 37.5663...",0.651324,장충동,0.25,0.391813,0.099731,0.08378,1.5,3.043478,1.0,7.290584,Low
2,ë¤ì¬6157,28042.0,0.624908,"POLYGON ((127.05812 37.61161, 127.05806 37.620...",0.059375,월계1동,0.25,0.549708,0.058014,0.141867,2.930233,1.521739,4.0,10.135844,Low
3,ë¤ì¬5858,22857.0,0.509329,"POLYGON ((127.02407 37.62049, 127.02401 37.629...",0.029107,송중동,0.25,0.643275,0.11309,0.120098,2.52,1.842105,4.0,10.027003,Low
4,ë¤ì¬5347,12053.0,0.268496,"POLYGON ((126.96812 37.5211, 126.96805 37.5301...",0.088898,한강로동,1.0,0.403509,0.17853,0.174955,4.666667,17.5,4.0,28.281055,Low


### Folium map 시각화

In [95]:
import folium
import geopandas as gpd
from folium import LayerControl
from branca.colormap import linear

# grid_df = grid_df.drop(columns=['centroid'])
geojson_data = grid_df.to_json()

# Create a base map
m = folium.Map(location=[37.57567, 127.0923], zoom_start=12)

# Define the color scale for the choropleth map
color_scale = linear.YlGnBu_09.scale(grid_df['total_sum'].min(), grid_df['total_sum'].max())
color_scale.caption = 'Total Sum Intensity'

# Function to apply color scale
def style_function(feature):
    total_sum = feature['properties']['total_sum']
    return {
        'fillColor': color_scale(total_sum),
        'color': 'black',
        'weight': 0.5,
        'fillOpacity': 0.5,
        'lineOpacity': 0.1
    }

# Add GeoJson layer
folium.GeoJson(
    geojson_data,
    style_function=style_function,
    name='choropleth',
    tooltip=folium.GeoJsonTooltip(fields=['val', 'total_sum'], aliases=['ID', 'Total Sum'])
).add_to(m)

# Add color scale to the map
color_scale.add_to(m)

# Add layer control
LayerControl().add_to(m)

# Save or display the map
m.save('map.html')  # 지도 저장
m  # Jupyter Notebook에서 지도 표시