In [None]:
# 기본적인 부분
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rc("font", family="Malgun Gothic")
plt.rcParams["axes.unicode_minus"]=False

# 데이터 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# 학습 알고리즘
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet

from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report
from scipy.special import expit, softmax

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the three Seoul commercial-district CSV files (estimated sales,
# working population, street-level population); all are read as cp949.
csv_options = {"encoding": "cp949"}
sales_df = pd.read_csv("../data/서울시 상권분석서비스(추정매출-상권).csv", **csv_options)
work_df = pd.read_csv("../data/서울시 상권분석서비스(직장인구-상권).csv", **csv_options)
street_df = pd.read_csv("../data/서울시 상권분석서비스(길단위인구-상권).csv", **csv_options)

In [None]:
# Split the sales table by the prefix of the service-category code.
# na=False treats a missing code as "no match"; without it a NaN in the mask
# makes the boolean indexing below raise.
service_code = sales_df["서비스_업종_코드"]
seoul_restaurant_sales = sales_df[service_code.str.startswith("CS1", na=False)].copy()  # food service
seoul_etc_sales = sales_df[service_code.str.startswith("CS2", na=False)].copy()  # other services
seoul_retail_sales = sales_df[service_code.str.startswith("CS3", na=False)].copy()  # retail

In [None]:
# Average sales per district/quarter, merged with the population tables
def process_sales_data(df, category_name, street=None, work=None):
    """Average the sales columns per district and quarter, then attach population data.

    Args:
        df: Sales rows for one service category. Must contain the key columns
            '상권_코드_명' and '기준_년분기_코드' plus the monthly and per-weekday
            sales amount columns. The frame is not modified.
        category_name: Label written to the '서비스업종' column of the result.
        street: Table left-joined first on the two key columns. Defaults to the
            module-level ``street_df``.
        work: Table left-joined second on the two key columns. Defaults to the
            module-level ``work_df``.

    Returns:
        A new DataFrame holding the per-(district, quarter) mean of each sales
        column (monthly sales renamed to '평균매출'), the columns of ``street``
        and ``work`` for matching keys, and a constant '서비스업종' column.
    """
    key_cols = ['상권_코드_명', '기준_년분기_코드']
    sales_cols = [
        '당월_매출_금액', '월요일_매출_금액', '화요일_매출_금액', '수요일_매출_금액',
        '목요일_매출_금액', '금요일_매출_금액', '토요일_매출_금액', '일요일_매출_금액',
    ]

    # Fall back to the notebook-level tables only when none are supplied.
    street = street_df if street is None else street
    work = work_df if work is None else work

    mean_sales = (
        df.groupby(key_cols)[sales_cols]
        .mean()
        .reset_index()
        .rename(columns={'당월_매출_금액': '평균매출'})
    )

    # Left joins keep every sales group even when population data is missing.
    merged = mean_sales.merge(street, on=key_cols, how='left')
    merged = merged.merge(work, on=key_cols, how='left')
    merged['서비스업종'] = category_name
    return merged

# Build the merged table for each top-level service category
restaurant_merged = process_sales_data(df=seoul_restaurant_sales, category_name='요식업')
etc_merged = process_sales_data(df=seoul_etc_sales, category_name='기타서비스업')
retail_merged = process_sales_data(df=seoul_retail_sales, category_name='소매업')

# Stack the three category tables into one frame with a fresh 0..n-1 index
category_frames = [restaurant_merged, etc_merged, retail_merged]
final_df = pd.concat(category_frames, ignore_index=True)

In [None]:
# Remove an existing district-type name column (if any) so the merge below
# does not produce a duplicate of it
final_df = final_df.drop(columns=['상권_구분_코드_명'], errors='ignore')

# Unique (district name, district-type name) pairs taken from the raw sales table
area_code_map = sales_df.loc[:, ['상권_코드_명', '상권_구분_코드_명']].drop_duplicates()

# Attach the district-type name to every row by district name
final_df = final_df.merge(area_code_map, how='left', on='상권_코드_명')

# Column layout: identifier columns, then weekday sales, then everything else
# whose name contains neither '매출' nor '코드'
base_cols = ['기준_년분기_코드', '상권_구분_코드_명', '상권_코드_명', '서비스업종', '평균매출']
weekday_cols = ['월요일_매출_금액', '화요일_매출_금액', '수요일_매출_금액',
                '목요일_매출_금액', '금요일_매출_금액', '토요일_매출_금액', '일요일_매출_금액']
leading_cols = base_cols + weekday_cols
population_cols = [
    col
    for col in final_df.columns
    if col not in leading_cols and not ('매출' in col or '코드' in col)
]

final_df = final_df[leading_cols + population_cols]

In [None]:
# Print the dtype / non-null summary of the merged table
final_df.info()

In [None]:
# Save the merged table to the working directory as cp949 CSV, without the index
final_df.to_csv("최종_상권_업종_인구_통합.csv", index=False, encoding="cp949")

In [None]:
# Show the column labels of the merged table
final_df.columns

In [None]:
# Work on a copy so the steps below leave final_df unchanged
final_df_copy=final_df.copy()

In [None]:
# Show the column labels of the working copy
final_df_copy.columns

In [None]:
# Display the working copy
final_df_copy

In [None]:
# Collapse the six age bands into three groups (10s+20s, 30s+40s, 50s+60-and-over),
# first for the floating-population columns, then for the working-population columns.
# Each entry is (new column, first source column, second source column).
age_group_sources = [
    ("초년_유동인구_수", "연령대_10_유동인구_수", "연령대_20_유동인구_수"),
    ("중년_유동인구_수", "연령대_30_유동인구_수", "연령대_40_유동인구_수"),
    ("노년_유동인구_수", "연령대_50_유동인구_수", "연령대_60_이상_유동인구_수"),
    ("초년_직장_인구_수", "연령대_10_직장_인구_수", "연령대_20_직장_인구_수"),
    ("중년_직장_인구_수", "연령대_30_직장_인구_수", "연령대_40_직장_인구_수"),
    ("노년_직장_인구_수", "연령대_50_직장_인구_수", "연령대_60_이상_직장_인구_수"),
]
for target, first, second in age_group_sources:
    final_df_copy[target] = final_df_copy[first] + final_df_copy[second]

# Not dropped here (left commented out in the original):
# "상권_구분_코드_x", "상권_구분_코드_y", "상권_코드_x", "상권_코드_y", "상권_구분_코드_명_y",

# Detail columns to remove, grouped by kind
weekday_floating = [
    "월요일_유동인구_수", "화요일_유동인구_수", "수요일_유동인구_수", "목요일_유동인구_수",
    "금요일_유동인구_수", "토요일_유동인구_수", "일요일_유동인구_수",
]
age_floating = [
    "연령대_10_유동인구_수", "연령대_20_유동인구_수", "연령대_30_유동인구_수",
    "연령대_40_유동인구_수", "연령대_50_유동인구_수", "연령대_60_이상_유동인구_수",
]
age_working = [
    "연령대_10_직장_인구_수", "연령대_20_직장_인구_수", "연령대_30_직장_인구_수",
    "연령대_40_직장_인구_수", "연령대_50_직장_인구_수", "연령대_60_이상_직장_인구_수",
]
time_floating = [
    "시간대_00_06_유동인구_수", "시간대_06_11_유동인구_수", "시간대_11_14_유동인구_수",
    "시간대_14_17_유동인구_수", "시간대_17_21_유동인구_수", "시간대_21_24_유동인구_수",
]
male_age_working = [
    "남성연령대_10_직장_인구_수", "남성연령대_20_직장_인구_수", "남성연령대_30_직장_인구_수",
    "남성연령대_40_직장_인구_수", "남성연령대_50_직장_인구_수", "남성연령대_60_이상_직장_인구_수",
]
female_age_working = [
    "여성연령대_10_직장_인구_수", "여성연령대_20_직장_인구_수", "여성연령대_30_직장_인구_수",
    "여성연령대_40_직장_인구_수", "여성연령대_50_직장_인구_수", "여성연령대_60_이상_직장_인구_수",
]
columns_to_drop = (
    weekday_floating + age_floating + age_working
    + time_floating + male_age_working + female_age_working
)

final_df_copy_drop = final_df_copy.drop(columns_to_drop, axis=1)
#final_df_copy_drop.columns = final_df_copy_drop.columns.str.replace("_x", "")
final_df_copy_drop.info()

In [None]:
# Show the columns that remain after the drop
final_df_copy_drop.columns

In [None]:
# final_df_copy_drop=final_df_copy_drop[["기준_년분기_코드", "상권_구분_코드_명", "상권_코드_명","서비스업종","평균매출",
#                                         "월요일_매출_금액", "화요일_매출_금액", "수요일_매출_금액", "목요일_매출_금액",
#                                         "금요일_매출_금액", "토요일_매출_금액", "일요일_매출_금액",
#                                         "총_유동인구_수", "남성_유동인구_수", "여성_유동인구_수", 
#                                         "초년_유동인구_수", "중년_유동인구_수", "노년_유동인구_수", 
#                                         "총_직장_인구_수", "남성_직장_인구_수", "여성_직장_인구_수",
#                                         "초년_직장_인구_수", "중년_직장_인구_수", "노년_직장_인구_수"]]

In [None]:
# Preview the first rows of the reduced table
final_df_copy_drop.head()

In [None]:
# Rows that have a missing value in at least one column
row_has_null = final_df_copy_drop.isnull().any(axis=1)
missing_rows = final_df_copy_drop.loc[row_has_null]
missing_rows

In [None]:
# Copy before imputing so final_df_copy_drop keeps its missing values
final_df_copy_drop_copy=final_df_copy_drop.copy()

In [None]:
# Select the numeric columns that contain at least one missing value.
# Text columns are skipped: median() is not defined for them, so a text column
# with a NaN would otherwise make the loop below raise.
column_has_null = final_df_copy_drop_copy.isnull().any()
numeric_cols = final_df_copy_drop_copy.select_dtypes(include="number").columns
cols_to_fill = numeric_cols[column_has_null[numeric_cols].to_numpy()]

# Replace missing values with the column median
for col in cols_to_fill:
    median_value = final_df_copy_drop_copy[col].median()
    final_df_copy_drop_copy[col] = final_df_copy_drop_copy[col].fillna(median_value)

# Save the imputed table
final_df_copy_drop_copy.to_csv("최종_상권_업종_인구_통합_결측처리.csv", index=False, encoding="cp949")

In [None]:
# Print the dtype / non-null summary after imputation
final_df_copy_drop_copy.info()

In [None]:
# Show the column labels of the imputed table
final_df_copy_drop_copy.columns

In [None]:
# Count the remaining missing values per column
final_df_copy_drop_copy.isna().sum()

In [None]:
# One-hot encode the district-type name; drop_first=False keeps one indicator
# column per category
final_df_copy_drop_copy = pd.get_dummies(
    data=final_df_copy_drop_copy,
    columns=["상권_구분_코드_명"],
    drop_first=False,
)
# The equivalent get_dummies call for "서비스업종" is left disabled; that column is not encoded here.
#final_df_copy_drop_copy = pd.get_dummies(final_df_copy_drop_copy, columns=["서비스업종"], drop_first=False)
final_df_copy_drop_copy.info()

In [None]:
# Independent copy of the encoded table, used by the analysis below
seoul_final_df=final_df_copy_drop_copy.copy()

In [None]:
# Display the final table
seoul_final_df

In [None]:
# Split the final table into one frame per service category
service_category = seoul_final_df['서비스업종']
restaurant_filtered_data = seoul_final_df.loc[service_category == '요식업']
etc_filtered_data = seoul_final_df.loc[service_category == '기타서비스업']
retail_filtered_data = seoul_final_df.loc[service_category == '소매업']

In [None]:
# Food service: absolute pairwise correlations among the float64/int64/bool columns
restaurant_numeric = restaurant_filtered_data.select_dtypes(["float64", "int64", "bool"])
restaurant_signed_corr = restaurant_numeric.corr()
restaurant_correlation_matrix = restaurant_signed_corr.abs()
restaurant_correlation_matrix

In [None]:
# Food service: absolute correlation of each column with average sales, largest first
restaurant_sales_corr = restaurant_numeric.corr().loc[:, "평균매출"]
restaurant_correlation_mat = restaurant_sales_corr.abs()
restaurant_correlation_mat_sorted = restaurant_correlation_mat.sort_values(ascending=False)
restaurant_correlation_mat_sorted

In [None]:
# Other services: absolute pairwise correlations among the float64/int64/bool columns
etc_numeric = etc_filtered_data.select_dtypes(["float64", "int64", "bool"])
etc_signed_corr = etc_numeric.corr()
etc_correlation_matrix = etc_signed_corr.abs()
etc_correlation_matrix

In [None]:
# Other services: absolute correlation of each column with average sales, largest first
etc_sales_corr = etc_numeric.corr().loc[:, "평균매출"]
etc_correlation_mat = etc_sales_corr.abs()
etc_correlation_mat_sorted = etc_correlation_mat.sort_values(ascending=False)
etc_correlation_mat_sorted

In [None]:
# Retail: absolute pairwise correlations among the float64/int64/bool columns
retail_numeric = retail_filtered_data.select_dtypes(["float64", "int64", "bool"])
retail_signed_corr = retail_numeric.corr()
retail_correlation_matrix = retail_signed_corr.abs()
retail_correlation_matrix

In [None]:
# Retail: absolute correlation of each column with average sales, largest first
retail_sales_corr = retail_numeric.corr().loc[:, "평균매출"]
retail_correlation_mat = retail_sales_corr.abs()
retail_correlation_mat_sorted = retail_correlation_mat.sort_values(ascending=False)
retail_correlation_mat_sorted

각 서비스업종별로 확인해보니, 요식업만 상관관계가 좀 있다.  
heatmap으로 보면,  

In [None]:
# Draw the food-service absolute-correlation matrix as an annotated heatmap
plt.figure(figsize=(18, 12))
sns.heatmap(restaurant_correlation_matrix, annot=True)
plt.show()

In [None]:
# Save the food-service rows as cp949 CSV, without the index
restaurant_filtered_data.to_csv("서울시_요식업_데이터.csv", encoding="cp949", index=False)

## 시각화 및 분석

In [None]:
# Load the food-service CSV and display it
seoul_df=pd.read_csv("서울시_요식업_데이터.csv", encoding="cp949")
seoul_df

In [None]:
# Print the dtype / non-null summary of the reloaded data
seoul_df.info()

In [None]:
# Descriptive statistics, transposed so each column becomes a row
seoul_df.describe().T

In [None]:
# Correlation matrix of the float64/int64/bool columns, then the absolute
# correlation of each column with average sales, largest first
seoul_numeric = seoul_df.select_dtypes(["float64", "int64", "bool"])
seoul_df_corr = seoul_numeric.corr()
seoul_df_corr.loc[:, "평균매출"].abs().sort_values(ascending=False)

In [None]:
# Labels of the first 23 rows of the correlation matrix.
# NOTE(review): these follow the matrix's row order, not correlation strength —
# confirm that 23 is the intended cut-off.
idx = seoul_df_corr.index[:23]
idx

In [None]:
# Keep only the columns named in idx and print their summary
seoul_corr=seoul_df[idx]
seoul_corr.info()

In [None]:
# Descriptive statistics per column, transposed and cast to integers
seoul_corr.describe().T.astype(int)

In [None]:
# Show the selected column labels
seoul_corr.columns

In [None]:
# Hand-written list of column names: weekday sales, population totals and
# district-type indicator columns.
# NOTE(review): '음식점_한식음식점' is not created anywhere in the visible code —
# confirm it exists in the data before indexing with this list.
col=['금요일_매출_금액', '수요일_매출_금액', '목요일_매출_금액', '월요일_매출_금액', '화요일_매출_금액',
       '토요일_매출_금액', '일요일_매출_금액', '총_직장_인구_수', '음식점_한식음식점', '총_유동인구_수',
       '상권_구분_코드_명_발달상권', '상권_구분_코드_명_골목상권', '상권_구분_코드_명_관광특구']