In [None]:
# 기본적인 부분
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
# Use the "Malgun Gothic" font family for all matplotlib text so the Korean
# labels used in this notebook can render (assumes the font is installed on
# the machine running the notebook — TODO confirm).
mpl.rc("font", family="Malgun Gothic")
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"]=False

# 데이터 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# 학습 알고리즘
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet

from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report
from scipy.special import expit, softmax

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the estimated-sales-per-commercial-district CSV (cp949-encoded) and
# show its (rows, columns) shape as the cell output.
seoul_sales = pd.read_csv(
    "../data/서울시 상권분석서비스(추정매출-상권).csv",
    encoding="cp949",
)
seoul_sales.shape

In [None]:
# Preview the first rows of the raw sales table.
seoul_sales.head()

In [None]:
# Work on a copy so the frame loaded from disk stays untouched.
seoul_sales_copy=seoul_sales.copy()

In [None]:
# Keep only the rows whose service-category code starts with "CS1".
# NOTE(review): "CS1" is presumably the restaurant / food-service code family,
# judging by the variable name — confirm against the dataset's code book.
#
# na=False makes rows with a missing code count as "no match"; without it the
# mask would contain NaN and boolean indexing would raise.
# .copy() yields an independent frame rather than a slice of seoul_sales_copy.
is_restaurant = seoul_sales_copy["서비스_업종_코드"].str.startswith("CS1", na=False)
seoul_restaurant_sales = seoul_sales_copy[is_restaurant].copy()
seoul_restaurant_sales.head()

In [None]:
# Load the street-level (floating) population per commercial district CSV
# (cp949-encoded) and show its (rows, columns) shape.
seoul_street_people = pd.read_csv(
    "../data/서울시 상권분석서비스(길단위인구-상권).csv",
    encoding="cp949",
)
seoul_street_people.shape

In [None]:
# Preview the first rows of the street-population table.
seoul_street_people.head()

In [None]:
# Load the working population per commercial district CSV (cp949-encoded)
# and show its (rows, columns) shape.
seoul_working_people = pd.read_csv(
    "../data/서울시 상권분석서비스(직장인구-상권).csv",
    encoding="cp949",
)
seoul_working_people.shape

In [None]:
# Preview the first rows of the working-population table.
seoul_working_people.head()

In [None]:
# List the distinct service-category names present in the filtered sales rows.
seoul_restaurant_sales["서비스_업종_코드_명"].unique()

In [None]:
# Distinct service-category names among the filtered sales rows.
seoul_restaurant_list = seoul_restaurant_sales["서비스_업종_코드_명"].unique()

# Sales are aggregated on, and both population tables are joined on,
# district name + year-quarter code.
# NOTE(review): the join uses the district *name*, not the district code; if
# names are not unique per quarter in the population tables the left joins
# will duplicate rows — verify.
merge_keys = ["상권_코드_명", "기준_년분기_코드"]

# Monthly total plus the seven day-of-week sales amounts that get averaged.
sales_columns = [
    "당월_매출_금액",
    "월요일_매출_금액",
    "화요일_매출_금액",
    "수요일_매출_금액",
    "목요일_매출_금액",
    "금요일_매출_금액",
    "토요일_매출_금액",
    "일요일_매출_금액",
]

# Maps category name -> merged DataFrame for that category.
results_with_working_population = {}

for restaurant in seoul_restaurant_list:
    category_rows = seoul_restaurant_sales[seoul_restaurant_sales["서비스_업종_코드_명"] == restaurant]

    # Mean of every sales column per (district name, quarter); the monthly
    # total is relabelled "평균매출".
    average_sales = (
        category_rows
        .groupby(merge_keys)[sales_columns]
        .mean()
        .reset_index()
        .rename(columns={"당월_매출_금액": "평균매출"})
    )

    # Left-join the street population first, then the working population, so
    # every aggregated sales row is kept even when no population row matches.
    results_with_working_population[restaurant] = (
        average_sales
        .merge(seoul_street_people, on=merge_keys, how="left")
        .merge(seoul_working_people, on=merge_keys, how="left")
    )

In [None]:
# Display the whole mapping of category name -> merged DataFrame.
results_with_working_population

In [None]:
# Exploratory: view of the (category name, DataFrame) pairs in the mapping.
items=results_with_working_population.items()
items

In [None]:
# Exploratory: view of the category names used as dictionary keys.
keys=results_with_working_population.keys()
keys

In [None]:
# Exploratory: view of the per-category DataFrames stored as dictionary values.
values=results_with_working_population.values()
values

In [None]:
# Tag every per-category frame with its category name in a new "음식점" column
# (a regular column, not the index) and stack all of them into one long table
# with a fresh 0..n-1 index.
# .assign() returns a new frame, so the frames stored in
# results_with_working_population are not modified by this cell.
seoul_restaurants = pd.concat(
    [
        df.assign(**{"음식점": restaurant})
        for restaurant, df in results_with_working_population.items()
    ],
    ignore_index=True,
)
seoul_restaurants

In [None]:
# Column names, non-null counts and dtypes of the combined table.
seoul_restaurants.info()

> 1차 컬럼 정리

In [None]:
# Columns removed in the first clean-up pass, grouped by kind.

# Duplicate district identifier columns carrying the "_x" / "_y" merge suffixes.
merge_suffix_columns = [
    "상권_구분_코드_y", "상권_구분_코드_명_y", "상권_코드_y",
    "상권_구분_코드_x", "상권_코드_x",
]

# Floating population broken down by age band.
floating_age_columns = [
    "연령대_10_유동인구_수", "연령대_20_유동인구_수", "연령대_30_유동인구_수",
    "연령대_40_유동인구_수", "연령대_50_유동인구_수", "연령대_60_이상_유동인구_수",
]

# Floating population broken down by time-of-day slot.
floating_time_columns = [
    "시간대_00_06_유동인구_수", "시간대_06_11_유동인구_수", "시간대_11_14_유동인구_수",
    "시간대_14_17_유동인구_수", "시간대_17_21_유동인구_수", "시간대_21_24_유동인구_수",
]

# Working population broken down by age band (all / male / female).
working_age_columns = [
    "연령대_10_직장_인구_수", "연령대_20_직장_인구_수", "연령대_30_직장_인구_수",
    "연령대_40_직장_인구_수", "연령대_50_직장_인구_수", "연령대_60_이상_직장_인구_수",
]
working_male_age_columns = [
    "남성연령대_10_직장_인구_수", "남성연령대_20_직장_인구_수", "남성연령대_30_직장_인구_수",
    "남성연령대_40_직장_인구_수", "남성연령대_50_직장_인구_수", "남성연령대_60_이상_직장_인구_수",
]
working_female_age_columns = [
    "여성연령대_10_직장_인구_수", "여성연령대_20_직장_인구_수", "여성연령대_30_직장_인구_수",
    "여성연령대_40_직장_인구_수", "여성연령대_50_직장_인구_수", "여성연령대_60_이상_직장_인구_수",
]

columns_to_drop = [
    *merge_suffix_columns,
    *floating_age_columns,
    *floating_time_columns,
    *working_age_columns,
    *working_male_age_columns,
    *working_female_age_columns,
]

# drop() raises KeyError if any listed column is absent from the table.
seoul_restaurants_drop = seoul_restaurants.drop(columns=columns_to_drop)
seoul_restaurants_drop.info()

In [None]:
# Strip the "_x" merge suffix from the column labels.
# The pattern is anchored to the end of the label and regex=True is passed
# explicitly, so an "_x" occurring elsewhere in a name is left alone and the
# result does not depend on the installed pandas version's default for `regex`.
seoul_restaurants_drop.columns = seoul_restaurants_drop.columns.str.replace(r"_x$", "", regex=True)
seoul_restaurants_drop.info()

In [None]:
# Rows that have a missing value in at least one column.
has_missing_value = seoul_restaurants_drop.isna().any(axis="columns")
missing_rows = seoul_restaurants_drop.loc[has_missing_value]
missing_rows

결측치를 확인해 보니, 유동인구·직장인구 데이터가 매칭되지 않는 상권/분기가 있어 left merge 과정에서 결측치가 생겼음  
해당 행은 제거하기  

In [None]:
# Start the cleaned table from a copy so seoul_restaurants_drop is preserved.
final_seoul_restaurants=seoul_restaurants_drop.copy()

In [None]:
# Drop every row that still contains at least one missing value.
final_seoul_restaurants=final_seoul_restaurants.dropna()

In [None]:
# Check non-null counts and dtypes after removing rows with missing values.
final_seoul_restaurants.info()

In [None]:
# List the remaining column labels before the second clean-up pass.
final_seoul_restaurants.columns

> 2차 컬럼 정리

In [None]:
# Day-of-week floating-population columns removed in the second pass.
weekday_floating_columns = [
    "월요일_유동인구_수", "화요일_유동인구_수", "수요일_유동인구_수", "목요일_유동인구_수",
    "금요일_유동인구_수", "토요일_유동인구_수", "일요일_유동인구_수",
]

# Columns kept for analysis, in display order: identifiers, sales figures,
# floating-population totals, working-population totals.
selected_columns = [
    "기준_년분기_코드", "상권_구분_코드_명", "상권_코드_명", "음식점", "평균매출",
    "월요일_매출_금액", "화요일_매출_금액", "수요일_매출_금액", "목요일_매출_금액",
    "금요일_매출_금액", "토요일_매출_금액", "일요일_매출_금액",
    "총_유동인구_수", "남성_유동인구_수", "여성_유동인구_수",
    "총_직장_인구_수", "남성_직장_인구_수", "여성_직장_인구_수",
]

# Drop first (raises KeyError if a listed column is absent), then select and
# reorder the remaining columns.
final_seoul_restaurants_drop = (
    final_seoul_restaurants
    .drop(columns=weekday_floating_columns)
    .loc[:, selected_columns]
)

In [None]:
# Display the table after the second column clean-up.
final_seoul_restaurants_drop

In [None]:
# Column names, non-null counts and dtypes after the second clean-up.
final_seoul_restaurants_drop.info()

음식점, 상권_구분_코드_명 -> 원핫인코딩

In [None]:
# One-hot encode the restaurant category and the district-type name in a
# single call. Dummy columns are appended after the untouched columns in the
# order the source columns are listed, and every category level is kept
# (drop_first=False).
# Note: the result overwrites the input name, so this cell cannot be re-run
# without first re-running the cell that builds final_seoul_restaurants_drop.
final_seoul_restaurants_drop = pd.get_dummies(
    final_seoul_restaurants_drop,
    columns=["음식점", "상권_구분_코드_명"],
    drop_first=False,
)
final_seoul_restaurants_drop.info()

In [None]:
# Snapshot the encoded table under the name used by the analysis cells below.
restaurants_final=final_seoul_restaurants_drop.copy()

In [None]:
# Disabled: would cast the boolean dummy columns to int. Left commented out;
# the cells below select the bool columns as they are.
#bool_columns = restaurants_final.select_dtypes(include=["bool"]).columns
#restaurants_final[bool_columns] = restaurants_final[bool_columns].astype(int)

In [None]:
# Keep only float64 / int64 / bool columns as input for the correlation matrix.
restaurants_final_numeric = restaurants_final.select_dtypes(include=["float64", "int64", "bool"])

In [None]:
# Pairwise correlation between all selected columns (DataFrame.corr defaults).
corr_matrix = restaurants_final_numeric.corr()
corr_matrix

In [None]:
# Annotated heatmap of the full correlation matrix on an 18x12 inch figure.
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Correlation of every feature with the target "평균매출", strongest positive
# first. corr_matrix already holds the pairwise correlations, so its
# "평균매출" column is read directly; calling .corr() on it again would
# correlate the correlation coefficients themselves.
correlation_matrix = corr_matrix["평균매출"]
correlation_matrix_sorted = correlation_matrix.sort_values(ascending=False)
correlation_matrix_sorted

In [None]:
# Column names, non-null counts and dtypes of the final analysis table.
restaurants_final.info()

---

In [None]:
# Histogram with a KDE overlay of the average-sales column.
sns.histplot(restaurants_final["평균매출"], kde=True)

In [None]:
# One histogram (with KDE overlay) per day-of-week sales column, laid out on a
# 2-row x 4-column grid (8 axes for 7 series).
fig, axes = plt.subplots(2, 4, figsize=(18, 10))
panels = axes.ravel()  # flatten the 2-D axes array so it pairs up with the column list

days_of_week = ["월요일_매출_금액", "화요일_매출_금액", "수요일_매출_금액",
                "목요일_매출_금액", "금요일_매출_금액", "토요일_매출_금액",
                "일요일_매출_금액"]

for ax, day in zip(panels, days_of_week):
    sns.histplot(restaurants_final[day], kde=True, ax=ax)
    ax.set_title(f"Distribution of {day}")
    ax.set_xlabel(f"{day}")
    ax.set_ylabel("Frequency")

# Hide the grid cells that have no series, so no empty frame is rendered.
for ax in panels[len(days_of_week):]:
    ax.set_visible(False)

# Keep titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()

In [None]:
# Box plot of average sales split by the "발달상권" district-type dummy column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=restaurants_final, x="상권_구분_코드_명_발달상권", y="평균매출", ax=ax)
ax.set_title("Comparison of 평균매출 by 상권 구분 (발달상권)")
ax.set_xlabel("상권 구분 (발달상권)")
ax.set_ylabel("평균매출")
plt.show()

In [None]:
# Box plot of average sales split by the "한식음식점" restaurant-type dummy column.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=restaurants_final, x="음식점_한식음식점", y="평균매출", ax=ax)
ax.set_title("Comparison of 평균매출 by 음식점 유형 (한식음식점)")
ax.set_xlabel("음식점 유형 (한식음식점)")
ax.set_ylabel("평균매출")
plt.show()

In [None]:
# Side-by-side distributions (histogram + KDE overlay) of average sales,
# Monday sales and Friday sales. Every panel, including the first one for
# "평균매출", gets its own histplot call.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

panel_columns = ["평균매출", "월요일_매출_금액", "금요일_매출_금액"]

for ax, column in zip(axes, panel_columns):
    sns.histplot(restaurants_final[column], kde=True, ax=ax)
    ax.set_title(f"Distribution of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")

# Tidy the layout so labels of neighbouring panels do not overlap.
plt.tight_layout()
plt.show()

X=서비스업종명, 유동인구수  
Y=당월 매출금액  
훈련 -> 예측  

업종마다의 금액이 나오니까  

X=상권구분, 서비스업종명, 유동인구수  
Y=당월 매출금액  