In [None]:
# 기본적인 부분
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
# Use a font with Korean glyphs so Korean axis labels/titles render in figures.
# NOTE(review): "Malgun Gothic" is presumably only installed on Windows — confirm the
# font exists on the machine that runs this notebook.
mpl.rc("font", family="Malgun Gothic")
# Draw negative tick labels with the ASCII hyphen instead of the Unicode minus sign.
plt.rcParams["axes.unicode_minus"]=False

# 데이터 전처리
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

# 학습 알고리즘
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet

from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error, mean_absolute_error
from sklearn.metrics import classification_report
from scipy.special import expit, softmax

from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

from sklearn.ensemble import RandomForestClassifier

# 서울시 요식업 매출 예측

## 데이터 불러오기 및 결합

### 서울시 상권 추정매출  

In [None]:
# Estimated-sales table per commercial district; the export is CP949-encoded.
seoul_sales = pd.read_csv(
    "../data/서울시 상권분석서비스(추정매출-상권).csv",
    encoding="cp949",
)
seoul_sales.shape

In [None]:
seoul_sales.info()

In [None]:
seoul_sales

In [None]:
seoul_sales_copy=seoul_sales.copy()

In [None]:
# Keep only the restaurant categories: service-category codes that start with "CS1".
is_restaurant = seoul_sales_copy["서비스_업종_코드"].str.startswith("CS1")
seoul_restaurant_sales = seoul_sales_copy[is_restaurant]
seoul_restaurant_sales.head()

### 서울시 상권 유동인구

In [None]:
# Street-level (floating) population table per commercial district; CP949-encoded.
seoul_street_people = pd.read_csv(
    "../data/서울시 상권분석서비스(길단위인구-상권).csv",
    encoding="cp949",
)
seoul_street_people.shape

In [None]:
seoul_street_people.info()

In [None]:
seoul_street_people.head()

### 서울시 상권 직장인구

In [None]:
# Working-population table per commercial district; CP949-encoded.
seoul_working_people = pd.read_csv(
    "../data/서울시 상권분석서비스(직장인구-상권).csv",
    encoding="cp949",
)
seoul_working_people.shape

In [None]:
seoul_working_people.info()

In [None]:
seoul_working_people.head()

### 데이터 병합
> 서울시 추정매출: 서비스 업종 별로 나누기 -> 상권별로 서비스 업종의 매출관련 정보  
> 서울시 유동인구 및 직장인구: 상권별로 인구관련 정보  

In [None]:
seoul_restaurant_sales["서비스_업종_코드_명"].unique()

In [None]:
seoul_restaurant_list = seoul_restaurant_sales["서비스_업종_코드_명"].unique()

# Columns averaged per (district name, quarter) and the keys shared by all three tables.
sales_columns = ["당월_매출_금액", "월요일_매출_금액", "화요일_매출_금액", "수요일_매출_금액",
                 "목요일_매출_금액", "금요일_매출_금액", "토요일_매출_금액", "일요일_매출_금액"]
merge_keys = ["상권_코드_명", "기준_년분기_코드"]

# One merged frame per restaurant category (Korean food, Chinese food, ...).
results_with_working_population = {}

for restaurant in seoul_restaurant_list:
    is_current_category = seoul_restaurant_sales["서비스_업종_코드_명"] == restaurant

    # Mean sales per district and quarter; the monthly total becomes the target "평균매출".
    mean_sales = (
        seoul_restaurant_sales[is_current_category]
        .groupby(merge_keys)[sales_columns]
        .mean()
        .reset_index()
        .rename(columns={"당월_매출_금액": "평균매출"})
    )

    # Left-join the floating population first, then the working population,
    # so every sales row is kept even when a population record is missing.
    results_with_working_population[restaurant] = (
        mean_sales
        .merge(seoul_street_people, on=merge_keys, how="left")
        .merge(seoul_working_people, on=merge_keys, how="left")
    )

In [None]:
results_with_working_population

In [None]:
keys=results_with_working_population.keys()
keys

In [None]:
values=results_with_working_population.values()
values

In [None]:
items=results_with_working_population.items()
items

In [None]:
# Tag each per-category frame with its category name in a new "음식점" column,
# then stack all categories into one frame with a fresh integer index.
for category_name in results_with_working_population:
    results_with_working_population[category_name]["음식점"] = category_name

seoul_restaurants = pd.concat(list(results_with_working_population.values()), ignore_index=True)
seoul_restaurants

### 1차 컬럼 정리

In [None]:
seoul_restaurants.columns

In [None]:
# Collapse the six age bands into three groups (10s+20s, 30s+40s, 50s+60s and over),
# first for the floating population and then for the working population.
age_groups = {"초년": ("10", "20"), "중년": ("30", "40"), "노년": ("50", "60_이상")}

for population in ("유동인구", "직장_인구"):
    for group, (younger, older) in age_groups.items():
        seoul_restaurants[f"{group}_{population}_수"] = (
            seoul_restaurants[f"연령대_{younger}_{population}_수"]
            + seoul_restaurants[f"연령대_{older}_{population}_수"]
        )

columns_to_drop = [
    # duplicated district columns produced by the two merges
    "상권_구분_코드_y", "상권_구분_코드_명_y", "상권_코드_y",
    "상권_구분_코드_x", "상권_코드_x",
    # floating population by weekday
    "월요일_유동인구_수", "화요일_유동인구_수", "수요일_유동인구_수", "목요일_유동인구_수",
    "금요일_유동인구_수", "토요일_유동인구_수", "일요일_유동인구_수",
    # raw age bands (replaced by the three groups above)
    "연령대_10_유동인구_수", "연령대_20_유동인구_수", "연령대_30_유동인구_수",
    "연령대_40_유동인구_수", "연령대_50_유동인구_수", "연령대_60_이상_유동인구_수",
    "연령대_10_직장_인구_수", "연령대_20_직장_인구_수", "연령대_30_직장_인구_수",
    "연령대_40_직장_인구_수", "연령대_50_직장_인구_수", "연령대_60_이상_직장_인구_수",
    # floating population by time slot
    "시간대_00_06_유동인구_수", "시간대_06_11_유동인구_수", "시간대_11_14_유동인구_수",
    "시간대_14_17_유동인구_수", "시간대_17_21_유동인구_수", "시간대_21_24_유동인구_수",
    # working population by sex and age band
    "남성연령대_10_직장_인구_수", "남성연령대_20_직장_인구_수", "남성연령대_30_직장_인구_수",
    "남성연령대_40_직장_인구_수", "남성연령대_50_직장_인구_수", "남성연령대_60_이상_직장_인구_수",
    "여성연령대_10_직장_인구_수", "여성연령대_20_직장_인구_수", "여성연령대_30_직장_인구_수",
    "여성연령대_40_직장_인구_수", "여성연령대_50_직장_인구_수", "여성연령대_60_이상_직장_인구_수",
]

# Drop the columns above and remove the "_x" text the merge added to the surviving names.
seoul_restaurants_drop = (
    seoul_restaurants
    .drop(columns=columns_to_drop)
    .rename(columns=lambda name: name.replace("_x", ""))
)
seoul_restaurants_drop.info()

In [None]:
# Rows with at least one missing value in any column.
has_missing = seoul_restaurants_drop.isna().any(axis=1)
missing_rows = seoul_restaurants_drop[has_missing]
missing_rows

결측치들 확인해보니 유동인구, 직장인구 등이 없는 부분이 있어서 결측치 존재했음  
제거하기  

In [None]:
final_seoul_restaurants=seoul_restaurants_drop.copy()

In [None]:
# Remove every row that has at least one missing value, then re-check the schema.
final_seoul_restaurants = final_seoul_restaurants.dropna(axis=0, how="any")
final_seoul_restaurants.info()

> 2차 column 정리

In [None]:
final_seoul_restaurants.columns

In [None]:
# Columns kept for modelling, in display order: identifiers, target, weekday sales,
# floating-population totals/groups, working-population totals/groups.
selected_columns = [
    "기준_년분기_코드", "상권_구분_코드_명", "상권_코드_명", "음식점", "평균매출",
    "월요일_매출_금액", "화요일_매출_금액", "수요일_매출_금액", "목요일_매출_금액",
    "금요일_매출_금액", "토요일_매출_금액", "일요일_매출_금액",
    "총_유동인구_수", "남성_유동인구_수", "여성_유동인구_수",
    "초년_유동인구_수", "중년_유동인구_수", "노년_유동인구_수",
    "총_직장_인구_수", "남성_직장_인구_수", "여성_직장_인구_수",
    "초년_직장_인구_수", "중년_직장_인구_수", "노년_직장_인구_수",
]
final_seoul_restaurants = final_seoul_restaurants[selected_columns]

In [None]:
final_seoul_restaurants

In [None]:
final_seoul_restaurants.info()

음식점, 상권_구분_코드_명 -> 원핫인코딩

In [None]:
final_seoul_restaurants.columns

In [None]:
# One-hot encode both categorical columns in a single pass; the dummy columns are appended
# in the order listed (restaurant category first, then district type), keeping every level.
final_seoul_restaurants = pd.get_dummies(
    final_seoul_restaurants,
    columns=["음식점", "상권_구분_코드_명"],
    drop_first=False,
)
final_seoul_restaurants.info()

In [None]:
final_seoul_restaurants_copy=final_seoul_restaurants.copy()

In [None]:
# Restrict to numeric and boolean (one-hot) columns before computing pairwise correlations.
numeric_dtypes = ["float64", "int64", "bool"]
restaurants_final_numeric = final_seoul_restaurants_copy.select_dtypes(include=numeric_dtypes)
corr_matrix = restaurants_final_numeric.corr()
corr_matrix

In [None]:
# Annotated heatmap of the full correlation matrix.
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Rank every feature by the absolute value of its correlation with the target "평균매출".
# `corr_matrix` already is the correlation matrix, so the target column is read from it
# directly; calling .corr() on it a second time would correlate the columns of the
# correlation matrix with each other, which is a different quantity.
correlation_matrix = corr_matrix["평균매출"].abs()
correlation_matrix_sorted = correlation_matrix.sort_values(ascending=False)
correlation_matrix_sorted

In [None]:
final_seoul_restaurants_copy.to_csv("../data/seoul_sales_data.csv", index=False)

## 시각화 및 분석

In [None]:
seoul=pd.read_csv("../data/seoul_sales_data.csv")
seoul

In [None]:
seoul.describe().T

In [None]:
correlation_matrix_sorted

In [None]:
seoul.columns

In [None]:
seoul_copy=seoul.copy()

In [None]:
# Identifier columns and the sex/age breakdowns are removed; only the totals are kept.
columns_to_drop = [
    '기준_년분기_코드', '상권_코드_명',
    '남성_유동인구_수', '여성_유동인구_수',
    '초년_유동인구_수', '중년_유동인구_수', '노년_유동인구_수',
    '남성_직장_인구_수', '여성_직장_인구_수',
    '초년_직장_인구_수', '중년_직장_인구_수', '노년_직장_인구_수',
]

# Derive from `seoul` (the frame `seoul_copy` was copied from) instead of from `seoul_copy`
# itself, so re-running this cell does not raise KeyError on already-dropped columns.
seoul_copy = seoul.drop(columns=columns_to_drop)

In [None]:
seoul_copy.info()

In [None]:
# Rank the remaining columns against the target "평균매출", highest first.
# NOTE(review): `corr_mat` is already the correlation matrix of `seoul_copy`; the extra
# `.corr()` on the next line correlates the columns of that matrix with each other, so
# this ranking is not the plain |correlation| with "평균매출".
# `corr_mat["평균매출"].abs()` looks like the intended expression — confirm, and re-check
# the hard-coded `col` feature list afterwards, because the top-14 selection is taken
# from this ranking.
corr_mat=seoul_copy.corr()
correlation_mat = corr_mat.corr()["평균매출"].abs()
correlation_mat_sorted = correlation_mat.sort_values(ascending=False)
correlation_mat_sorted

In [None]:
# Names of the 14 highest-ranked columns (the target itself is among them).
idx = correlation_mat_sorted.iloc[:14].index
idx

In [None]:
seoul_corr=seoul[idx]
seoul_corr.info()

In [None]:
seoul_corr.describe().T.astype(int)

In [None]:
seoul_corr.columns

In [None]:
# Feature columns fed to the models: weekday sales, population totals and selected dummies.
col = [
    '금요일_매출_금액',
    '수요일_매출_금액',
    '목요일_매출_금액',
    '월요일_매출_금액',
    '화요일_매출_금액',
    '토요일_매출_금액',
    '일요일_매출_금액',
    '총_직장_인구_수',
    '음식점_한식음식점',
    '총_유동인구_수',
    '상권_구분_코드_명_발달상권',
    '상권_구분_코드_명_골목상권',
    '상권_구분_코드_명_관광특구',
]

In [None]:
sns.pairplot(data=seoul_corr, x_vars=col, y_vars="평균매출")

In [None]:
# Overlay one scatter per district-type dummy against the target on the same axes.
district_dummies = ["상권_구분_코드_명_발달상권", "상권_구분_코드_명_골목상권", "상권_구분_코드_명_관광특구"]
for district_dummy in district_dummies:
    sns.scatterplot(data=seoul_corr, x=district_dummy, y="평균매출", label=district_dummy)
plt.xlabel(" ")
plt.ylabel("평균매출")

## 모델 훈련 및 평가

In [None]:
# Feature matrix and target vector.
X = seoul_corr[col]
Y = seoul_corr["평균매출"]
for part in (X, Y):
    print(part.shape, type(part))

In [None]:
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, random_state=42)

In [None]:
# Baseline: ordinary least squares on the raw, unscaled features.
lr = LinearRegression().fit(X_train, Y_train)
lr

In [None]:
# R^2 on the training split and on the held-out split.
for label, features, target in (("학습: ", X_train, Y_train), ("일반화: ", X_test, Y_test)):
    print(label, lr.score(features, target))

> Polynomial

In [None]:
# Degree-2 polynomial expansion (the default degree), fitted on the training split only.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(X_train)
poly

In [None]:
# Apply the fitted expansion to both splits.
X_train_poly, X_test_poly = (poly.transform(split) for split in (X_train, X_test))
X_test_poly.shape

In [None]:
# Standardise using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train_poly)

X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train_poly, X_test_poly))

In [None]:
# Linear regression on the scaled polynomial features.
lr = LinearRegression().fit(X_train_scaled, Y_train)
lr

In [None]:
# R^2 on the training split and on the held-out split.
for label, features, target in (("학습: ", X_train_scaled, Y_train), ("일반화: ", X_test_scaled, Y_test)):
    print(label, lr.score(features, target))

In [None]:
print(lr.coef_, lr.intercept_)

In [None]:
# Same split as before, now with a degree-5 polynomial expansion.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

poly = PolynomialFeatures(degree=5, include_bias=False).fit(X_train)
X_train_poly, X_test_poly = (poly.transform(split) for split in (X_train, X_test))
(X_train_poly.shape, X_test_poly.shape)

In [None]:
# Standardise the degree-5 features with statistics from the training split only.
scaler = StandardScaler().fit(X_train_poly)

X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train_poly, X_test_poly))

In [None]:
# Unregularised linear regression on the scaled degree-5 features.
lr = LinearRegression().fit(X_train_scaled, Y_train)
lr

In [None]:
# R^2 on the training split and on the held-out split.
for label, features, target in (("학습: ", X_train_scaled, Y_train), ("일반화: ", X_test_scaled, Y_test)):
    print(label, lr.score(features, target))

> 릿지

In [None]:
# Ridge regression with the default regularisation strength (alpha=1.0).
# `Ridge` is already imported in the notebook's import cell, so the duplicate
# mid-notebook import was removed.
ridg = Ridge()
ridg.fit(X_train_scaled, Y_train)

print("학습: ", ridg.score(X_train_scaled, Y_train))
print("일반화: ", ridg.score(X_test_scaled, Y_test))

하이퍼파라미터

In [None]:
# Sweep the Ridge regularisation strength and report train / held-out R^2 for each value.
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]

for alpha in alpha_list:
    ridg = Ridge(alpha=alpha).fit(X_train_scaled, Y_train)
    train_score = ridg.score(X_train_scaled, Y_train)
    test_score = ridg.score(X_test_scaled, Y_test)
    print(f"{alpha} 의 경우")
    print("학습: ", train_score)
    print("일반화: ", test_score, "\n")

> 라쏘

In [None]:
# Lasso regression with the default regularisation strength (alpha=1.0).
lasso = Lasso().fit(X_train_scaled, Y_train)

for label, features, target in (("학습: ", X_train_scaled, Y_train), ("일반화: ", X_test_scaled, Y_test)):
    print(label, lasso.score(features, target))

하이퍼파라미터

In [None]:
# Sweep the Lasso regularisation strength with the default tolerance.
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]

for alpha in alpha_list:
    lasso = Lasso(alpha=alpha).fit(X_train_scaled, Y_train)
    train_score = lasso.score(X_train_scaled, Y_train)
    test_score = lasso.score(X_test_scaled, Y_test)
    print(f"{alpha} 의 경우")
    print("학습: ", train_score)
    print("일반화: ", test_score, "\n")

tol 값은 알고리즘이 반복을 멈출 기준을 설정하는 숫자로,  
작은 tol(0.0001)은 더 정확한 결과를 얻을 수 있지만, 반복을 더 많이 해야 하므로 시간이 오래 걸린다.  
큰 tol(0.01)은 더 빠르게 멈추지만, 결과가 덜 정확할 수 있다.  
따라서 tol은 정확도와 속도를 조절하는 역할을 한다.  

default값이 0.0001로 매우 작아서, 최대 반복 횟수 안에 모델이 수렴하지 못하는 경우가 있기 때문에
tol 값을 조금 높여서 지정해주면 된다.  

In [None]:
# Same Lasso sweep, with a looser stopping tolerance (tol=0.01).
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]

for alpha in alpha_list:
    lasso = Lasso(alpha=alpha, tol=0.01).fit(X_train_scaled, Y_train)
    train_score = lasso.score(X_train_scaled, Y_train)
    test_score = lasso.score(X_test_scaled, Y_test)
    print(f"{alpha} 의 경우")
    print("학습: ", train_score)
    print("일반화: ", test_score, "\n")

> 엘라스틱

In [None]:
# ElasticNet with its default hyper-parameters.
# `ElasticNet` is already imported in the notebook's import cell, so the duplicate
# mid-notebook import was removed.
elastic_net = ElasticNet()
elastic_net.fit(X_train_scaled, Y_train)

print("학습: ", elastic_net.score(X_train_scaled, Y_train))
print("일반화: ", elastic_net.score(X_test_scaled, Y_test))

In [None]:
# Sweep the ElasticNet regularisation strength with a looser stopping tolerance (tol=0.01).
alpha_list = [0.001, 0.01, 0.1, 1, 10, 100]

for alpha in alpha_list:
    elastic_net = ElasticNet(alpha=alpha, tol=0.01).fit(X_train_scaled, Y_train)
    train_score = elastic_net.score(X_train_scaled, Y_train)
    test_score = elastic_net.score(X_test_scaled, Y_test)
    print(f"{alpha} 의 경우")
    print("학습: ", train_score)
    print("일반화: ", test_score, "\n")

---

In [None]:
# Distribution of the target. `restaurants_final` is never defined in this notebook
# (NameError on a fresh kernel); `seoul`, the reloaded modelling table, has this column.
sns.histplot(seoul["평균매출"], kde=True)

In [None]:
# Sales distribution for each weekday on a 2x4 grid.
# `restaurants_final` is never defined in this notebook, so the plots read from `seoul`,
# the reloaded modelling table that holds these columns.
fig, axes = plt.subplots(2, 4, figsize=(18, 10))
flat_axes = axes.ravel()

days_of_week = ["월요일_매출_금액", "화요일_매출_금액", "수요일_매출_금액",
                "목요일_매출_금액", "금요일_매출_금액", "토요일_매출_금액",
                "일요일_매출_금액"]

# Walking the flattened axes avoids row/column arithmetic and, unlike the previous
# `col = i % 4`, does not rebind `col`, which holds the modelling feature list.
for ax, day in zip(flat_axes, days_of_week):
    sns.histplot(seoul[day], kde=True, ax=ax)
    ax.set_title(f"Distribution of {day}")
    ax.set_xlabel(f"{day}")
    ax.set_ylabel("Frequency")

# Seven weekdays on eight panels: hide the unused one instead of showing an empty frame.
for unused_ax in flat_axes[len(days_of_week):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# Target distribution split by the "developed district" dummy.
# `restaurants_final` is never defined in this notebook; `seoul` holds these columns.
plt.figure(figsize=(10, 6))
sns.boxplot(x="상권_구분_코드_명_발달상권", y="평균매출", data=seoul)
plt.title("Comparison of 평균매출 by 상권 구분 (발달상권)")
plt.xlabel("상권 구분 (발달상권)")
plt.ylabel("평균매출")
plt.show()

In [None]:
# Target distribution split by the "Korean restaurant" dummy.
# `restaurants_final` is never defined in this notebook; `seoul` holds these columns.
plt.figure(figsize=(12, 8))
sns.boxplot(x="음식점_한식음식점", y="평균매출", data=seoul)
plt.title("Comparison of 평균매출 by 음식점 유형 (한식음식점)")
plt.xlabel("음식점 유형 (한식음식점)")
plt.ylabel("평균매출")
plt.show()

In [None]:
# Side-by-side distributions of the target, Monday sales and Friday sales.
# Two fixes: the first panel used to get a title and labels but no plot, and
# `restaurants_final` is never defined in this notebook, so the data now comes from `seoul`.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

panel_columns = ["평균매출", "월요일_매출_금액", "금요일_매출_금액"]

for ax, column in zip(axes, panel_columns):
    sns.histplot(seoul[column], kde=True, ax=ax)
    ax.set_title(f"Distribution of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")

# Tidy the layout so titles and labels do not overlap.
plt.tight_layout()
plt.show()

In [None]:
seoul_restaurants.columns

In [None]:
seoul_restaurants.info()

In [None]:
# Remove only the duplicated district columns that came from the second merge.
columns_to_drop = ["상권_구분_코드_y", "상권_구분_코드_명_y", "상권_코드_y"]
seoul_restaurants_drop = seoul_restaurants.drop(columns_to_drop, axis=1)
seoul_restaurants_drop.info()

In [None]:
# Remove the "_x" text the merge added to the column names.
seoul_restaurants_drop.columns = [name.replace("_x", "") for name in seoul_restaurants_drop.columns]
seoul_restaurants_drop.info()

In [None]:
# Signed correlation of every numeric/boolean column with the target, highest first.
numeric_dtypes = ["float64", "int64", "bool"]
restaurants_final_numeric = seoul_restaurants_drop.select_dtypes(include=numeric_dtypes)

# Show every row of the ranking instead of pandas' truncated view.
pd.set_option('display.max_rows', None)

correlation_matrix = restaurants_final_numeric.corr()["평균매출"]
correlation_matrix_sorted = correlation_matrix.sort_values(ascending=False)
correlation_matrix_sorted

X=서비스업종명, 유동인구수,   
Y=당월 매출금액  
훈련 -> 예측  

업종마다의 금액이 나오니까  

X=상권구분, 서비스업종명, 유동인구수  
Y=당월 매출금액  

In [None]:
# Planned model inputs — a note, not executable code. The original lines were a
# SyntaxError (a bare `...` followed directly by a string literal) and, had they run,
# would have overwritten the X / Y used for training above.
#   X = '상권코드명', '서비스업종코드명', '총 유동인구', '요일_월', ..., '요일_일'
#   Y = '당월 매출금액'