In [None]:
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the parent directory importable. The membership check keeps the cell
# idempotent: re-running it no longer appends the same entry again.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Directory for saved figures (relative to the notebook's working directory).
fig_dir = '../outputs/figures2'
os.makedirs(fig_dir, exist_ok=True)

# Parent of the current working directory; all data paths hang off it.
BASE_DIR = Path().resolve().parent

DATA_DIR = BASE_DIR / "data"
INTERIM_DIR = DATA_DIR / "interim"
PROCESSED_DIR = DATA_DIR / "processed"

# Interim pickle paths are currently disabled in this notebook.
#LOG_INT_PATH = INTERIM_DIR / "log_int.pkl"
#MART_INT_PATH = INTERIM_DIR / "mart_int.pkl"
FINAL_DATASET2_PATH = PROCESSED_DIR / "tps_mart_log2.csv"

# Presumably chosen so Korean labels render in figures (verify the font is
# installed on the target machine). unicode_minus=False makes matplotlib draw
# minus signs with the ASCII hyphen instead of U+2212.
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

# Malformed rows are still skipped, but "warn" reports each one instead of
# dropping it silently, so unexpected data loss is visible in the cell output.
tps_mart_log2 = pd.read_csv(
    FINAL_DATASET2_PATH,
    sep=",",
    encoding='utf-8',
    engine="python",
    on_bad_lines="warn",
)
tps_mart_log2.info()

In [None]:
# Numeric columns examined throughout the rest of the notebook.
num_cols = [
    'INHOME_RATE',
    'TOTAL_USED_DAYS',
    'CH_HH_AVG_MONTH1',
]

# Descriptive statistics (as produced by DataFrame.describe) for those columns.
numeric_frame = tps_mart_log2.loc[:, num_cols]
summary = numeric_frame.describe()
print("수치형 컬럼 요약 통계:\n", summary)

In [None]:
# Quartiles plus min/max for each numeric column; built column by column so
# the resulting frame is indexed by column name with one column per statistic.
numeric_frame = tps_mart_log2[num_cols]
quartile_levels = {
    'Q1(25%)': 0.25,
    'Q2(50%)': 0.50,
    'Q3(75%)': 0.75,
}

stat_columns = {
    label: numeric_frame.quantile(level)
    for label, level in quartile_levels.items()
}
stat_columns['Min'] = numeric_frame.min()
stat_columns['Max'] = numeric_frame.max()

num_stats = pd.DataFrame(stat_columns)
# Transposed for display: statistics as rows, data columns as columns.
print(num_stats.T)


In [None]:
# One horizontal boxplot per numeric column, stacked vertically in one figure.
fig, axes = plt.subplots(
    nrows=len(num_cols),
    ncols=1,
    figsize=(12, 4 * len(num_cols)),
    squeeze=False,  # always a 2-D array of axes, even for a single column
)

for ax, col in zip(axes.ravel(), num_cols):
    sns.boxplot(x=tps_mart_log2[col], orient='h', color='skyblue', ax=ax)
    ax.set_title(f'Boxplot of {col}', fontsize=12)
    ax.set_xlabel('Value')
    ax.set_ylabel('')  # the y axis carries no information here
    # Draw the box outline in black. Older matplotlib kept the box patches in
    # ax.artists; since matplotlib 3.5 they live in ax.patches, so iterating
    # only ax.artists silently did nothing on current versions.
    for box in [*ax.artists, *ax.patches]:
        box.set_edgecolor('black')

fig.tight_layout()
plt.show()


In [None]:
# Columns to compare on a log scale (numpy is imported with the other
# dependencies at the top of the notebook).
num_cols_log = ['INHOME_RATE', 'TOTAL_USED_DAYS', 'CH_HH_AVG_MONTH1']

# Log transform; log1p(x) = log(1 + x) so that zeros map to 0 instead of -inf.
# NOTE(review): assumes the columns contain no values <= -1, for which log1p
# yields -inf/NaN - confirm against the data.
tps_mart_log = np.log1p(tps_mart_log2[num_cols_log])

# All transformed columns side by side as horizontal boxplots on one axes.
fig, ax = plt.subplots(figsize=(15, 6))
sns.boxplot(data=tps_mart_log, orient='h', ax=ax)
ax.set_title('Boxplot of Numeric Columns (log1p scale)')
ax.set_xlabel('Log(Value + 1)')
ax.set_ylabel('Columns')
plt.show()
