In [67]:
from hye_project.my_package.stat_file import test_normality, print_normality, stat_test, outlier

In [65]:
# Debugging aid: import the statistics module under the alias `sf` and print every
# name defined in its namespace, to check what the module actually exposes.
import hye_project.my_package.stat_file as sf
print(dir(sf))

['Any', 'Dict', 'Tuple', 'Union', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'combinations', 'levene', 'np', 'nx', 'outlier', 'pd', 'plt', 'print_normality', 'sch', 'sm', 'sns', 'sp', 'squareform', 'stat_test', 'stats', 'tabulate', 'test_normality']


In [68]:
# -------
# library
# -------

# Standard library
from itertools import combinations

# Typing
from typing import Any, Dict, Tuple, Union

# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Reporting
from tabulate import tabulate

# Statistical analysis
import scipy.stats as stats
from scipy.stats import levene
from scipy.spatial.distance import squareform

# Clustering
import scipy.cluster.hierarchy as sch

# Post-hoc tests
import scikit_posthocs as sp

# Modeling
import statsmodels.api as sm

# Network analysis
import networkx as nx

In [69]:
# ------------------------------------------------
# Load the CSV file produced by the preprocessing step
# ------------------------------------------------
# NOTE(review): machine-specific absolute path; the notebook only runs as-is on this machine.
PREPROCESSED_CSV = '/Users/hyeom/Documents/GitHub/advanced_project/jiwon_project/csv_files/preprocessing_filtered.csv'
df = pd.read_csv(PREPROCESSED_CSV)

In [54]:
# ------------------------------------------------------
# Check the normality and distribution of the price column
# ------------------------------------------------------
# 1. Normality test results for the price column
price_values = df['price']
price_normality = test_normality(price_values)

| 검정항목                | 통계량        | p-값 / 임계값   | 판정   |
|-------------------------|---------------|-----------------|--------|
| Skew (왜도)             | 24.421        | -               | -      |
| Excess Kurtosis (첨도)  | 893.551       | -               | -      |
| Shapiro-Wilk            | 0.238         | p=0.000         | Reject |
| D’Agostino K²           | 53264.039     | p=0.000         | Reject |
| Jarque–Bera             | 744028276.333 | p=0.000         | Reject |
| Lilliefors KS           | 0.331         | p=0.001         | Reject |
| Anderson–Darling @15.0% | 3866.117      | crit=0.576      | Reject |
| Anderson–Darling @10.0% | 3866.117      | crit=0.656      | Reject |
| Anderson–Darling @5.0%  | 3866.117      | crit=0.787      | Reject |
| Anderson–Darling @2.5%  | 3866.117      | crit=0.918      | Reject |
| Anderson–Darling @1.0%  | 3866.117      | crit=1.092      | Reject |
| Practical Normal        | -             | -               | False  |


In [72]:

# -------
# library
# -------

# Standard library
from itertools import combinations

# Typing
from typing import Any, Dict, Tuple, Union

# Data handling
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Reporting
from tabulate import tabulate

# Statistical analysis
import scipy.stats as stats
from scipy.stats import levene
from scipy.spatial.distance import squareform

# Clustering
import scipy.cluster.hierarchy as sch

# Post-hoc tests
import scikit_posthocs as sp

# Modeling
import statsmodels.api as sm

# Network analysis
import networkx as nx

# ------------------------------------------------
# Load the CSV file produced by the preprocessing step
# ------------------------------------------------
# NOTE(review): machine-specific absolute path; the notebook only runs as-is on this machine.
PREPROCESSED_CSV = '/Users/hyeom/Documents/GitHub/advanced_project/jiwon_project/csv_files/preprocessing_filtered.csv'
df = pd.read_csv(PREPROCESSED_CSV)

# ------------------------------
# Find a reference column for defining outliers
# 2) Association between room_structure_type and price
#    (the tests below are run on the 'log_price' column)
# ------------------------------
X2 = 'room_structure_type'
y = 'log_price'
alpha = 0.05
# NOTE(review): max_shapiro_n is not referenced anywhere else in the visible cells —
# confirm whether it is still needed.
max_shapiro_n = 5000

print('\n=== room structure type 과 price 연관성 가설 검정 ===')
# 1. Check normality and homogeneity of variance before running Kruskal-Wallis
# NOTE(review): the recorded output of this cell stops right after the header above with
#   AttributeError: type object 'stat_test' has no attribute 'decide'
# so this call fails against the `stat_test` class loaded in the kernel, and nothing
# below it was executed in that run. Presumably the method was renamed/removed or the
# kernel holds a stale import of the module — confirm against stat_file.py.
X2_stat_test = stat_test.decide(df, X2, y, alpha=alpha, verbose=True)

# 2. Run the Kruskal-Wallis test; post-hoc test is Dunn with Holm correction
X2_res = stat_test.kruskal_dunn(df, X2, y, alpha=alpha, adjust='holm', verbose=True)

# 3. Visualize the test results (only when a post-hoc p-value matrix was returned)
if X2_res['pvals_matrix'] is not None:
    stat_test.p_heatmap(
        X2_res['pvals_matrix'],
        alpha=alpha,
        clip_upper=0.05,
        annot_mode="none",
        cmap="rocket_r",
        figsize=(8, 6),
        text_color="black",
    )
    plt.show()

# 4. Visualize outliers grouped by room structure type
outlier.boxplot(df, X2, y, factor=1.5, figsize=(8,6), tablefmt='github', verbose=True)

# Author's interpretation of the results:
# -> At least one pair of room structure types has statistically different price distributions.
# -> So room structure type can be used to judge price outliers,
# -> but the box plots show that room structure type does not fully explain the price outliers.

# ------------------------------------------------------------------------------------------
# Find a reference column for defining outliers
# 3) Propose new categories from the pairwise p-values between room_structure_type groups
#    (hierarchical clustering on a distance derived from the p-values)
# ------------------------------------------------------------------------------------------
pmat = X2_res['pvals_matrix']          # Dunn post-hoc p-value DataFrame
if pmat is None:
    # The heatmap step treats a missing matrix as a possible outcome; fail with a clear
    # message here instead of an AttributeError on `None.index`.
    raise ValueError("X2_res['pvals_matrix'] is None; cannot cluster room_structure_type groups.")
index = pmat.index

P_FLOOR = 1e-10    # lower clamp so that -log10(p) stays finite when p underflows to 0
N_CLUSTERS = 5     # maximum number of flat clusters requested from fcluster

# 1. Clamp the p-values into [P_FLOOR, 1]
P = np.clip(pmat.values, P_FLOOR, 1.0)

# 2. Distance = -log10(p): p = 1 -> 0, and the smaller the p-value the larger the distance
D = -np.log10(P)
np.fill_diagonal(D, 0)

# 3. Average-linkage clustering, cut into at most N_CLUSTERS flat clusters
Z      = sch.linkage(squareform(D), method='average')
labels = sch.fcluster(Z, t=N_CLUSTERS, criterion='maxclust')

# 4. Map every room_structure_type to its cluster label
struct_grp_map = dict(zip(index, labels))
df['room_new_type'] = df['room_structure_type'].map(struct_grp_map)

print("\n=== 군집화로 도출한 새로운 그룹 카테고리 ===")
for k in sorted(set(labels)):
    members = [name for name, label in struct_grp_map.items() if label == k]
    print(f"Group {k}: {members}")

# 5. Per-cluster sample size and price statistics (median, Q1, Q3), sorted by median.
#    Note: this summarises the raw 'price' column, not 'log_price'.
grp_stat = (
    df.groupby('room_new_type')['price']
      .agg(n='size', median='median', q1=lambda s: s.quantile(.25), q3=lambda s: s.quantile(.75))
      .sort_values('median')
)
# Render the table explicitly: an assignment in the middle of a cell shows nothing, and
# the manual regrouping in step 6 is decided by reading these numbers.
display(grp_stat)

# 6. Rearrange the groups after reviewing each cluster's median and quartiles
# - townhouse (cluster 5) is merged into an existing group because of its price level.
#   NOTE(review): the original note said it joins group 4, but the code assigns group 3 —
#   confirm which one is intended (the code's value is kept here).
# - barn, kezhan, ranch and dome have too few listings to be clustered, so they are
#   placed by hand into the price group that fits them best.
# NOTE(review): the integers below are the labels fcluster produced for this dataset;
# if the data or the clustering changes, re-check them against the printed groups.
manual_groups = {
    'townhouse': 3,
    'barn': 1,
    'kezhan': 4,
    'ranch': 4,
    'dome': 4,
}
for structure_type, group_id in manual_groups.items():
    df.loc[df.room_structure_type == structure_type, 'room_new_type'] = group_id

# 7. Rename the groups
# - Groups were arranged by similar price level, so they are renamed to
#   High, Upper-Mid, Mid and Low-Mid.
# Every row must have a group before the integer cast; name the offending structure
# types instead of letting astype(int) fail on NaN with an opaque message.
unassigned = df.loc[df['room_new_type'].isna(), 'room_structure_type'].unique()
if len(unassigned) > 0:
    raise ValueError(
        f"room_structure_type values without a group: {sorted(map(str, unassigned))}"
    )
df['room_new_type'] = df['room_new_type'].astype(int)

group_name_map = {4: "Low-Mid", 3: "Mid", 1: "Upper-Mid",  2: "High"}
# A group id missing from the mapping would silently turn into NaN in .map(); surface it.
unnamed_groups = sorted(set(df['room_new_type']) - set(group_name_map))
if unnamed_groups:
    raise ValueError(f"Group ids without a name in group_name_map: {unnamed_groups}")
df['room_new_type'] = df['room_new_type'].map(group_name_map)



=== room structure type 과 price 연관성 가설 검정 ===


AttributeError: type object 'stat_test' has no attribute 'decide'

In [None]:
# Author's notes on the results:
# -> In the Kruskal test results there was no group pair that was not significant.
# -> Checking the box plots afterwards shows that these groups explain the outliers well.

print("=== room_new_type 별 price 이상치 ===")

# Per-group outlier statistics. Per the original notes, apply() yields a MultiIndex
# Series; unstack() spreads it so outlier_count and outlier_ratio become columns, and
# reset_index() turns the group key back into a regular column.
# (The earlier duplicate computation of stats_type, whose result was overwritten, is dropped.)
stats_type = (
    df.groupby("room_new_type")['price']
      .apply(outlier.stats)
      .unstack()
      .reset_index()
)
type_outlier_count = stats_type['outlier_count'].sum()
# Use the actual row count as the denominator instead of the hard-coded 22308.
type_outlier_ratio = type_outlier_count / len(df)

print(f"전체 이상치 개수: {type_outlier_count}")
print(f"전체 이상치 비율: {type_outlier_ratio:.4f}")

# Price summary per group with outliers included; display() is needed because a bare
# expression in the middle of a cell is not rendered.
display(df.groupby('room_new_type')['price'].describe())

# Price summary per group via outlier.describe_without
# (rows: room_new_type, columns: describe items), shown for comparison.
stats_clean = (
    df.groupby('room_new_type')['price']
      .apply(outlier.describe_without)
      .unstack()
)
display(stats_clean)

# 1) Row-wise mask computed per group with outlier.is_not (factor=1.5)
mask = (
    df.groupby('room_new_type')['price']
      .transform(lambda s: outlier.is_not(s, factor=1.5))
)

# 2) Keep only the rows selected by the mask, with a fresh 0..n-1 index
outlier_removed_df = df[mask].reset_index(drop=True)

# 3) Sanity check: row count before vs. after removal
print(f"Original rows: {len(df)}, Without outliers: {len(outlier_removed_df)}")

In [None]:
# Persist the outlier-free frame; index=False keeps the row index out of the file.
# NOTE(review): machine-specific absolute path.
OUTLIER_REMOVED_CSV = '/Users/hyeom/Documents/GitHub/advanced_project/Airbnb_project_15/outlier_removed.csv'
outlier_removed_df.to_csv(OUTLIER_REMOVED_CSV, index=False)

In [77]:
# Location of the outlier-removed CSV.
# NOTE(review): this is a relative path, so it resolves against the kernel's working
# directory, while the export step writes to an absolute path — confirm both refer to
# the same file, or put the absolute path here.
csv_path = 'outlier_removed.csv'

# Read the CSV
df = pd.read_csv(
    csv_path,
    header=0,        # use the first line as the column names
    index_col='id',  # use the 'id' column as the row index
    encoding='utf-8-sig'
)

# Preview the loaded data.
# (A disabled snippet that dropped 'Unnamed*' columns and a discarded df.head(2)
# call were removed; neither had any effect on the result.)
df.head()

Unnamed: 0_level_0,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,price,estimated_occupancy_l365d,estimated_revenue_l365d,availability_60,availability_365,instant_bookable,...,host_acceptance_rate_score,host_location_ny,host_location_boolean,log_price,room_new_type,median,lower,upper,n,is_outlier
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
36121,Prospect Heights,Brooklyn,40.67376,-73.96611,200.0,0,0.0,57,362,0,...,4,1,1,5.303305,Mid,139.0,-145.6592,423.6592,16610,False
36647,East Harlem,Manhattan,40.792454,-73.940742,82.0,0,0.0,0,204,0,...,4,1,1,4.418841,Upper-Mid,193.0,-165.0479,551.0479,1722,False
38663,Boerum Hill,Brooklyn,40.68442,-73.98068,765.0,0,0.0,49,326,0,...,2,1,1,6.641182,Low-Mid,99.0,-92.2554,290.2554,3130,True
38833,Harlem,Manhattan,40.818058,-73.946671,139.0,255,35445.0,18,25,0,...,4,1,1,4.941642,Low-Mid,99.0,-92.2554,290.2554,3130,False
39282,Williamsburg,Brooklyn,40.710651,-73.950874,130.0,154,20020.0,22,38,0,...,4,1,1,4.875197,Mid,139.0,-145.6592,423.6592,16610,False
