In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.filterwarnings("ignore")

# Load the style sheet first, then apply the notebook-wide overrides in one batch.
plt.style.use("seaborn-v0_8-darkgrid")
plt.rcParams.update(
    {
        "figure.figsize": (12, 6),
        # Font fallback order for sans-serif text in figures.
        "font.sans-serif": ["Arial Unicode MS", "DejaVu Sans"],
        "axes.unicode_minus": False,
    }
)

In [None]:
# Location of the unified hotels/rooms CSV on the original author's machine.
# Kept as the fallback so the notebook behaves exactly as before when no
# override is provided.
_DEFAULT_DATA_PATH = r"C:\Users\Nhuan\OneDrive - ut.edu.vn\Desktop\SEMESTER_7\DATA MINING\hotel-room-classification-pipeline\integration\unified\unified_hotels_rooms.csv"

# Set the HOTEL_ROOMS_CSV environment variable to run the notebook against a
# copy of the file stored elsewhere, without editing this cell.
DATA_PATH = os.environ.get("HOTEL_ROOMS_CSV", _DEFAULT_DATA_PATH)

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# A bare expression is only echoed when it is the last statement of a cell,
# so the shape has to be printed explicitly to appear in the output.
print(df.shape)

# Column dtypes and non-null counts (printed by pandas itself).
df.info()

# Summary statistics for every column, transposed so each column is one row.
df.describe(include="all").T

In [None]:
# Columns that are expected to hold numeric values.
num_cols = [
    "stars", "rating", "review_count",
    "area_m2", "max_occupancy",
    "price_original", "price_final", "discount_pct"
]

# Only work with the columns that actually exist in this extract, and reuse
# the same list for the final describe() so a missing column cannot raise
# KeyError after having been skipped by the coercion loop.
present_num_cols = [col for col in num_cols if col in df.columns]

for col in present_num_cols:
    # Values that cannot be parsed as numbers become NaN instead of raising.
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Missing flag values are replaced with 0; the same "column may be absent"
# guard is applied here as for the numeric columns.
for flag_col in ("free_cancel", "breakfast"):
    if flag_col in df.columns:
        df[flag_col] = df[flag_col].fillna(0)

df[present_num_cols].describe()

In [None]:
# Keep only known, strictly positive final prices.
# `price` is also read by the boxplot cell that follows.
price_known = df["price_final"].dropna()
price = price_known.loc[price_known.gt(0)]

# Histogram of the retained prices, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(price, bins=50, edgecolor="black")
ax.set_title("Phân phối giá phòng (price_final)")
ax.set_xlabel("VND")
ax.set_ylabel("Số lượng")
plt.show()


In [None]:
# Vertical boxplot of the filtered `price` series built in the histogram cell.
fig, ax = plt.subplots()
ax.boxplot(price, vert=True)
ax.set_title("Boxplot giá phòng")
ax.set_ylabel("VND")
plt.show()

# Cell result: (mean, median) of the filtered prices, shown as a tuple.
(price.mean(), price.median())


In [None]:
# Mean, median and count of price_final for every city.
city_stats = df.groupby("city")["price_final"].agg(["mean", "median", "count"])

# Rank cities by how many price values they contribute and keep the first ten.
ranked_by_count = city_stats.sort_values(by="count", ascending=False)
city_price = ranked_by_count.iloc[:10]

city_price


In [None]:
# Horizontal bars comparing mean and median price for the cities in `city_price`.
ax = city_price.loc[:, ["mean", "median"]].plot.barh()
ax.set_title("Giá phòng theo thành phố (Top 10)")
ax.set_xlabel("VND")
plt.show()


In [None]:
# Known ratings restricted to the closed interval [0, 10].
rating_known = df["rating"].dropna()
rating = rating_known[rating_known.between(0, 10)]

# Histogram of the retained ratings.
fig, ax = plt.subplots()
ax.hist(rating, bins=20, edgecolor="black", color="purple")
ax.set_title("Phân phối Rating")
ax.set_xlabel("Rating")
plt.show()


In [None]:
# Known, non-negative review counts.
review_known = df["review_count"].dropna()
review = review_known[review_known.ge(0)]

# Histogram of the retained review counts.
fig, ax = plt.subplots()
ax.hist(review, bins=50, edgecolor="black", color="orange")
ax.set_title("Phân phối Review Count")
ax.set_xlabel("Số review")
plt.show()


In [None]:
# Rows where both price and rating are known.
valid = df[["price_final", "rating"]].dropna()

# Keep strictly positive prices and ratings inside [0, 10]. The lower rating
# bound matches the filter used for the rating histogram, so out-of-range
# negative ratings cannot distort the scatter or the correlation below.
valid = valid[
    (valid["price_final"] > 0)
    & (valid["rating"] >= 0)
    & (valid["rating"] <= 10)
]

plt.scatter(valid["price_final"], valid["rating"], alpha=0.3)
plt.xlabel("Giá phòng")
plt.ylabel("Rating")
plt.title("Giá phòng vs Rating")
plt.show()

# Cell result: correlation between price and rating on the filtered rows
# (Series.corr with its default method).
valid["price_final"].corr(valid["rating"])


In [None]:
# Ten most frequent room classes. Computed once and reused for both the
# printed table and the chart; printed explicitly because a bare expression
# in the middle of a cell is not echoed by Jupyter.
top_room_classes = df["room_class"].value_counts().head(10)
print(top_room_classes)

top_room_classes.plot(kind="barh")
plt.title("Top Room Class")
plt.show()

# Ten most frequent bed types.
df["bed_type"].value_counts().head(10).plot(kind="barh")
plt.title("Top Bed Type")
plt.show()


In [None]:
# Rows with both area and price known, then restricted to strictly positive
# values of each. `area_df` is also read by the scatter cell that follows.
area_known = df[["area_m2", "price_final"]].dropna()
positive_rows = area_known["area_m2"].gt(0) & area_known["price_final"].gt(0)
area_df = area_known.loc[positive_rows]

# Histogram of the retained room areas.
fig, ax = plt.subplots()
ax.hist(area_df["area_m2"], bins=40, edgecolor="black")
ax.set_title("Phân phối diện tích phòng (m²)")
ax.set_xlabel("m²")
plt.show()


In [None]:
# Scatter of room area against final price, using the filtered `area_df`
# built in the area histogram cell.
fig, ax = plt.subplots()
ax.scatter(area_df["area_m2"], area_df["price_final"], alpha=0.3)
ax.set_xlabel("Diện tích (m²)")
ax.set_ylabel("Giá phòng")
ax.set_title("Diện tích vs Giá phòng")
plt.show()


In [None]:
# Series.mode() returns an empty Series when the column has no non-null
# values, in which case indexing with [0] raises. Take the first mode
# positionally and fall back to None when there is none.
room_class_modes = df["room_class"].mode()
most_common_room_class = (
    room_class_modes.iloc[0] if not room_class_modes.empty else None
)

# Headline figures for the whole dataset. The means are computed on the
# unfiltered columns (missing values are skipped by pandas).
summary = {
    "Tổng số bản ghi": len(df),
    "Số khách sạn": df["hotel_name"].nunique(),
    "Số thành phố": df["city"].nunique(),
    "Giá trung bình": df["price_final"].mean(),
    "Rating trung bình": df["rating"].mean(),
    "Diện tích trung bình (m2)": df["area_m2"].mean(),
    "Room class phổ biến nhất": most_common_room_class
}

pd.Series(summary)
