In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# 1. 기존 데이터셋 학습 → 모델 만들기

In [2]:
hotel=pd.read_csv('Hotel Reservations_original.csv')

categorical = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'arrival_month', 'arrival_year',  'market_segment_type', 'repeated_guest','booking_status']
for i in categorical:
  hotel[i] = hotel[i].astype('category')

X = hotel.drop(['booking_status'], axis=1)
y = hotel['booking_status']

# One hot encoding
X_ohe = pd.get_dummies(X)

# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_ohe, y, test_size = 0.2)

# Random Forest 모델
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [3]:
# accuracy
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# cross_val_score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf_model, X_ohe, y, scoring='roc_auc', cv=5)
print(f"Scores: {scores.mean():.4f}")

Accuracy: 0.9063
Scores: 0.9534


# 2. 새로운 데이터 입력받기 → class 예측

In [6]:
## 데이터 로드 함수
def load_data():
  hotel = pd.read_csv("Hotel Reservations_original.csv")
  # 범주형 컬럼을 'category' 타입으로 변환
  categorical = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved',
                'arrival_year', 'arrival_month', 'arrival_date', 'market_segment_type', 'repeated_guest', 'booking_status']
  for i in categorical:
      hotel[i] = hotel[i].astype('category')
  return hotel

data = load_data().iloc[:, :-1]

# 범주형 컬럼 조회
def get_categorical_columns(data):
    return data.select_dtypes(include=['object', 'bool', 'category']).columns

# 수치형 컬럼 조회
def get_numeric_columns(data):
    return data.select_dtypes(include=['number']).columns

# 새로운 데이터 입력 받기
def input_new_data_with_columns(data):
    new_data = []
    columns = data.columns.tolist()  # 컬럼명 리스트 생성

    for attr in columns:
        if attr in get_categorical_columns(data):  # 범주형 컬럼 처리
            unique_values = data[attr].unique()
            print(f"{attr}의 선택지: {', '.join(map(str, unique_values))}")
            value = input(f"{attr}값을 입력하세요: ")
            new_data.append(value)
        elif attr in get_numeric_columns(data):  # 수치형 컬럼 처리
            min_value = data[attr].min()  # 최솟값
            max_value = data[attr].max()  # 최댓값
            print(f"{attr}의 범위: {min_value} ~ {max_value}")
            value = input(f"{attr}값을 입력하세요: ")
            new_data.append(value)

    return new_data

# 사용자로부터 새로운 데이터 입력 받기
new_data = input_new_data_with_columns(data)

# 입력 받은 데이터와 X.columns 순서가 맞도록 정리
new_data_dict = dict(zip(data.columns, new_data))

ordered_new_data = []
for feature in data.columns:
    value = new_data_dict[feature]
    if feature in get_numeric_columns(data):
        ordered_new_data.append(float(value))  # 수치형 데이터는 float로 변환
    else:
        ordered_new_data.append(str(value))  # 범주형 데이터는 문자열로 변환

print(f"New data input: {ordered_new_data}")

no_of_adults의 범위: 0 ~ 4


no_of_adults값을 입력하세요:  2


no_of_children의 범위: 0 ~ 10


no_of_children값을 입력하세요:  2


no_of_weekend_nights의 범위: 0 ~ 7


no_of_weekend_nights값을 입력하세요:  1


no_of_week_nights의 범위: 0 ~ 17


no_of_week_nights값을 입력하세요:  2


type_of_meal_plan의 선택지: Meal Plan 1, Not Selected, Meal Plan 2, Meal Plan 3


type_of_meal_plan값을 입력하세요:  Meal Plan 1


required_car_parking_space의 선택지: 0, 1


required_car_parking_space값을 입력하세요:  1


room_type_reserved의 선택지: Room_Type 1, Room_Type 4, Room_Type 2, Room_Type 6, Room_Type 5, Room_Type 7, Room_Type 3


room_type_reserved값을 입력하세요:  Room_Type 4


lead_time의 범위: 0 ~ 443


lead_time값을 입력하세요:  31


arrival_year의 선택지: 2017, 2018


arrival_year값을 입력하세요:  2018


arrival_month의 선택지: 10, 11, 2, 5, 4, 9, 12, 7, 6, 8, 3, 1


arrival_month값을 입력하세요:  7


arrival_date의 선택지: 2, 6, 28, 20, 11, 13, 15, 26, 18, 30, 5, 10, 4, 25, 22, 21, 19, 17, 7, 9, 27, 1, 29, 16, 3, 24, 14, 31, 23, 8, 12


arrival_date값을 입력하세요:  31


market_segment_type의 선택지: Offline, Online, Corporate, Aviation, Complementary


market_segment_type값을 입력하세요:  Online


repeated_guest의 선택지: 0, 1


repeated_guest값을 입력하세요:  0


no_of_previous_cancellations의 범위: 0 ~ 13


no_of_previous_cancellations값을 입력하세요:  0


no_of_previous_bookings_not_canceled의 범위: 0 ~ 58


no_of_previous_bookings_not_canceled값을 입력하세요:  0


avg_price_per_room의 범위: 0.0 ~ 540.0


avg_price_per_room값을 입력하세요:  400


no_of_special_requests의 범위: 0 ~ 5


no_of_special_requests값을 입력하세요:  1


New data input: [2.0, 2.0, 1.0, 2.0, 'Meal Plan 1', '1', 'Room_Type 4', 31.0, '2018', '7', '31', 'Online', '0', 0.0, 0.0, 400.0, 1.0]


In [9]:
# 입력받은 데이터를 DataFrame으로 변환
new_data_df = pd.DataFrame([ordered_new_data], columns=data.columns)
new_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 17 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   no_of_adults                          1 non-null      float64
 1   no_of_children                        1 non-null      float64
 2   no_of_weekend_nights                  1 non-null      float64
 3   no_of_week_nights                     1 non-null      float64
 4   type_of_meal_plan                     1 non-null      object 
 5   required_car_parking_space            1 non-null      object 
 6   room_type_reserved                    1 non-null      object 
 7   lead_time                             1 non-null      float64
 8   arrival_year                          1 non-null      object 
 9   arrival_month                         1 non-null      object 
 10  arrival_date                          1 non-null      object 
 11  market_segment_type    

In [11]:
categorical_new = ['type_of_meal_plan', 'required_car_parking_space', 'room_type_reserved', 'arrival_month', 'arrival_year',  'market_segment_type', 'repeated_guest']
for i in categorical_new:
  new_data_df[i] = new_data_df[i].astype('category')

# One hot encoding
X_ohe_new = pd.get_dummies(new_data_df)
X_ohe_new

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,lead_time,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,type_of_meal_plan_Meal Plan 1,required_car_parking_space_1,room_type_reserved_Room_Type 4,arrival_year_2018,arrival_month_7,arrival_date_31,market_segment_type_Online,repeated_guest_0
0,2.0,2.0,1.0,2.0,31.0,0.0,0.0,400.0,1.0,True,True,True,True,True,True,True,True


In [13]:
# 훈련 데이터의 열에 맞도록 조정
X_ohe_new_adj = X_ohe_new.reindex(columns=X_ohe.columns)
X_ohe_new_adj

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,lead_time,arrival_date,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,...,arrival_month_10,arrival_month_11,arrival_month_12,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,repeated_guest_0,repeated_guest_1
0,2.0,2.0,1.0,2.0,31.0,,0.0,0.0,400.0,1.0,...,,,,,,,,True,True,


In [15]:
# 예측
prediction = rf_model.predict(X_ohe_new_adj)
print(f"예측된 booking_status: {prediction[0]}")

예측된 booking_status: Not_Canceled
