In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [78]:
df = pd.read_csv('clean_data.csv')

In [79]:
df.head()

Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,days_in_waiting_list,customer_type,required_car_parking_spaces,total_of_special_requests,arrival_date_quarter,total_orang,days,lead_time_special_requests_ratio,cancellation_ratio,guest_night_interaction
0,0,Resort Hotel,0,13,2015,July,27,1,0,1,...,0,Transient,0,0,Quarter_3,1,1,13.0,0.0,1
1,1,Resort Hotel,0,14,2015,July,27,1,0,2,...,0,Transient,0,1,Quarter_3,2,2,7.0,0.0,4
2,2,Resort Hotel,0,9,2015,July,27,1,0,2,...,0,Transient,0,1,Quarter_3,2,2,4.5,0.0,4
3,3,Resort Hotel,1,85,2015,July,27,1,0,3,...,0,Transient,0,1,Quarter_3,2,3,42.5,0.0,6
4,4,Resort Hotel,1,75,2015,July,27,1,0,3,...,0,Transient,0,0,Quarter_3,2,3,75.0,0.0,6


In [80]:
import numpy as np

class NaiveBayes:

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)
            

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = posterior + prior
            posteriors.append(posterior)

        # return class with the highest posterior
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [81]:
selected_features = [
    'is_canceled',
    'hotel',
    'stays_in_weekend_nights',
    'adults',
    'children',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'deposit_type',
    'total_orang'
]

# selected_features = [
#     'is_canceled',
#     'hotel',
#     'stays_in_weekend_nights',
#     'adults',
#     'children',
#     'market_segment',
#     'total_of_special_requests',
#     'customer_type',
#     'is_repeated_guest',
#     'previous_cancellations',
#     'previous_bookings_not_canceled',
#     'reserved_room_type',
#     'deposit_type',
#     'total_orang'
# ]

df = df[selected_features]

In [82]:
df.head()

Unnamed: 0,is_canceled,hotel,stays_in_weekend_nights,adults,children,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,deposit_type,total_orang
0,0,Resort Hotel,0,1,0,0,0,0,No Deposit,1
1,0,Resort Hotel,0,2,0,0,0,0,No Deposit,2
2,0,Resort Hotel,0,2,0,0,0,0,No Deposit,2
3,1,Resort Hotel,0,2,0,0,0,0,No Deposit,2
4,1,Resort Hotel,0,2,0,0,0,0,No Deposit,2


In [83]:
df = pd.get_dummies(df, columns=['deposit_type'])
df = pd.get_dummies(df, columns=['hotel'])
# df = pd.get_dummies(df, columns=['market_segment'])
# df = pd.get_dummies(df, columns=['customer_type'])
# df = pd.get_dummies(df, columns=['reserved_room_type'])
df.head()

Unnamed: 0,is_canceled,stays_in_weekend_nights,adults,children,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,total_orang,deposit_type_No Deposit,deposit_type_Non Refund,deposit_type_Refundable,hotel_City Hotel,hotel_Resort Hotel
0,0,0,1,0,0,0,0,1,True,False,False,False,True
1,0,0,2,0,0,0,0,2,True,False,False,False,True
2,0,0,2,0,0,0,0,2,True,False,False,False,True
3,1,0,2,0,0,0,0,2,True,False,False,False,True
4,1,0,2,0,0,0,0,2,True,False,False,False,True


In [84]:
df = df.astype(int)
df.dtypes

is_canceled                       int32
stays_in_weekend_nights           int32
adults                            int32
children                          int32
is_repeated_guest                 int32
previous_cancellations            int32
previous_bookings_not_canceled    int32
total_orang                       int32
deposit_type_No Deposit           int32
deposit_type_Non Refund           int32
deposit_type_Refundable           int32
hotel_City Hotel                  int32
hotel_Resort Hotel                int32
dtype: object

In [85]:
df.corr()['is_canceled'].sort_values(ascending=False)

is_canceled                       1.000000
deposit_type_Non Refund           0.138105
total_orang                       0.077741
adults                            0.062883
hotel_City Hotel                  0.061073
previous_cancellations            0.059429
children                          0.053068
stays_in_weekend_nights           0.032081
deposit_type_Refundable           0.013675
previous_bookings_not_canceled   -0.040323
is_repeated_guest                -0.051505
hotel_Resort Hotel               -0.061073
deposit_type_No Deposit          -0.138136
Name: is_canceled, dtype: float64

In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Contoh data frame
# Misalkan df memiliki kolom 'feature1', 'feature2', ..., 'featureN' dan 'target' untuk y.
# df = pd.DataFrame({...})

# Pisahkan fitur (X) dan target (y)
X, y = df.drop(columns=['is_canceled']).values, df['is_canceled'].values

# Membagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Membuat model Naive Bayes
model = GaussianNB()

# Melatih model
model.fit(X_train, y_train)

# Membuat prediksi pada data uji
y_pred = model.predict(X_test)

# Mengukur akurasi
accuracy = accuracy_score(y_test, y_pred)
print(f"Akurasi model Naive Bayes: {accuracy:.2f}")

Akurasi model Naive Bayes: 0.72


In [87]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

X, y = df.drop(columns=['is_canceled']).values, df['is_canceled'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# print("X_train shape:", X_train.shape)
# print("X_test shape:", X_test.shape)
# print("y_train shape:", y_train.shape)
# print("y_test shape:", y_test.shape)

nb = NaiveBayes()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

X_train shape: (55931, 12)
X_test shape: (13983, 12)
y_train shape: (55931,)
y_test shape: (13983,)


  posterior = np.sum(np.log(self._pdf(idx, x)))


Accuracy: 0.7193735249946364

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.97      0.83      9905
           1       0.61      0.10      0.18      4078

    accuracy                           0.72     13983
   macro avg       0.67      0.54      0.50     13983
weighted avg       0.69      0.72      0.64     13983

