In [18]:
import numpy as np
import pandas as pd
import spark
from math import log
from pyspark.sql import SparkSession
from pyspark.ml.feature import QuantileDiscretizer

In [19]:
# Start (or reuse) the Spark session this notebook runs on
spark = (
    SparkSession.builder
    .appName("NaiveBayes")
    .getOrCreate()
)

# Grab the underlying SparkContext and limit its log output to errors
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [20]:
# Load the bookings CSV: the first line holds the column names and the column
# types are inferred from the data
df = spark.read.csv(
    'hotel_bookings_cleaned.csv',
    header=True,
    inferSchema=True,
)

In [21]:
# Keep the target column in a frame of its own; the bare name on the last line
# makes the notebook display the frame
y_train = df.select(["is_canceled"])
y_train

DataFrame[is_canceled: int]

In [22]:
# Remove the target column
df = df.drop('is_canceled')

In [23]:
# Select the numeric columns of type int, float
df_numeric = df.select([column for column in df.columns if str(df.schema[column].dataType) in ['IntegerType()', 'DoubleType()']])

# Display the count of rows and columns of df
print((df.count(), len(df.columns)))

# Convert to list
columns_numeric = df_numeric.columns
print(columns_numeric)

(118732, 27)
['lead_time', 'arrival_date_week_number', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']


In [24]:
# Quantile-bucket every numeric column (numBuckets=10); each input column gets
# an output column named "<column>_discretized"
discretizer = QuantileDiscretizer(
    numBuckets=10,
    inputCols=columns_numeric,
    outputCols=[f"{name}_discretized" for name in columns_numeric],
)

In [25]:
# Fit the discretizer on df, then apply the fitted model to the same frame to
# add the *_discretized columns
discretizer_model = discretizer.fit(df)
df_discretized = discretizer_model.transform(df)

In [27]:
# Number of columns after the transform (original columns plus the new
# *_discretized ones)
column_total = len(df_discretized.columns)
print(column_total)

42


In [28]:
# Keep every column except the original numeric ones; their *_discretized
# counterparts stay in the frame
numeric_originals = set(columns_numeric)
df_discretized = df_discretized.select(
    [name for name in df_discretized.columns if name not in numeric_originals]
)

In [29]:
# Preview the first 20 rows with long cell values truncated (the defaults of show())
df_discretized.show(n=20, truncate=True)

+------------+------------+----+-------+--------------+--------------------+------------------+------------------+------------+-------------+------------------+-----------------------+---------------------+------------------------------------+-----------------------------------+--------------------------------+------------------+--------------------+------------------+-----------------------------+----------------------------------+------------------------------------------+---------------------------+--------------------------------+---------------+---------------------------------------+-------------------------------------+
|       hotel|arrival_date|meal|country|market_segment|distribution_channel|reserved_room_type|assigned_room_type|deposit_type|customer_type|reservation_status|reservation_status_date|lead_time_discretized|arrival_date_week_number_discretized|stays_in_weekend_nights_discretized|stays_in_week_nights_discretized|adults_discretized|children_discretized|babies_discre

## Naive Bayes Classifier

In [30]:
label_column = "is_canceled"

# Every column of the discretized frame except the target is a feature. The
# explicit filter keeps the label from being counted as a feature of itself
# whenever it is present in the frame.
# NOTE(review): reservation_status / reservation_status_date look like they may
# be recorded after the outcome is known - confirm they belong in the features.
features_columns = [name for name in df_discretized.columns if name != label_column]
print(features_columns)

['hotel', 'arrival_date', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status', 'reservation_status_date', 'lead_time_discretized', 'arrival_date_week_number_discretized', 'stays_in_weekend_nights_discretized', 'stays_in_week_nights_discretized', 'adults_discretized', 'children_discretized', 'babies_discretized', 'is_repeated_guest_discretized', 'previous_cancellations_discretized', 'previous_bookings_not_canceled_discretized', 'booking_changes_discretized', 'days_in_waiting_list_discretized', 'adr_discretized', 'required_car_parking_spaces_discretized', 'total_of_special_requests_discretized']


#### Prior Probabilities

In [31]:
# Output shape: [(c0, 1), (c1, 1), ...]
def mapper(rows):
    """Emit one (label, 1) pair for every row of a partition.

    rows: iterable of records indexable by column name; the label is read
    from the module-level `label_column`.
    Returns a list with one (label value, 1) tuple per input row.
    """
    return [(row[label_column], 1) for row in rows]

# Output shape: [(c0, 5)] for one label, e.g. [(c0, 5), (c1, 10)] overall
def reducer(row):
    """Add up all the counts grouped under one label.

    row: a (label, iterable of counts) pair.
    Returns a single-element list [(label, total)] so it can be used with flatMap.
    """
    label, counts = row
    total = 0
    for count in counts:
        total += count
    return [(label, total)]

In [None]:
# Only the label is needed to count classes, so run the map/reduce over the
# label-only frame y_train instead of the full feature frame: the mapper reads
# row[label_column], which y_train is guaranteed to contain, and far less data
# is shipped through the Python workers.
prior = (
    y_train.rdd
    .mapPartitions(mapper)
    .groupByKey()
    .flatMap(reducer)
    .collect()
)

# Normalize the per-class counts into probabilities
total = sum(count for _, count in prior)
prior_dict = {label: count / total for label, count in prior}
print("Prior probability: ", prior_dict)

#### Likelihood Probabilities

In [33]:
# Per-partition counts are accumulated in a nested mapping shaped like:
# counts = {
#     c0: {
#             f1: { v1: 0, v2: 0},
#             f2: { v1: 0, v2: 0}
#         },
#     c1: {
#             f1: { v1: 0, v2: 0},
#             f2: { v1: 0, v2: 0}
#         }
# }

def mapper(split):
    """Count (label, feature, value) occurrences within one partition.

    split: iterable of records indexable by column name. The label column is
    taken from the module-level `label_column` and the features from
    `features_columns`.
    Returns a list of (label, (feature, value, count)) tuples, one per distinct
    combination seen in this partition.
    """
    counts = {}

    for row in split:
        per_feature = counts.setdefault(row[label_column], {})

        for feature in features_columns:
            value = row[feature]
            per_value = per_feature.setdefault(feature, {})
            per_value[value] = per_value.get(value, 0) + 1

    # Flatten the nested mapping into key/value records keyed by label
    return [
        (label, (feature, value, count))
        for label, per_feature in counts.items()
        for feature, per_value in per_feature.items()
        for value, count in per_value.items()
    ]

# Input per label, e.g.:
#   <c0, [(f1, v1, 5), (f1, v2, 10), ...]>
#   <c1, [(f1, v1, 2), (f1, v2, 3), ...]>

def reducer1(row):
    """Merge the partial (feature, value, count) records of one label.

    row: a (label, iterable of (feature, value, count)) pair, where the same
    (feature, value) may appear several times.
    Returns a list of (label, (feature, value, summed count)) tuples, grouped by
    feature in first-seen order.
    """
    label, partials = row
    totals = {}

    for feature, value, count in partials:
        per_value = totals.setdefault(feature, {})
        per_value[value] = per_value.get(value, 0) + count

    return [
        (label, (feature, value, summed))
        for feature, per_value in totals.items()
        for value, summed in per_value.items()
    ]

# Output records, e.g.:
#   <c0, (f1, v1, 10, 15)>
#   <c0, (f1, v2, 5, 15)>
#   <c1, (f1, v1, 2, 5)>
#   <c1, (f1, v2, 3, 5)>

def reducer2(row):
    """Attach the per-feature total to every (feature, value, count) of a label.

    row: a (label, iterable of (feature, value, count)) pair.
    Returns a list of (label, (feature, value, count, feature_total)) tuples in
    input order, where feature_total is the sum of `count` over all values of
    that feature for this label.
    """
    label, tups = row

    # The records are walked twice (totals first, then output), so materialize
    # them once; a one-shot iterator would otherwise be exhausted by the first
    # pass and yield an empty result.
    records = list(tups)

    feature_totals = {}
    for feature, _value, count in records:
        feature_totals[feature] = feature_totals.get(feature, 0) + count

    return [
        (label, (feature, value, count, feature_totals[feature]))
        for feature, value, count in records
    ]

In [None]:
# mapper counts (label, feature, value) per partition, reducer1 merges those
# counts per label, and reducer2 attaches the per-feature total for the label.
# NOTE(review): mapper reads row[label_column] from every row, so
# df_discretized must still carry that column when this cell runs.
likelihood = (
    df_discretized.rdd
    .mapPartitions(mapper)
    .groupByKey()
    .flatMap(reducer1)
    .groupByKey()
    .flatMap(reducer2)
    .collect()
)

# There is one record per (label, feature, value); show the size and a small
# sample rather than dumping the whole list into the notebook output.
print(len(likelihood))
print(likelihood[:10])