In [19]:
import numpy as np
import pandas as pd
import os
from pyspark.sql import SparkSession
from math import log
from pyspark.ml.feature import QuantileDiscretizer

In [20]:
spark_home = "D:/Programs/Spark/spark_unzipped"
os.environ["SPARK_HOME"] = spark_home

# Add Spark bin and executors to PATH
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "bin")
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "sbin")

# Add Spark Python libraries to PYTHONPATH
os.environ["PYTHONPATH"] = os.path.join(spark_home, "python") + os.pathsep + os.environ.get("PYTHONPATH", "")
os.environ["PYTHONPATH"] += os.pathsep + os.path.join(spark_home, "python", "lib")

# Add PySpark to the system path
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "python", "lib", "pyspark.zip")
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "python", "lib", "py4j-0.10.9-src.zip")
os.environ['PYSPARK_PYTHON'] = 'python'

In [21]:
spark = SparkSession.builder.appName("HotelBooking").getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [22]:
# Read the data from the CSV file
df = spark.read.option("header", True).option("inferSchema", True).csv('hotel_bookings_cleaned.csv')

In [23]:
y_train = df.select("is_canceled")
y_train

DataFrame[is_canceled: int]

In [24]:
# Remove the target column
df = df.drop('is_canceled')

In [25]:
# Select the numeric columns of type int, float
df_numeric = df.select([column for column in df.columns if str(df.schema[column].dataType) in ['IntegerType()', 'DoubleType()']])

# Display the count of rows and columns of df
print((df.count(), len(df.columns)))

# Convert to list
columns_numeric = df_numeric.columns
print(columns_numeric)

(118732, 27)
['lead_time', 'arrival_date_week_number', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']


In [26]:
discretizer = QuantileDiscretizer(numBuckets=10, inputCols=columns_numeric, outputCols=[column + "_discretized" for column in columns_numeric])

In [27]:
df_discretized = discretizer.fit(df).transform(df)

In [28]:
print(len(df_discretized.columns))

42


In [29]:
# Remove the original columns
df_discretized = df_discretized.drop(*columns_numeric)

In [30]:
# Merge the target column with the discretized columns
df_discretized = df_discretized.join(y_train)

In [31]:
df_discretized.show()

+------------+------------+----+-------+--------------+--------------------+------------------+------------------+------------+-------------+------------------+-----------------------+---------------------+------------------------------------+-----------------------------------+--------------------------------+------------------+--------------------+------------------+-----------------------------+----------------------------------+------------------------------------------+---------------------------+--------------------------------+---------------+---------------------------------------+-------------------------------------+-----------+
|       hotel|arrival_date|meal|country|market_segment|distribution_channel|reserved_room_type|assigned_room_type|deposit_type|customer_type|reservation_status|reservation_status_date|lead_time_discretized|arrival_date_week_number_discretized|stays_in_weekend_nights_discretized|stays_in_week_nights_discretized|adults_discretized|children_discretized|b

## Naive Bayes Classifier

In [32]:
label_column = "is_canceled"
features_columns = df_discretized.columns
print(features_columns)

['hotel', 'arrival_date', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status', 'reservation_status_date', 'lead_time_discretized', 'arrival_date_week_number_discretized', 'stays_in_weekend_nights_discretized', 'stays_in_week_nights_discretized', 'adults_discretized', 'children_discretized', 'babies_discretized', 'is_repeated_guest_discretized', 'previous_cancellations_discretized', 'previous_bookings_not_canceled_discretized', 'booking_changes_discretized', 'days_in_waiting_list_discretized', 'adr_discretized', 'required_car_parking_spaces_discretized', 'total_of_special_requests_discretized', 'is_canceled']


#### Prior Probabilities

In [33]:
# [(c0, 1), (c1, 1)]
def mapper(rows):
    result = []
    for row in rows:
        result.append((row[label_column], 1))
    return result

# [(c0, 5), (c1, 10)]
def reducer(row):
    key, values = row
    return [(key, sum(values))]

In [34]:
df_discretized = df_discretized.take(100)
df_discretized = sc.parallelize(df_discretized)

In [35]:
# # Testing the mapper and reducer functions
# # Pick 10 rows from df_discretized
# rows = df_discretized.take(10)
# df_test = mapper(rows)
# print("Map Output: ", df_test)

# # Group by the label column
# df_test_grouped = sc.parallelize(df_test).groupByKey().flatMap(reducer)
# print("Reducer Output: ", df_test_grouped.collect())

In [36]:
# prior = df_discretized.rdd.mapPartitions(mapper).groupByKey().flatMap(reducer)
prior = df_discretized.mapPartitions(mapper).groupByKey().flatMap(reducer)

In [38]:
prior = prior.collect()
total = sum([x[1] for x in prior])
prior_dict = dict([(x[0], x[1]/total) for x in prior])
print("Prior probability: ", prior_dict)

Prior probability:  {0: 0.77, 1: 0.23}


#### Likelihood Probabilities

In [41]:
# dict = { 
#     c0: { 
#             f1: { v1: 0, v2: 0}, 
#             f2: { v1: 0, v2: 0}
#         },
#     c1: { 
#             f1: { v1: 0, v2: 0}, 
#             f2: { v1: 0, v2: 0}
#         }
# }

def mapper(split):
    mapper_result = []
    mapper_dict = {}

    for row in split:
        label = row[label_column]
        if label not in mapper_dict:
            mapper_dict[label] = {}

        for feature in features_columns:
            value = row[feature]
            if feature not in mapper_dict[label]:
                mapper_dict[label][feature] = {}

            if value not in mapper_dict[label][feature]:
                mapper_dict[label][feature][value] = 0

            mapper_dict[label][feature][value] += 1

    for label in mapper_dict:
        for feature in mapper_dict[label]:
            for value in mapper_dict[label][feature]:
                mapper_result.append((label, (feature, value, mapper_dict[label][feature][value])))
    return mapper_result

# [<c0, (f1, v1, 5)>, <c0, (f1, v2, 10)>]
# [<c1, (f1, v1, 2)>, <c1, (f1, v2, 3)>]

def reducer1(row):
    key, tups = row
    reducer1_dict = {}
    reducer1_result = []

    for tup in tups:
        feature, value, count = tup
        if feature not in reducer1_dict:
            reducer1_dict[feature] = {}

        if value not in reducer1_dict[feature]:
            reducer1_dict[feature][value] = 0

        reducer1_dict[feature][value] += count

    for feature in reducer1_dict:
        for value in reducer1_dict[feature]:
            reducer1_result.append((key, (feature, value, reducer1_dict[feature][value])))

    return reducer1_result

# <c0, (f1, v1, 10, 15)>
# <c0, (f1, v2, 5, 15)>
# <c1, (f1, v1, 2, 5)>
# <c1, (f1, v2, 3, 5)>

def reducer2(row):
    label, tups = row
    reducer2_dict = {}
    reducer2_result = []

    for tup in tups:
        feature, value, count = tup
        if feature not in reducer2_dict:
            reducer2_dict[feature] = 0

        reducer2_dict[feature] += count

    for tup in tups:
        feature, value, count = tup
        reducer2_result.append((label, (feature, value, count, reducer2_dict[feature])))

    return reducer2_result

In [None]:
# likelihood = df_discretized.rdd.mapPartitions(mapper).groupByKey().flatMap(reducer1).groupByKey().flatMap(reducer2).collect()
likelihood = df_discretized.mapPartitions(mapper).groupByKey().flatMap(reducer1).groupByKey().flatMap(reducer2).collect()

In [44]:
# Print likelihood using pandas
likelihood_df = pd.DataFrame(likelihood, columns=["Label", "Feature"])
likelihood_df

Unnamed: 0,Label,Feature
0,0,"(hotel, Resort Hotel, 77, 77)"
1,0,"(arrival_date, 2015-07-01, 77, 77)"
2,0,"(meal, BB, 77, 77)"
3,0,"(country, PRT, 77, 77)"
4,0,"(market_segment, Direct, 77, 77)"
5,0,"(distribution_channel, Direct, 77, 77)"
6,0,"(reserved_room_type, C, 77, 77)"
7,0,"(assigned_room_type, C, 77, 77)"
8,0,"(deposit_type, No Deposit, 77, 77)"
9,0,"(customer_type, Transient, 77, 77)"
