In [44]:
import numpy as np
import pandas as pd
import os
from pyspark.sql import SparkSession

In [45]:
spark_home = "D:/Programs/Spark/spark_unzipped"
os.environ["SPARK_HOME"] = spark_home

# Add Spark bin and executors to PATH
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "bin")
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "sbin")

# Add Spark Python libraries to PYTHONPATH
os.environ["PYTHONPATH"] = os.path.join(spark_home, "python") + os.pathsep + os.environ.get("PYTHONPATH", "")
os.environ["PYTHONPATH"] += os.pathsep + os.path.join(spark_home, "python", "lib")

# Add PySpark to the system path
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "python", "lib", "pyspark.zip")
os.environ["PATH"] += os.pathsep + os.path.join(spark_home, "python", "lib", "py4j-0.10.9-src.zip")
os.environ['PYSPARK_PYTHON'] = 'python'

In [46]:
spark = SparkSession.builder.appName("HotelBooking").getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [47]:
# Read the data from the CSV file
df = spark.read.option("header", True).option("inferSchema", True).csv('hotel_bookings_transformed.csv')

In [48]:
y_train = df.select("is_canceled")
y_train

DataFrame[is_canceled: double]

In [49]:
x_data = df.drop('is_canceled')
x_data

DataFrame[lead_time: double, arrival_date_week_number: double, stays_in_weekend_nights: double, stays_in_week_nights: double, adults: double, children: double, babies: double, is_repeated_guest: double, previous_cancellations: double, previous_bookings_not_canceled: double, booking_changes: double, days_in_waiting_list: double, adr: double, required_car_parking_spaces: double, total_of_special_requests: double, hotel: double, meal: double, country: double, market_segment: double, distribution_channel: double, reserved_room_type: double, assigned_room_type: double, deposit_type: double, customer_type: double, reservation_status: double]

## Naive Bayes Classifier

In [50]:
label_column = "is_canceled"
features_columns = df.columns
print(features_columns)
print(len(features_columns))

['is_canceled', 'lead_time', 'arrival_date_week_number', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'hotel', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status']
26


#### Prior Probabilities

In [51]:
# Output: [(c0, 1), (c1, 1), (c0, 1), (c0, 1), (c1, 1)]
def mapper_prior(rows):
    result = []
    for row in rows:
        result.append((row[label_column], 1))
    return result

# Output: [(c0, 3), (c1, 2)]
def reducer_prior(row):
    key, values = row
    return [(key, sum(values))]

In [52]:
# Select the first 100 rows
# df_selected = df.take(100)
# df_selected = sc.parallelize(df_selected)

In [53]:
prior = df.rdd.mapPartitions(mapper_prior).groupByKey().flatMap(reducer_prior)
# prior = df_selected.mapPartitions(mapper_prior).groupByKey().flatMap(reducer_prior)

In [54]:
# Calculate the total count per class
total_count = prior.map(lambda x: x[1]).sum()

In [55]:
# Calculate prior probabilities
prior_probabilities = prior.map(lambda x: (x[0], x[1]/total_count)).collectAsMap()

print("Prior probability:", prior_probabilities)

Prior probability: {0.0: 0.628221540949365, 1.0: 0.37177845905063506}


#### Likelihood Probabilities

In [56]:
# dict = { 
#     c0: { 
#             f1: { v1: 0, v2: 0}, 
#             f2: { v1: 0, v2: 0}
#         },
#     c1: { 
#             f1: { v1: 0, v2: 0}, 
#             f2: { v1: 0, v2: 0}
#         }
# }

def mapper_likelihood(split):
    mapper_result = []
    mapper_dict = {}

    for row in split:
        label = row[label_column]
        if label not in mapper_dict:
            mapper_dict[label] = {}

        for feature in features_columns:
            value = row[feature]
            if feature not in mapper_dict[label]:
                mapper_dict[label][feature] = {}

            if value not in mapper_dict[label][feature]:
                mapper_dict[label][feature][value] = 0

            mapper_dict[label][feature][value] += 1

    for label in mapper_dict:
        for feature in mapper_dict[label]:
            for value in mapper_dict[label][feature]:
                mapper_result.append((label, (feature, value, mapper_dict[label][feature][value])))
    return mapper_result

# Output:
# [<c0, (f1, v1, 5)>, <c0, (f1, v2, 10)>]
# [<c1, (f1, v1, 2)>, <c1, (f1, v2, 3)>]
def reducer1(row):
    key, tups = row
    reducer1_dict = {}
    reducer1_result = []

    for tup in tups:
        feature, value, count = tup
        if feature not in reducer1_dict:
            reducer1_dict[feature] = {}

        if value not in reducer1_dict[feature]:
            reducer1_dict[feature][value] = 0

        reducer1_dict[feature][value] += count

    for feature in reducer1_dict:
        for value in reducer1_dict[feature]:
            reducer1_result.append((key, (feature, value, reducer1_dict[feature][value])))

    return reducer1_result

# Output:
# [<c0, (f1, v1, 10, 15)>, <c0, (f1, v2, 5, 15)>]
# [<c1, (f1, v1, 2, 5)>, <c1, (f1, v2, 3, 5)>]
def reducer2(row):
    label, tups = row
    reducer2_dict = {}
    reducer2_result = []

    for tup in tups:
        feature, value, count = tup
        if feature not in reducer2_dict:
            reducer2_dict[feature] = 0

        reducer2_dict[feature] += count

    for tup in tups:
        feature, value, count = tup
        reducer2_result.append((label, (feature, value, count, reducer2_dict[feature])))

    return reducer2_result

# Output: 
# [<c0, (f1, v1, 10/15)>, <c0, (f1, v2, 5/15)>]
# [<c1, (f1, v1, 2/5)>, <c1, (f1, v2, 3/5)>]
def reducer3(row):
    label, tups = row
    reducer3_result = []

    for tup in tups:
        feature, value, count, total = tup
        reducer3_result.append((label, (feature, value, count/total)))

    return reducer3_result

In [57]:
# likelihood = df_selected.mapPartitions(mapper_likelihood).groupByKey().flatMap(reducer1).groupByKey().flatMap(reducer2).groupByKey().flatMap(reducer3).collect()
likelihood = df.rdd.mapPartitions(mapper_likelihood).groupByKey().flatMap(reducer1).groupByKey().flatMap(reducer2).groupByKey().flatMap(reducer3).collect()
print(likelihood)

[(0.0, ('is_canceled', 0.0, 1.0)), (0.0, ('lead_time', 9.0, 0.053251106046386915)), (0.0, ('lead_time', 1.0, 0.1350717254323636)), (0.0, ('lead_time', 2.0, 0.11474728515886848)), (0.0, ('lead_time', 0.0, 0.14240514814318272)), (0.0, ('lead_time', 3.0, 0.10131384904142647)), (0.0, ('lead_time', 4.0, 0.10076417750368682)), (0.0, ('lead_time', 5.0, 0.09364526075881485)), (0.0, ('lead_time', 6.0, 0.08908700898243732)), (0.0, ('lead_time', 8.0, 0.08397908566832014)), (0.0, ('lead_time', 7.0, 0.08573535326451268)), (0.0, ('arrival_date_week_number', 4.0, 0.10229253251106046)), (0.0, ('arrival_date_week_number', 5.0, 0.09525405550341869)), (0.0, ('arrival_date_week_number', 6.0, 0.10176967421906422)), (0.0, ('arrival_date_week_number', 7.0, 0.09764043437458104)), (0.0, ('arrival_date_week_number', 8.0, 0.08337578763909372)), (0.0, ('arrival_date_week_number', 9.0, 0.1233811502882424)), (0.0, ('arrival_date_week_number', 0.0, 0.0923180050945167)), (0.0, ('arrival_date_week_number', 1.0, 0.1149