In [None]:
import importlib

from pyspark.sql import SparkSession, functions as fn

import merging

# Re-execute the local merging module so the latest version of it is used.
importlib.reload(merging)

# ANSI mode must stay disabled because the data contains an invalid date.
spark = SparkSession.builder.config(
    "spark.sql.ansi.enabled", "false"
).getOrCreate()

## Loading Data

In [109]:
# Both files are read with a header row and with column types inferred by Spark.
customer_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/customer-reservations.csv")
)
hotel_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/hotel-booking.csv")
)

In [110]:
# Preview the first three customer reservation rows.
customer_df.show(n=3)

+----------+-----------------------+--------------------+---------+------------+-------------+------------+-------------------+------------------+--------------+
|Booking_ID|stays_in_weekend_nights|stays_in_week_nights|lead_time|arrival_year|arrival_month|arrival_date|market_segment_type|avg_price_per_room|booking_status|
+----------+-----------------------+--------------------+---------+------------+-------------+------------+-------------------+------------------+--------------+
|  INN00001|                      1|                   2|      224|        2017|           10|           2|            Offline|              65.0|  Not_Canceled|
|  INN00002|                      2|                   3|        5|        2018|           11|           6|             Online|            106.68|  Not_Canceled|
|  INN00003|                      2|                   1|        1|        2018|            2|          28|             Online|              60.0|      Canceled|
+----------+----------------

In [111]:
# Preview the first three hotel booking rows.
hotel_df.show(n=3)

+------------+--------------+---------+------------+-------------+------------------------+-------------------------+-----------------------+--------------------+-------------------+-------+------------------+--------------------+
|       hotel|booking_status|lead_time|arrival_year|arrival_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|market_segment_type|country|avg_price_per_room|               email|
+------------+--------------+---------+------------+-------------+------------------------+-------------------------+-----------------------+--------------------+-------------------+-------+------------------+--------------------+
|Resort Hotel|             0|      342|        2015|         July|                      27|                        1|                      0|                   0|             Direct|    PRT|               0.0|Ernest.Barnes31@o...|
|Resort Hotel|             0|      737|        2015|         July|          

## Finding Commonalities

In [112]:
# Print the columns shared by both dataframes as (name, type) pairs.
common_columns, common_types = merging.get_common_columns(customer_df, hotel_df)
print([(name, dtype) for name, dtype in zip(common_columns, common_types)])

[('arrival_year', 'int'), ('avg_price_per_room', 'double'), ('stays_in_weekend_nights', 'int'), ('lead_time', 'int'), ('stays_in_week_nights', 'int'), ('market_segment_type', 'string')]


We see that

```
['arrival_year', 'avg_price_per_room', 'stays_in_weekend_nights', 'lead_time', 'stays_in_week_nights', 'market_segment_type']
```

are shared between both datasets with the same data types and therefore could easily be present in a combined dataset. 




## Finding Differences

In [116]:
# Print the (name, type) pairs that customer_df does not share with hotel_df.
customer_unique_columns, cuc_types = merging.get_left_unique_columns(
    customer_df, hotel_df
)
print([(name, dtype) for name, dtype in zip(customer_unique_columns, cuc_types)])

[('arrival_date', 'int'), ('arrival_month', 'int'), ('booking_status', 'string'), ('Booking_ID', 'string')]


In [117]:
# Print the (name, type) pairs that hotel_df does not share with customer_df.
hotel_unique_columns, huc_types = merging.get_left_unique_columns(
    hotel_df, customer_df
)
print([(name, dtype) for name, dtype in zip(hotel_unique_columns, huc_types)])

[('email', 'string'), ('arrival_date_day_of_month', 'int'), ('arrival_month', 'string'), ('booking_status', 'int'), ('hotel', 'string'), ('country', 'string'), ('arrival_date_week_number', 'int')]


## Merging into a Single Dataset


### Filling in Missing Data

The `country`, `hotel`, and `email` columns are present in `hotel_df` but not in `customer_df`.
These columns may be interesting for our analysis, so I will keep them in the merged data.
For the rows that come from `customer_df`, I will insert Null values for these three columns
in the merged data.



### Columns that can be dropped
Of those unique to `customer_df`,
- `Booking_ID` can be dropped since it has no meaning for our purposes.

Of those unique to `hotel_df`,
- `arrival_date_week_number` can be dropped because it is redundant with the arrival month, date, and year.

In [118]:
# The week number is redundant with the arrival month, day, and year.
hotel_df2 = hotel_df.drop("arrival_date_week_number")

# The booking identifier carries no meaning for this analysis.
customer_df2 = customer_df.drop("Booking_ID")

### Columns that need transformation before merger

#### Booking status
`booking_status`, as revealed by the exploratory data analysis (EDA), appears to be equivalent in both datasets, except that the values are stored under different conventions, as shown in the following table:

| customer_df  | hotel_df |
| -----------  | -------- |
| Not_Canceled |     0    |
|  Canceled    |     1    |

I inferred this from the fact that about one third of the reservations in `customer_df` were canceled.
Likewise, about one third of the reservations in `hotel_df` have `1` and the other two thirds `0`,
leading me to assume the above mapping.

In the merged data, I will convert these into a column called `canceled`, where `0` is "not canceled" and `1` is "canceled".

In [119]:
# hotel_df already stores the status as an integer, so only the name changes.
hotel_df2 = hotel_df2.withColumnRenamed("booking_status", "canceled")

# customer_df stores the status as text: Not_Canceled -> 0, Canceled -> 1.
# There is no otherwise() branch, so any other value yields NULL.
customer_df2 = (
    customer_df2
    .withColumn(
        "canceled",
        fn.when(fn.col("booking_status") == "Not_Canceled", 0)
          .when(fn.col("booking_status") == "Canceled", 1),
    )
    .drop("booking_status")
)

#### Arrival Time
The arrival time is present in both, but with some name and storage type differences.

- `arrival_date` in `customer_df` is `arrival_date_day_of_month` in `hotel_df`. 
- `arrival_month` is stored as an integer 1-12 in `customer_df` but as the full month's name in `hotel_df`.

In the merged data, arrival time will be encoded as a single column called `arrival_time` that will store the date of arrival as a `DateType` in PySpark. This will replace the current month, day, and year columns present in both dataframes.

In [120]:
# Collapse the separate year / month / day columns into a single date column,
# then remove the three source columns.
customer_df2 = (
    customer_df2
    .withColumn(
        "arrival_time",
        fn.make_date("arrival_year", "arrival_month", "arrival_date"),
    )
    .drop("arrival_year", "arrival_month", "arrival_date")
)


def map_mongth_name_to_number(column_name: str):
    """Build a column expression that converts full month names to numbers.

    Args:
        column_name: Name of the column holding full English month names
            ("January" through "December").

    Returns:
        A chained ``when`` expression that evaluates to 1-12 for a matching
        month name. No ``otherwise`` branch is attached, so values that match
        none of the twelve names are left without a result.
    """
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    month_col = fn.col(column_name)

    # The first condition has to open the chain with fn.when; every later
    # month is appended to that same expression with .when.
    numbered_months = enumerate(month_names, start=1)
    first_number, first_name = next(numbered_months)
    mapping = fn.when(month_col == first_name, first_number)
    for number, name in numbered_months:
        mapping = mapping.when(month_col == name, number)
    return mapping

# Convert the month name to its number first, then build the date column
# from year / month / day and remove the three source columns.
hotel_df2 = (
    hotel_df2
    .withColumn("arrival_month", map_mongth_name_to_number("arrival_month"))
    .withColumn(
        "arrival_time",
        fn.make_date("arrival_year", "arrival_month", "arrival_date_day_of_month"),
    )
    .drop("arrival_year", "arrival_month", "arrival_date_day_of_month")
)

### Filling in Missing Data

The `country`, `hotel`, and `email` columns are present in `hotel_df` but not in `customer_df`.
These columns may be interesting for our analysis, so I will keep them in the merged data.
For the rows that come from `customer_df`, I will insert Null values for these three columns
in the merged data.

In [121]:
# Null values will automatically be inserted for the rows from customer_df when
# the two dataframes are merged

### Merging the data

In [123]:
# Stack the two dataframes by column name. Columns that exist on only one
# side are allowed and are filled with NULL for the rows from the other side.
merged_data = hotel_df2.unionByName(
    customer_df2,
    allowMissingColumns=True,
)

# Summary statistics first, then the resulting schema.
merged_data.describe().show()
merged_data.printSchema()

+-------+------------+-------------------+-----------------+-----------------------+--------------------+-------------------+-------+------------------+--------------------+
|summary|       hotel|           canceled|        lead_time|stays_in_weekend_nights|stays_in_week_nights|market_segment_type|country|avg_price_per_room|               email|
+-------+------------+-------------------+-----------------+-----------------------+--------------------+-------------------+-------+------------------+--------------------+
|  count|       78703|             114978|           114978|                 114978|              114978|             114978|  78298|            114978|               78703|
|   mean|        NULL| 0.3510584633582077|96.22974829967472|     0.8745499138965716|   2.371088382125276|               NULL|   NULL| 97.80159865365225|                NULL|
| stddev|        NULL|0.47730325797274625|100.5264041802159|     0.9546293077588424|  1.7431996147326994|               NULL|   NU