In [None]:
"""
Data Quality Assessment
Shows the percentage of dirty data in the raw database.
"""
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, '../..')

from lib.db import init_db
from lib.data_validator import check_data_quality, validate_and_clean


In [None]:
# Load raw data: open the database connection used by the quality checks and
# raw-table queries in the cells below.
# NOTE(review): judging by the cell output, init_db() loads each ds_*.csv file
# into its own table with type casting — confirm against lib.db.
con = init_db()


Loaded ds_booked_rooms.csv into table 'booked_rooms' with type casting
Loaded ds_bookings.csv into table 'bookings' with type casting
Loaded ds_hotel_location.csv into table 'hotel_location' with type casting
Loaded ds_rooms.csv into table 'rooms' with type casting


We're going to run some basic logical checks on the data.

In [3]:
# Run every data-quality rule against the raw tables and print a summary report.
results = check_data_quality(con)

banner = "=" * 60
n_checks = results['total_checks']
n_passed = results['checks_passed']

# Report header
print("\n" + banner)
print("DATA QUALITY REPORT")
print(banner + "\n")

# Headline counts
print(f"Total Checks: {n_checks}")
print(f"Checks Passed: {n_passed}")
print(f"Checks Failed: {n_checks - n_passed}")
print(f"\nTotal Problematic Rows: {results['total_failed']:,}\n")

# Per-rule breakdown; rules with no failing rows are left out of the table.
table_header = f"{'Rule':<30} {'Failed':<12} {'Total':<12} {'%':<8}"
print(table_header)
print("-" * 60)

failing_rules = [rule for rule in results['rules'] if rule['failed'] > 0]
for rule in failing_rules:
    print(f"{rule['name']:<30} {rule['failed']:<12,} {rule['total']:<12,} {rule['pct']:<8.2f}")



DATA QUALITY REPORT

Total Checks: 16
Checks Passed: 4
Checks Failed: 12

Total Problematic Rows: 45,159

Rule                           Failed       Total        %       
------------------------------------------------------------
Negative Price                 3            1,209,677    0.00    
Zero Price                     12,464       1,209,677    1.03    
NULL Price                     968          1,209,677    0.08    
Extreme Price (>5k/night)      7            870,503      0.00    
NULL Dates                     991          1,027,495    0.10    
Negative Lead Time             10,404       1,027,495    1.01    
NULL Occupancy                 968          1,209,677    0.08    
Overcrowded Room               11,282       1,209,677    0.93    
NULL Room ID                   6,104        1,209,677    0.50    
NULL Booking ID                968          1,209,677    0.08    
Orphan Bookings                998          1,027,495    0.10    
Cancelled but Active           2        

 We can use about 95% of the data for our analysis.

 In a real-world scenario I would dig deeper into the remaining 5% to see what could be salvaged, but for the purposes of this analysis I'm going to assume it is all due to data quality issues.

In [4]:
# Pull each raw table into its own pandas DataFrame (same query order as the
# names on the left-hand side).
booked_rooms, rooms, bookings, hotel_location = (
    con.execute(f"SELECT * FROM {table_name}").fetchdf()
    for table_name in ("booked_rooms", "rooms", "bookings", "hotel_location")
)

In [5]:
# For every raw table, print the per-column percentage of NULLs and of empty
# strings. Empty strings are measured separately because isnull() does not
# count them.
frames_to_profile = (
    ("Booked Rooms", booked_rooms),
    ("Rooms", rooms),
    ("Bookings", bookings),
    ("Hotel Location", hotel_location),
)

for label, frame in frames_to_profile:
    print(f"{label} Null Pct: ", frame.isnull().mean() * 100)
    print(
        f"{label} Empty String Pct: ",
        frame.map(lambda cell: 1 if cell == '' else 0).mean() * 100,
    )
    print("-"*50)

Booked Rooms Null Pct:  id                0.080021
booking_id        0.080021
total_adult       0.080021
total_children    0.080021
room_id           0.504598
room_size         0.080021
room_view         0.103747
room_type         0.080021
total_price       0.080021
dtype: float64
Booked Rooms Empty String Pct:  id                 0.00000
booking_id         0.00000
total_adult        0.00000
total_children     0.00000
room_id            0.00000
room_size          0.00000
room_view         36.35177
room_type          0.00000
total_price        0.00000
dtype: float64
--------------------------------------------------
Rooms Null Pct:  id                              0.0
number_of_rooms                 0.0
max_occupancy                   0.0
max_adults                      0.0
pricing_per_person_activated    0.0
events_allowed                  0.0
pets_allowed                    0.0
smoking_allowed                 0.0
children_allowed                0.0
dtype: float64
Rooms Empty String Pc

 Takeaways:

 - Room view is empty when there's no view. We'll replace empty strings with 'No view' and then convert to categorical.

 - When there's no lat/long, it's an empty string. We'll replace them with nulls.

 - Hotel location has empty strings for some values, we'll replace them with nulls.

 I'll make the changes in the data validator class and move on to the EDA.

In [6]:
# Count distinct values per column in every raw table; a column with a single
# distinct value carries no information and is a candidate for dropping.
for label, frame in (
    ("Booked Rooms", booked_rooms),
    ("Rooms", rooms),
    ("Bookings", bookings),
    ("Hotel Location", hotel_location),
):
    print(f"{label} Unique Values: ", frame.nunique())

Booked Rooms Unique Values:  id                1208709
booking_id        1026497
total_adult            40
total_children         20
room_id             10178
room_size             288
room_view              11
room_type               5
total_price         72091
dtype: int64
Rooms Unique Values:  id                              92912
number_of_rooms                    53
max_occupancy                      83
max_adults                         82
pricing_per_person_activated        2
events_allowed                      1
pets_allowed                        1
smoking_allowed                     1
children_allowed                    1
dtype: int64
Bookings Unique Values:  id                1027495
status                  4
total_price         88835
created_at        1027494
cancelled_date        534
source                  4
arrival_date         1033
departure_date       1022
payment_method          5
cancelled_by            2
hotel_id             2284
dtype: int64
Hotel Location Unique V

Takeaways:
- pets_allowed, smoking_allowed, events_allowed and children_allowed each have only one value. We can drop pets_allowed and smoking_allowed entirely.
- However, children_allowed can be partially recovered from booked_rooms.total_children: if a room_id has at least one booking with at least one child, we can impute "TRUE" for rooms.children_allowed.
- We can do the same for events_allowed: events are definitely allowed in reception halls, so we can fix that feature for those rooms.

In [8]:

# Rebuild the database from the raw CSVs, snapshot the raw rooms table, clean
# the data, then compare the imputed flags before and after cleaning.
con_raw = init_db()

# Baseline for the before/after comparison. It is read here, BEFORE cleaning, so
# the comparison does not rely on a `rooms` frame left over from an earlier cell
# (which would hold cleaned data if that cell were re-run after `con` is rebound
# below), and it stays valid whether or not validate_and_clean() modifies
# con_raw in place.
rooms_raw = con_raw.execute("SELECT * FROM rooms").fetchdf()

# NOTE: from here on `con` is whatever validate_and_clean() returns, i.e. the
# cleaned database rather than the raw one loaded at the top of the notebook.
con = validate_and_clean(con_raw)

df_rooms_clean = con.execute("SELECT * FROM rooms").fetchdf()
df_booked_rooms_clean = con.execute("SELECT * FROM booked_rooms").fetchdf()
df_bookings_clean = con.execute("SELECT * FROM bookings").fetchdf()
df_hotel_location_clean = con.execute("SELECT * FROM hotel_location").fetchdf()

# Show how many rooms had each flag changed by the cleaning step.
for label, column in (("children", "children_allowed"), ("events", "events_allowed")):
    print(f"Original df {label} distribution: ", rooms_raw[column].value_counts())
    print(f"New df {label} distribution: ", df_rooms_clean[column].value_counts())
    print("-"*50)


Loaded ds_booked_rooms.csv into table 'booked_rooms' with type casting
Loaded ds_bookings.csv into table 'bookings' with type casting
Loaded ds_hotel_location.csv into table 'hotel_location' with type casting
Loaded ds_rooms.csv into table 'rooms' with type casting
Original df children distribution:  children_allowed
False    92912
Name: count, dtype: int64
New df children distribution:  children_allowed
False    85972
True      6940
Name: count, dtype: int64
--------------------------------------------------
Original df events distribution:  events_allowed
False    92912
Name: count, dtype: int64
New df events distribution:  events_allowed
False    92883
True        29
Name: count, dtype: int64
--------------------------------------------------


Let's check to see if there are any hotels that are missing location data.

There are 6,800 of them. Do they correspond to any bookings?

 Not that many. We'll add a parameter to the data_validator to exclude bookings from hotels with missing location data.