# Expedia: EDA

## Imports

In [1]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not just the last
# one; later cells rely on this to show several results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
## Reference: https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
## Docs: https://googleapis.dev/python/bigquery/latest/index.html
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport

## Data

In [3]:
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/train.csv")

In [4]:
# Build the profiling report with the explorative configuration switched on,
# then export it as a standalone HTML file next to the notebook.
profile = ProfileReport(df, title="Expedia Train Set Report", explorative=True)
profile.to_file("expedia_eda.html")

In [5]:
# Number of missing values in each column.
df.isna().sum()

srch_id                                   0
prop_key                                  0
srch_date_time                            0
srch_visitor_id                           0
srch_visitor_visit_nbr                    0
srch_visitor_loc_country                  0
srch_visitor_loc_region                 123
srch_visitor_loc_city                     0
srch_visitor_wr_member               444878
srch_posa_continent                  485248
srch_posa_country                         0
srch_hcom_destination_id                  0
srch_dest_longitude                       0
srch_dest_latitude                        0
srch_ci                                   0
srch_co                                   0
srch_ci_day                               0
srch_co_day                               0
srch_los                                  0
srch_bw                                   0
srch_adults_cnt                          18
srch_children_cnt                        18
srch_rm_cnt                     

In [6]:
# POS continent is missing only for searches whose POS country is US or CANADA
# (second output below).
# Plan: fill those rows with "NORTHAMERICA", which matches the spelling already
# used in prop_continent (first output below).
# NOTE(review): the fill itself is not performed in the cells shown here —
# confirm where it is applied.
df.prop_continent.unique()
df[df.srch_posa_continent.isna()].srch_posa_country.unique()

array(['ASIA', 'EUROPE', 'NORTHAMERICA', 'LATAM'], dtype=object)

array(['US', 'CANADA'], dtype=object)

In [7]:
# srch_visitor_wr_member indicates whether the user is in the hcom loyalty
# program, but each value packs three pipe-separated fields into one string:
#   1. sign-in state  ("Signed In", "Signed in - Persistent", "Not Signed In")
#   2. WR membership, or visitor type when not signed in
#      ("WR Member", "Not WR Member", "New Visitor", "Returning Visitor")
#   3. FC membership  ("FC Member", "Remembered FC Member", "Not FC Member")
# Plan: split it into three separate columns. The column also contains NaN.
# NOTE(review): it is not known what "WR" and "FC" stand for — confirm.
df.srch_visitor_wr_member.unique()

array(['Signed in - Persistent|WR Member|Remembered FC Member', nan,
       'Not Signed In|Returning Visitor|Not FC Member',
       'Signed in - Persistent|WR Member|Not FC Member',
       'Not Signed In|New Visitor|Not FC Member',
       'Signed In|WR Member|Not FC Member',
       'Signed In|WR Member|FC Member',
       'Signed In|Not WR Member|Not FC Member',
       'Signed in - Persistent|Not WR Member|Not FC Member',
       'Signed in - Persistent|Not WR Member|Remembered FC Member'],
      dtype=object)

In [10]:
# A large part of the loyalty feature is missing: 444,878 of 841,115 rows
# (~53%) have no srch_visitor_wr_member value. dropna=False keeps those rows as
# their own NaN group so their visit-number distribution is summarised as well.
df.groupby("srch_visitor_wr_member", dropna=False).srch_visitor_visit_nbr.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
srch_visitor_wr_member,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Not Signed In|New Visitor|Not FC Member,90491.0,1.354754,3.437013,1.0,1.0,1.0,1.0,136.0
Not Signed In|Returning Visitor|Not FC Member,186401.0,13.297649,34.505205,1.0,3.0,5.0,12.0,1082.0
Signed In|Not WR Member|Not FC Member,540.0,7.811111,10.452537,1.0,1.0,1.0,8.0,31.0
Signed In|WR Member|FC Member,1077.0,18.650882,34.968131,1.0,2.0,8.0,18.0,169.0
Signed In|WR Member|Not FC Member,64361.0,16.31241,35.397737,1.0,1.0,5.0,16.0,725.0
Signed in - Persistent|Not WR Member|Not FC Member,379.0,20.738786,18.713458,1.0,4.0,21.0,41.0,64.0
Signed in - Persistent|Not WR Member|Remembered FC Member,148.0,3.324324,0.949158,2.0,2.0,4.0,4.0,4.0
Signed in - Persistent|WR Member|Not FC Member,49429.0,24.910356,40.164002,1.0,5.0,12.0,29.0,568.0
Signed in - Persistent|WR Member|Remembered FC Member,3411.0,10.969804,15.229947,1.0,2.0,5.0,13.0,145.0
,444878.0,12.17193,31.601318,1.0,1.0,3.0,10.0,832.0


In [11]:
# 51 rows have no price data (prop_price_with_discount_usd is NaN).
# The missing prices could be imputed, but these rows will be removed instead:
# they are <0.1% of all observations and none of them is a booking
# (all 51 have prop_booking_bool == 0), so no booked hotels are lost.
df[df.prop_price_with_discount_usd.isna()].prop_booking_bool.value_counts()

0    51
Name: prop_booking_bool, dtype: int64

In [12]:
# Missing prop_review_count (8 rows) and missing srch_adults_cnt (18 rows):
# each group contains 2 booked rows.
# Plan: impute with the median over the entire dataset (for now).
df[df.prop_review_count.isna()].prop_booking_bool.value_counts()
df[df.srch_adults_cnt.isna()].prop_booking_bool.value_counts()

0    6
1    2
Name: prop_booking_bool, dtype: int64

0    16
1     2
Name: prop_booking_bool, dtype: int64

In [13]:
# The rows with no visitor region logged all come from Italy (80) and
# Spain & Canary Islands (43). The region can be imputed with the most
# frequent region.
# NOTE(review): "most frequent" presumably means per country — confirm. A later
# cell in this notebook proposes dropping instead of imputing; reconcile the two.
df[df.srch_visitor_loc_region.isna()].srch_visitor_loc_country.value_counts()

ITALY                     80
SPAIN & CANARY ISLANDS    43
Name: srch_visitor_loc_country, dtype: int64

In [20]:
# Missing room capacity appears to be encoded with the sentinel value -9998
# (10,401 rows). A further 1,770 rows have a capacity of 0.
# NOTE(review): confirm whether 0 should also be treated as missing.
df.prop_room_capacity[lambda x: x <= 0].value_counts()

-9998    10401
 0        1770
Name: prop_room_capacity, dtype: int64

In [22]:
# Some rows (123) don't have any information in srch_visitor_loc_region.
# A country -> region lookup dictionary could be set up to fill these gaps,
# but in the interest of time they will be dropped instead.
# NOTE(review): "dropped" presumably refers to the affected rows, not the
# whole column — confirm.
df.srch_visitor_loc_region.isna().sum()

123

In [15]:
# Turn the date/time columns into pandas datetime values in one column-wise pass.
date_cols = ["srch_date_time", "srch_ci", "srch_co", "srch_local_date"]
df[date_cols] = df[date_cols].apply(pd.to_datetime)

In [29]:
# There seem to be some hotels that are badly rated.
# Plan: add a separate feature flagging hotels with a review score below 2.
# NOTE(review): 5,809 of these rows have a score of exactly 0, which may mean
# "no reviews" rather than "badly rated" — confirm before building the feature.
# NOTE(review): some scores appear several times in the counts (e.g. 1.9),
# presumably floats that differ beyond the printed precision.
df.prop_review_score[lambda x: x < 2].value_counts()

0.000000    5809
1.000000     104
1.500000     100
1.900000      62
1.700000      45
1.400000      36
1.800000      33
1.600000      18
1.900000       3
1.833300       2
1.400000       2
1.600000       2
1.300000       1
1.900000       1
1.700000       1
1.800000       1
0.772727       1
1.800000       1
1.423389       1
1.600000       1
1.400000       1
1.900000       1
1.769658       1
1.800000       1
1.900000       1
1.700000       1
1.900000       1
Name: prop_review_score, dtype: int64

In [17]:
# Column dtypes and non-null counts.
# NOTE(review): the saved output lists several columns as `category`, but no
# cell shown here performs that cast — check that the notebook still
# reproduces this output under Restart & Run All.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 841115 entries, 0 to 841114
Data columns (total 47 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   srch_id                            841115 non-null  int64         
 1   prop_key                           841115 non-null  int64         
 2   srch_date_time                     841115 non-null  datetime64[ns]
 3   srch_visitor_id                    841115 non-null  category      
 4   srch_visitor_visit_nbr             841115 non-null  int64         
 5   srch_visitor_loc_country           841115 non-null  category      
 6   srch_visitor_loc_region            840992 non-null  category      
 7   srch_visitor_loc_city              841115 non-null  category      
 8   srch_visitor_wr_member             396237 non-null  category      
 9   srch_posa_continent                355867 non-null  category      
 10  srch_posa_country   

In [None]:
# Class balance of the target: share of rows that ended in a booking (1)
# versus rows that did not (0).
# Series.value_counts replaces the deprecated top-level pd.value_counts helper;
# the counts are sorted in descending order, as before.
count_classes = df["prop_booking_bool"].value_counts(sort=True)

# Normalise the counts to proportions so the bar heights sum to 1.
ax = sns.barplot(x=count_classes.index, y=count_classes / count_classes.sum())
ax.set(title="Booking or Not booking", xlabel="Class", ylabel="Frequency");

In [None]:
# Pairwise correlation between the numeric features.
# The frame also holds datetime and categorical columns, which DataFrame.corr
# no longer drops silently in recent pandas releases (it raises instead), so
# restrict the input to numeric/boolean columns explicitly.
correlation = df.select_dtypes(include=["number", "bool"]).corr()

# Large square canvas so the per-cell annotations stay legible.
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap="viridis", ax=ax)
ax.set_title("Correlation between different features");

In [None]:
# Load the watermark extension and print an environment stamp for this run.
# NOTE(review): see the watermark documentation for the exact meaning of each
# flag passed below.
%load_ext watermark
%watermark -n -u -v -iv -w