# Expedia: Preprocessing

## Imports

In [7]:
from IPython.core.interactiveshell import InteractiveShell

# Show the result of every top-level expression in a cell, not only the last
# one. Several cells below rely on this to display more than one output.
InteractiveShell.ast_node_interactivity = "all"

In [8]:
## Reference: https://cloud.google.com/bigquery/docs/bigquery-storage-python-pandas
## Docs: https://googleapis.dev/python/bigquery/latest/index.html
import numpy as np
import pandas as pd

## Data

In [17]:
# Load the training data from the CSV file (path is relative to the notebook).
train_csv_path = "../data/train.csv"
df = pd.read_csv(train_csv_path)

In [21]:
# The point-of-sale continent appears to be missing for US and Canada searches.
# Fill those gaps with NORTHAMERICA, which is the spelling already used by the
# property continent column (see the first output of this cell).
df.prop_continent.unique()
df[df.srch_posa_continent.isna()].srch_posa_country.unique()

# Only touch rows whose continent is actually missing, so that any value that
# is already present is never overwritten and re-running the cell is a no-op.
missing_north_america = df.srch_posa_continent.isna() & df.srch_posa_country.isin(
    ["US", "CANADA"]
)
df.loc[missing_north_america, "srch_posa_continent"] = "NORTHAMERICA"

array(['ASIA', 'EUROPE', 'NORTHAMERICA', 'LATAM'], dtype=object)

array([], dtype=object)

In [22]:
# srch_visitor_wr_member describes the visitor's hcom loyalty programme status.
# It packs three pipe-separated pieces of information into a single string
# (sign-in status, WR membership, FC membership), so break them out into their
# own columns. What the WR and FC abbreviations stand for is not known here.
split_col_names = ["signin_status", "wr_membership", "fc_membership"]
df.srch_visitor_wr_member.unique()
membership_parts = df["srch_visitor_wr_member"].str.split("|", expand=True)
df[split_col_names] = membership_parts

array(['Signed in - Persistent|WR Member|Remembered FC Member', nan,
       'Not Signed In|Returning Visitor|Not FC Member',
       'Signed in - Persistent|WR Member|Not FC Member',
       'Not Signed In|New Visitor|Not FC Member',
       'Signed In|WR Member|Not FC Member',
       'Signed In|WR Member|FC Member',
       'Signed In|Not WR Member|Not FC Member',
       'Signed in - Persistent|Not WR Member|Not FC Member',
       'Signed in - Persistent|Not WR Member|Remembered FC Member'],
      dtype=object)

In [56]:
# Diagnostics first (both are displayed before anything is filled in):
#  * visit-number statistics per sign-in / WR / FC combination, keeping the
#    group of rows where the membership fields are missing;
#  * how often each FC membership value occurs.
df.groupby(split_col_names, dropna=False)["srch_visitor_visit_nbr"].describe()
df["fc_membership"].value_counts()

# Placeholder used for each column when the value is missing:
#  * wr_membership: there is no obvious choice. "New Visitor" rows can have
#    more than one visit, and the visit-number quantiles of the missing rows
#    resemble both "New Visitor" and "Not WR Member", so an explicit "Unknown"
#    category is used instead of guessing.
#  * fc_membership: "Not FC Member" is by far the most common value.
#  * signin_status: assume the user is not signed in, as that is more likely.
fill_values = {
    "wr_membership": "Unknown",
    "fc_membership": "Not FC Member",
    "signin_status": "Not Signed In",
}
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
signin_status,wr_membership,fc_membership,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Not Signed In,New Visitor,Not FC Member,90474.0,1.354754,3.437282,1.0,1.0,1.0,1.0,136.0
Not Signed In,Returning Visitor,Not FC Member,186379.0,13.298719,34.507061,1.0,3.0,5.0,12.0,1082.0
Not Signed In,Unknown,Not FC Member,444873.0,12.172031,31.601478,1.0,1.0,3.0,10.0,832.0
Signed In,Not WR Member,Not FC Member,540.0,7.811111,10.452537,1.0,1.0,1.0,8.0,31.0
Signed In,WR Member,FC Member,1077.0,18.650882,34.968131,1.0,2.0,8.0,18.0,169.0
Signed In,WR Member,Not FC Member,64355.0,16.313278,35.399157,1.0,1.0,5.0,16.0,725.0
Signed in - Persistent,Not WR Member,Not FC Member,379.0,20.738786,18.713458,1.0,4.0,21.0,41.0,64.0
Signed in - Persistent,Not WR Member,Remembered FC Member,148.0,3.324324,0.949158,2.0,2.0,4.0,4.0,4.0
Signed in - Persistent,WR Member,Not FC Member,49421.0,24.909714,40.162905,1.0,5.0,12.0,29.0,568.0
Signed in - Persistent,WR Member,Remembered FC Member,3410.0,10.965982,15.230545,1.0,2.0,5.0,13.0,145.0


Not FC Member           836421
Remembered FC Member      3558
FC Member                 1077
Name: fc_membership, dtype: int64

In [30]:
# A small number of rows have no price (the original note counted 51
# properties). Because so little data is affected, those rows are removed
# from the frame; nothing is imputed here.
# The booking outcome of the affected rows is displayed first.
missing_price = df["prop_price_with_discount_usd"].isna()
df.loc[missing_price, "prop_booking_bool"].value_counts()
df = df.loc[~missing_price]

Series([], Name: prop_booking_bool, dtype: int64)

Series([], Name: prop_booking_bool, dtype: int64)

0    16
1     2
Name: prop_booking_bool, dtype: int64

In [None]:
# Show how bookings are distributed among rows with a missing review count and
# among rows with a missing adult count.
# NOTE: this cell only inspects the data and does not fill anything in. The
# stated plan is to impute with the median across the entire dataset (for now).
rows_without_review_count = df["prop_review_count"].isna()
rows_without_adults_count = df["srch_adults_cnt"].isna()
df.loc[rows_without_review_count, "prop_booking_bool"].value_counts()
df.loc[rows_without_adults_count, "prop_booking_bool"].value_counts()

In [34]:
# According to the EDA report these columns have a lot of missing values, so
# they are removed from the frame.
sparse_columns = ["srch_mobile_app", "srch_visitor_wr_member", "srch_currency"]
df = df.drop(columns=sparse_columns)

In [46]:
# Some visitors from Italy and Spain have no region logged. Show which
# countries the rows with a missing region belong to.
# NOTE: this cell only inspects the data; the idea is to impute the missing
# region with the most frequent region, but that is not done here.
region_missing = df["srch_visitor_loc_region"].isna()
df.loc[region_missing, "srch_visitor_loc_country"].value_counts()

ITALY                     80
SPAIN & CANARY ISLANDS    43
Name: srch_visitor_loc_country, dtype: int64

In [11]:
# Parse the date/time columns, which are read from the CSV as plain strings,
# into datetime values. This operates on `df`, the frame loaded at the top of
# the notebook (the previous `train_df` name is not defined anywhere here and
# raised a NameError on a fresh kernel).
date_cols = ["srch_date_time", "srch_ci", "srch_co", "srch_local_date"]
for col in date_cols:
    df[col] = pd.to_datetime(df[col])

# Convert the remaining string (object-dtype) columns to categories. This runs
# after the date conversion so the date columns are no longer picked up here.
object_cols = df.select_dtypes(include="object").columns
df[object_cols] = df[object_cols].astype("category")

In [10]:
# Record when the notebook was last run and which Python, IPython and imported
# package versions were used, so the results can be reproduced.
%load_ext watermark
%watermark -n -u -v -iv -w

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Last updated: Tue Jul 06 2021

Python implementation: CPython
Python version       : 3.8.8
IPython version      : 7.25.0

numpy  : 1.19.5
pandas : 1.2.4
sklearn: 0.23.2

Watermark: 2.2.0

