# 3. Preprocessing

1. Importing packages and dataset
2. Categorical to cardinal transformation
3. Feature scaling

## 3.1. Importing packages and dataset

In [1]:
# Import the required packages

import os
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import pre-processed dataset and load it into a dataframe

# Create path variable
original_file = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/interim/category_transformation.csv'

# Load csv in dataframe
dfo = pd.read_csv(original_file, index_col=0)

In [3]:
# Check # of columns and rows imported
dfo.shape

(78547, 37)

In [4]:
# Print info
dfo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78547 entries, 0 to 79329
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   LeadTime                     78547 non-null  float64
 1   ArrivalDateYear              78547 non-null  int64  
 2   ArrivalDateMonth             78547 non-null  object 
 3   ArrivalDateWeekNumber        78547 non-null  int64  
 4   ArrivalDateDayOfMonth        78547 non-null  int64  
 5   StaysInWeekendNights         78547 non-null  object 
 6   StaysInWeekNights            78547 non-null  object 
 7   Adults                       78547 non-null  float64
 8   Children                     78547 non-null  float64
 9   Babies                       78547 non-null  float64
 10  Meal                         78547 non-null  object 
 11  Country                      78547 non-null  object 
 12  MarketSegment                78547 non-null  object 
 13  DistributionChan

In [5]:
# Converts all non float variables in categorical
cat_cols = dfo.select_dtypes(exclude='float64').columns
dfo[cat_cols] = dfo[cat_cols].astype('category')

# Convert variables to categorical
dfo['Adults'] = dfo['Adults'].astype('category')
dfo['Children'] = dfo['Children'].astype('category')
dfo['Babies'] = dfo['Babies'].astype('category')

## 3.2. Sample selection

In [6]:
# Creates variable exclusion list for modeling
to_drop = ['ReservationDate', 'ReservationStatusDate', 'CheckOutDate', 
           'ArrivalDate', 'ArrivalDateWeekNumber', 'ArrivalDateDayOfMonth', 
           'ArrivalDateYear', 'AssignedRoomType', 'DaysInWaitingList', 'TotalStay']

# Removes features
dfo.drop(to_drop, axis=1, inplace=True)

In [7]:
X = dfo.drop(['ArrivalDateMonth'], axis=1)
y = dfo['ArrivalDateMonth']

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=10)
split = sss.split(X, y)

for train_index, test_index in split:
    sample_df = dfo.iloc[test_index]

In [8]:
sample_df.shape

(15710, 27)

## 3.2. Categorical to cardinal transformation

In [9]:
# Get dummy variables for each categorical variable
encoded_df = pd.get_dummies(sample_df, drop_first=True)

# Check number of total variables
encoded_df.shape

(15710, 91)

## 3.3. Feature scaling

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(encoded_df)
scaled = scaler.transform(encoded_df)

In [11]:
df_scaled = pd.DataFrame(scaled, columns=encoded_df.columns)

In [12]:
df_scaled.sample(10)

Unnamed: 0,LeadTime,ADR,ArrivalDateMonth_August,ArrivalDateMonth_December,ArrivalDateMonth_February,ArrivalDateMonth_January,ArrivalDateMonth_July,ArrivalDateMonth_June,ArrivalDateMonth_March,ArrivalDateMonth_May,...,ReservationMonth_December,ReservationMonth_February,ReservationMonth_January,ReservationMonth_July,ReservationMonth_June,ReservationMonth_March,ReservationMonth_May,ReservationMonth_November,ReservationMonth_October,ReservationMonth_September
13236,-0.944162,2.331531,2.799245,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,-0.311187,-0.209909,-0.296691,-0.253983,-0.296691,-0.336741,-0.267899
11214,3.232629,0.178578,-0.357239,-0.234822,-0.258917,-0.222963,2.978975,-0.332862,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,-0.311187,-0.209909,-0.296691,-0.253983,-0.296691,-0.336741,-0.267899
1940,-0.058176,0.498458,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,3.213503,-0.209909,-0.296691,-0.253983,-0.296691,-0.336741,-0.267899
4615,-0.998406,-0.173214,-0.357239,-0.234822,3.862243,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.298718,2.779888,-0.400546,-0.311187,-0.209909,-0.296691,-0.253983,-0.296691,-0.336741,-0.267899
15704,-0.998406,0.367212,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,-0.311187,-0.209909,-0.296691,-0.253983,-0.296691,2.969638,-0.267899
116,-0.582535,-0.682758,-0.357239,-0.234822,3.862243,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,3.347638,-0.359727,-0.400546,-0.311187,-0.209909,-0.296691,-0.253983,-0.296691,-0.336741,-0.267899
1794,0.936298,-0.008513,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,-0.311187,-0.209909,3.370511,-0.253983,-0.296691,-0.336741,-0.267899
7374,-0.853756,2.769019,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,3.004251,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,-0.311187,-0.209909,-0.296691,3.937271,-0.296691,-0.336741,-0.267899
7570,1.867488,0.367212,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,-0.311187,-0.209909,-0.296691,-0.253983,3.370511,-0.336741,-0.267899
14877,-0.383641,-1.125393,-0.357239,-0.234822,-0.258917,-0.222963,-0.335686,-0.332862,-0.297325,-0.339896,...,-0.298718,-0.359727,-0.400546,3.213503,-0.209909,-0.296691,-0.253983,-0.296691,-0.336741,-0.267899


---
The resulting dataframe is loaded in a csv file for further steps in a different notebook.

In [13]:
# Writing df dataframe to csv

path1 = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/processed/preprocessed.csv'
path2 = 'C:/Users/javie/OneDrive/Documents/springboard_projects/Capstone-3/data/processed/non_preprocessed.csv'

df_scaled.to_csv(path1)
sample_df.to_csv(path2)