# Feature Engineering Notebook

### Imports

In [1]:
import types
import re

from botocore.client import Config
import ibm_boto3
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler
import pandas as pd
import pickle

import warnings
# Globally silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, so API removals will only surface as hard errors after an upgrade.
warnings.filterwarnings('ignore')

%matplotlib inline

### Loading data from object storage

In [2]:
# The code was removed by Watson Studio for sharing.

In [3]:
def download_file_cos(credentials, local_file_name, key):
    """Download one object from IBM Cloud Object Storage to a local file.

    The download is best-effort: any failure is reported on stdout and the
    function returns normally, so callers must not assume the local file
    exists (or is fresh) afterwards.

    Parameters
    ----------
    credentials : dict
        Must provide the keys 'IBM_API_KEY_ID', 'IAM_SERVICE_ID',
        'IBM_AUTH_ENDPOINT', 'ENDPOINT' and 'BUCKET'.
    local_file_name : str
        Path the downloaded object is written to.
    key : str
        Key of the object inside the bucket.

    Returns
    -------
    None
    """
    cos = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=credentials['IBM_API_KEY_ID'],
        ibm_service_instance_id=credentials['IAM_SERVICE_ID'],
        ibm_auth_endpoint=credentials['IBM_AUTH_ENDPOINT'],
        config=Config(signature_version='oauth'),
        endpoint_url=credentials['ENDPOINT'],
    )
    try:
        cos.download_file(Bucket=credentials['BUCKET'], Key=key, Filename=local_file_name)
    except Exception as e:
        # Report the concrete exception class (the original printed the
        # literal base class `Exception`, which hid what actually failed).
        print(type(e).__name__, e)
    else:
        print('File Downloaded')

In [4]:
# Fetch the raw dataframe and the dtype dictionary; each object is stored
# locally under the same name it has in the bucket.
for pickle_name in ('df_raw_pickle.pickle', 'dtypes_dict.pickle'):
    download_file_cos(credentials=credentials, local_file_name=pickle_name, key=pickle_name)

File Downloaded
File Downloaded


In [5]:
# Both inputs are pickles: unpickling can execute arbitrary code, so these
# files must only ever come from a trusted bucket.
with open('dtypes_dict.pickle', 'rb') as dtypes_file:
    dtypes_dict = pickle.load(dtypes_file)

df = pd.read_pickle('df_raw_pickle.pickle')

### NA values
Fill N/A values with `-1`

In [6]:
# Replace missing codes with the sentinel -1.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# `df.<column>` is chained assignment and does not update `df` when pandas
# Copy-on-Write is active.
for na_col in ('season_holidayed_code', 'state_code_residence'):
    df[na_col] = df[na_col].fillna(-1)

### Correct the data types

Convert categorical features to `category` data type

In [7]:
# Cast every categorical feature to the pandas `category` dtype.
# Plain column assignment replaces the column (and therefore its dtype);
# `df.loc[:, col] = ...` writes the values into the existing column in place
# on pandas >= 2.0, which keeps the old dtype and silently drops the cast.
for col in dtypes_dict['category_feats']:
    df[col] = df[col].astype('category')

In [8]:
# Check the data types of the features (dtype names as strings, sorted)
dtype_names = df.dtypes.astype('str')
print(dtype_names.sort_values())

reservation_id                              category
reservationstatusid_code                    category
cluster_code                                category
memberid                                    category
booking_type_code                           category
member_age_buckets                          category
state_code_resort                           category
resort_id                                   category
resort_type_code                            category
room_type_booked_code                       category
persontravellingid                          category
main_product_code                           category
channel_code                                category
resort_region_code                          category
checkout_date                         datetime64[ns]
checkin_date                          datetime64[ns]
booking_date                          datetime64[ns]
season_holidayed_code                        float64
state_code_residence                         f

Add numerical features list to data types dictionary

In [9]:
# Every column that is neither a date nor a categorical feature is numerical.
non_numeric_feats = dtypes_dict['date_feats'] + dtypes_dict['category_feats']
dtypes_dict['num_feats'] = [name for name in df.columns if name not in non_numeric_feats]

### Feature Engineering
#### Date Features

In [10]:
# Day components of two date differences:
#   booking_in_advance = check-in date minus booking date
#   days_stayed        = check-out date minus check-in date
checkin = df['checkin_date']
df['booking_in_advance'] = checkin.sub(df['booking_date']).dt.days
df['days_stayed'] = df['checkout_date'].sub(checkin).dt.days

In [11]:
# function taken from https://github.com/fastai/fastai/blob/master/old/fastai/structured.py
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace.
    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    errors: Passed through to pd.to_datetime when the column has to be converted
        ("raise", "coerce" or "ignore", as accepted by pandas).
    Returns:
    --------
    None. The new columns are named after fldname with a trailing "date"/"Date"
    stripped, followed by the part name (e.g. "checkin_date" -> "checkin_Year").
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    # Public dtype check that accepts both naive and tz-aware datetimes and
    # also copes with pandas extension dtypes.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated in pandas 2.x, so it is no
        # longer passed.
        df[fldname] = fld = pd.to_datetime(fld, errors=errors)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time:
        attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # its replacement. Cast to the dtype of the other date parts so
            # the column type matches what .dt.week used to return.
            part = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            part = getattr(fld.dt, n.lower())
        df[targ_pre + n] = part
    # astype(int64) counts in the column's own resolution; normalise to
    # nanoseconds first (when pandas supports non-ns units) so the division
    # below always yields seconds since the Unix epoch.
    fld_ns = fld.dt.as_unit('ns') if hasattr(fld.dt, 'as_unit') else fld
    df[targ_pre + 'Elapsed'] = fld_ns.astype(np.int64) // 10 ** 9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [12]:
# Expand each datetime feature into its date-part columns; the original
# datetime column is dropped afterwards (drop=True).
for date_col in dtypes_dict['date_feats']:
    add_datepart(df, date_col, drop=True)

#### Other Features

In [13]:
# Party size: adults plus children on the reservation.
df['n_people'] = df['numberofadults'].add(df['numberofchildren'])

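Revisit the data types: collect the binary features (columns with exactly two unique values) and register every column that is not yet listed in the data types dictionary as a numerical feature.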

In [14]:
# A column counts as binary when it holds exactly two distinct values.
bin_feats = [name for name in df.columns if len(df[name].unique()) == 2]

dtypes_dict['bin_feats'] = bin_feats

In [15]:
# Flatten every feature list registered in the dictionary into one list.
known_feats = [feat for feat_group in dtypes_dict.values() for feat in feat_group]

In [16]:
# Any column not yet registered under some feature type is numerical.
unregistered_feats = [name for name in df.columns if name not in known_feats]
dtypes_dict['num_feats'].extend(unregistered_feats)

### Preprocessing

Converting the values of binary-type columns to [0, 1]

In [17]:
# One binarizer instance is refit on each binary column in turn, so after the
# loop `lb` only holds the classes of the last column processed.
lb = LabelBinarizer()

for bin_col in dtypes_dict['bin_feats']:
    binarized = lb.fit_transform(df[bin_col])
    df[bin_col] = binarized

Label Encoding the categorical features

In [18]:
# One encoder instance is refit on each categorical column in turn, so after
# the loop `le` only holds the label mapping of the last column processed.
le = LabelEncoder()

for cat_col in dtypes_dict['category_feats']:
    encoded = le.fit_transform(df[cat_col])
    df[cat_col] = encoded

Scaling the numerical features

In [19]:
# Each numerical column is standardised on its own; the scaler is refit per
# column, so after the loop `ss` only holds the statistics of the last one.
# NOTE(review): columns listed in both 'bin_feats' and 'num_feats' are scaled
# here after having been binarized - confirm this is intended.
ss = StandardScaler()

for num_col in dtypes_dict['num_feats']:
    column_2d = df[num_col].values.reshape(-1, 1)  # scaler expects a 2-D array
    df[num_col] = ss.fit_transform(column_2d)

### Data export

In [24]:
# Persist the processed frame twice: as a pickle and as a CSV with a header
# row and no index column.
with open('df_processed_pickle.pickle', 'wb') as processed_file:
    pickle.dump(df, processed_file, protocol=pickle.HIGHEST_PROTOCOL)

df.to_csv('df_processed.csv', index=False, header=True)

In [21]:
# Build the object-storage client explicitly: the only `cos` created in the
# visible notebook is a local variable inside download_file_cos, so relying on
# a global `cos` fails with NameError on a fresh kernel.
# The client is configured exactly like the one used for the downloads.
cos_client = ibm_boto3.client(
    service_name='s3',
    ibm_api_key_id=credentials['IBM_API_KEY_ID'],
    ibm_service_instance_id=credentials['IAM_SERVICE_ID'],
    ibm_auth_endpoint=credentials['IBM_AUTH_ENDPOINT'],
    config=Config(signature_version='oauth'),
    endpoint_url=credentials['ENDPOINT'],
)

# Upload both exported files under the same name they have locally.
for processed_name in ('df_processed_pickle.pickle', 'df_processed.csv'):
    cos_client.upload_file(Filename=processed_name, Bucket=credentials['BUCKET'], Key=processed_name)