# 8. CHICAGO AIRBNB FEATURE ENGINEERING
---

## 1. The Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Notebook display settings.
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 999)
# Use the fully qualified option name: option keys are resolved by pattern
# matching, and the bare pattern 'precision' is ambiguous (OptionError) on
# pandas versions that also define 'styler.format.precision'.
pd.set_option("display.precision", 3)

chicago = pd.read_csv('data/chicago_airbnb2')

# Columns handled as numerical features.
num_cols = ['host_rr', 'host_ar', 'latitude', 'longitude', 'accommodates',
            'baths', 'bedrooms', 'beds', 'min_nights', 'max_nights', 'n_reviews', 
            'rs_rating', 'rs_accuracy', 'rs_cleanliness', 'rs_check_in', 
            'rs_communication', 'rs_location', 'rs_value', 'reviews_per_month']

# Columns handled as categorical features.
cat_cols = ['host_rt', 'superhost', 'neighborhood', 'property_type', 'room_type', 
            'amenities', 'instant_bookable']

# Target column.
y_col = ['price']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train, test = train_test_split(chicago, test_size=0.2, random_state=42)

# Separate the features from the target in each split.
X_train = train.drop('price', axis=1)
y_train = train['price']

X_test = test.drop('price', axis=1)
y_test = test['price']

# Last expression of the cell: shows the shapes of the four pieces.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5240, 26), (5240,), (1310, 26), (1310,))

## 2. New Numerical Attributes via Custom Transformation

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class AttributeAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends two ratio columns to a DataFrame.

    The added columns are ``beds_per_bdrm`` (``beds / bedrooms``) and
    ``baths_per_bed`` (``baths / beds``). The input frame must therefore
    contain the ``beds``, ``bedrooms`` and ``baths`` columns.
    """

    def __init__(self):
        """No hyperparameters to configure."""

    def fit(self, df, y=None):
        """Learn nothing and return ``self`` (kept for the estimator API)."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the two ratio columns appended.

        The input frame is not modified. A zero denominator would produce
        +/-inf; those values are replaced with NaN so they are treated as
        missing rather than as infinite values, which scikit-learn's input
        validation rejects.
        """
        df_ = df.copy()
        df_['beds_per_bdrm'] = df_['beds'] / df_['bedrooms']
        df_['baths_per_bed'] = df_['baths'] / df_['beds']

        # x / 0 gives +/-inf in pandas (0 / 0 already gives NaN); normalise to NaN.
        ratio_cols = ['beds_per_bdrm', 'baths_per_bed']
        df_[ratio_cols] = df_[ratio_cols].replace([np.inf, -np.inf], np.nan)
        return df_
    
# Build the transformer, then fit it and apply it to the numeric training columns.
adder = AttributeAdder()
X_train_num = adder.fit(train[num_cols]).transform(train[num_cols])
print(X_train_num.shape)  # shape after the ratio columns have been appended
X_train_num.head()

(5240, 21)


Unnamed: 0,host_rr,host_ar,latitude,longitude,accommodates,baths,bedrooms,beds,min_nights,max_nights,n_reviews,rs_rating,rs_accuracy,rs_cleanliness,rs_check_in,rs_communication,rs_location,rs_value,reviews_per_month,beds_per_bdrm,baths_per_bed
3692,100.0,100.0,41.956,-87.666,8,1.0,4.0,4.0,1.0,120.0,82,98.0,10.0,9.0,9.0,9.0,10.0,9.0,4.7,1.0,0.25
1553,100.0,98.0,41.919,-87.69,6,1.0,2.0,3.0,2.0,1125.0,218,98.0,10.0,10.0,10.0,10.0,10.0,10.0,5.16,1.5,0.333
4338,100.0,100.0,41.95,-87.762,2,1.0,1.0,1.0,1.0,30.0,24,96.0,10.0,10.0,10.0,10.0,10.0,10.0,1.87,1.0,1.0
107,100.0,94.0,42.004,-87.667,2,0.75,1.0,2.0,2.0,90.0,233,98.0,10.0,10.0,10.0,10.0,10.0,10.0,2.59,2.0,0.375
29,100.0,86.0,41.688,-87.609,3,1.0,1.0,1.0,1.0,365.0,269,98.0,10.0,10.0,10.0,10.0,10.0,10.0,2.46,1.0,1.0


In [5]:
# Rebind num_cols (a plain list until now) to the column Index of the transformed frame.
# NOTE(review): the last two names, 'beds_per_bdrm' and 'baths_per_bed', exist only in
# X_train_num — AttributeAdder works on a copy, so `train` itself lacks these columns.
num_cols = X_train_num.columns
num_cols

Index(['host_rr', 'host_ar', 'latitude', 'longitude', 'accommodates', 'baths',
       'bedrooms', 'beds', 'min_nights', 'max_nights', 'n_reviews',
       'rs_rating', 'rs_accuracy', 'rs_cleanliness', 'rs_check_in',
       'rs_communication', 'rs_location', 'rs_value', 'reviews_per_month',
       'beds_per_bdrm', 'baths_per_bed'],
      dtype='object')

## 3. Scaling Numerical Attributes

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_num = train[num_cols].copy()
X_train_scaled = scaler.fit_transf