### 1. Load data

In [1]:
import pandas as pd

In [2]:
la_listings = pd.read_csv('../Predicting_Airbnb_Prices/la_airbnb_may_2017.csv')
print(la_listings.shape)

la_listings.head()

(31253, 21)


Unnamed: 0,id,host_id,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,beds,...,weekly_price,monthly_price,security_deposit,cleaning_fee,minimum_nights,maximum_nights,number_of_reviews,review_scores_rating,instant_bookable,cancellation_policy
0,16228948,54345288,34.53035,-118.227088,House,Entire home/apt,10,7.0,5.0,5.0,...,,,2000.0,200.0,1,1125,0,,t,strict
1,8909380,44800067,34.485431,-118.12538,House,Private room,2,1.0,1.0,1.0,...,,,,,1,1125,33,93.0,f,flexible
2,14078522,84615808,34.51013,-118.212407,House,Private room,2,1.0,1.0,1.0,...,,,,,1,1125,14,100.0,f,flexible
3,13006928,71743973,34.515486,-118.230742,Other,Entire home/apt,6,1.0,1.0,3.0,...,,,,35.0,1,2,22,100.0,t,flexible
4,7898757,41646908,34.031419,-118.299016,Apartment,Private room,1,1.0,1.0,1.0,...,,,,5.0,1,90,3,93.0,f,flexible


### 2. Split data into features/target, train/test

In [3]:
la_listings.columns

Index(['id', 'host_id', 'latitude', 'longitude', 'property_type', 'room_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price',
       'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee',
       'minimum_nights', 'maximum_nights', 'number_of_reviews',
       'review_scores_rating', 'instant_bookable', 'cancellation_policy'],
      dtype='object')

In [4]:
features = la_listings.drop('price', axis=1)

In [5]:
# Drop more features because data size is too big, keeps crashing kernel

print(features.shape)
features.drop('id', axis=1, inplace=True)
features.drop('host_id', axis=1, inplace=True)
features.drop('latitude', axis=1, inplace=True)
features.drop('longitude', axis=1, inplace=True)
features.drop('security_deposit', axis=1, inplace=True)
features.drop('cleaning_fee', axis=1, inplace=True)
features.drop('minimum_nights', axis=1, inplace=True)
features.drop('maximum_nights', axis=1, inplace=True)
features.drop('property_type', axis=1, inplace=True)
print(features.shape)

(31253, 20)
(31253, 11)


In [6]:
features.columns

Index(['room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds',
       'weekly_price', 'monthly_price', 'number_of_reviews',
       'review_scores_rating', 'instant_bookable', 'cancellation_policy'],
      dtype='object')

In [7]:
target = la_listings['price']

### 3. Clean data (handle NaNs, pd.get_dummies & scale on categorical features)

Handle NaNs

In [8]:
features.shape

(31253, 11)

In [9]:
for col in features.columns:
    if features[col].isnull().sum() > 500:
        percentage = features[col].isnull().sum()/31253
        print("{0:20} {1}% null".format(col, round(percentage,2)))

weekly_price         0.76% null
monthly_price        0.79% null
review_scores_rating 0.24% null


In [10]:
for col in features.columns:
    if features[col].isnull().sum() > 0:
        features[col].fillna(0, inplace=True)

Create numerical and categorical dataframes (for pd.get_dummies & scale data)

In [11]:
x = features.dtypes
data_types = pd.Series(x.values)
data_types.value_counts()

float64    6
object     3
int64      2
dtype: int64

In [12]:
# Examine categorical data
for col in features.columns:
    if features[col].dtypes == 'object':
        print(col)

room_type
instant_bookable
cancellation_policy


In [13]:
print(len(features['room_type'].unique()))
print(len(features['instant_bookable'].unique()))
print(len(features['cancellation_policy'].unique()))

3
2
5


In [14]:
# Examine numerical data
for col in features.columns:
    if features[col].dtypes == 'int64':
        print(col)

accommodates
number_of_reviews


In [15]:
# Examine numerical data
for col in features.columns:
    if features[col].dtypes == 'float64':
        print(col)

bathrooms
bedrooms
beds
weekly_price
monthly_price
review_scores_rating


In [16]:
num_list = []
cat_list = []

for col in features.columns:
    if features[col].dtype == 'float64' or features[col].dtype == 'int64':
        num_list.append(col)
    if features[col].dtype == 'object':
        cat_list.append(col)
    else: 
        pass

In [17]:
num_list

['accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'weekly_price',
 'monthly_price',
 'number_of_reviews',
 'review_scores_rating']

In [18]:
cat_list

['room_type', 'instant_bookable', 'cancellation_policy']

In [19]:
print(features.shape)
features_dummies = pd.get_dummies(features)
print(features_dummies.shape)

(31253, 11)
(31253, 18)


In [20]:
features_dummies.head(2)

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,weekly_price,monthly_price,number_of_reviews,review_scores_rating,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,instant_bookable_f,instant_bookable_t,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,10,7.0,5.0,5.0,0.0,0.0,0,0.0,1,0,0,0,1,0,0,1,0,0
1,2,1.0,1.0,1.0,0.0,0.0,33,93.0,0,1,0,1,0,1,0,0,0,0


In [21]:
# Always scale your dummies
from sklearn.preprocessing import StandardScaler

In [22]:
scaler = StandardScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(features_dummies), columns=features_dummies.columns)

In [23]:
features_scaled.head(2)

Unnamed: 0,accommodates,bathrooms,bedrooms,beds,weekly_price,monthly_price,number_of_reviews,review_scores_rating,room_type_Entire home/apt,room_type_Private room,room_type_Shared room,instant_bookable_f,instant_bookable_t,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60
0,2.648892,6.772566,3.680124,2.041956,-0.333036,-0.310263,-0.546962,-1.749172,0.858347,-0.769483,-0.234953,-1.63188,1.63188,-0.666201,-0.576303,1.122771,-0.005657,-0.029945
1,-0.574463,-0.453976,-0.351349,-0.580491,-0.333036,-0.310263,0.319043,0.525217,-1.165029,1.299574,-0.234953,0.61279,-0.61279,1.501049,-0.576303,-0.890654,-0.005657,-0.029945


In [24]:
features_scaled.shape

(31253, 18)

### Export data

In [25]:
from sklearn.externals import joblib

In [26]:
joblib.dump(features, 'features_og.pickle')

['features_og.pickle']

In [27]:
joblib.dump(features_scaled, 'features_transformed.pickle')

['features_transformed.pickle']

In [28]:
joblib.dump(target, 'target.pickle')

['target.pickle']