In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import glob
from datetime import datetime as dt
%matplotlib inline

from sklearn.model_selection import train_test_split

pd.set_option("display.max_colwidth",999)
pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

## for all csv's in the SF folder, concatenate into one dataframe

In [2]:
files = glob.glob('SF/*.gz')
df_sf = pd.concat([pd.read_csv(fp, parse_dates = ['calendar_last_scraped','last_scraped', 'first_review', 'last_review']) for fp in files], ignore_index=True)

  if self.run_code(code, result):


In [3]:
len(df_sf)

204559

## Drop the rows where the number of reviews is greater than 0 and the reviews per month is zero

In [4]:
df_sf = df_sf[(df_sf['reviews_per_month'].notnull()) | ((df_sf['number_of_reviews']== 0) & (df_sf['reviews_per_month'].isnull()))]

## fill 0 in the reviews per month where total reviews = 0

In [5]:
df_sf.reviews_per_month.fillna(value=0, inplace=True)

## drop listings where price is 0

In [6]:
df_sf['price'].replace('[\$,]', '', regex=True, inplace=True)
df_sf['price'] = df_sf.price.astype(float) ##
df_sf.drop(df_sf[df_sf.price == 0].index, inplace=True)

In [7]:
len(df_sf)

204235

## create year and month columns

In [8]:
df_sf['year'] = df_sf.calendar_last_scraped.apply(lambda x: x.year)
df_sf['month'] = df_sf.calendar_last_scraped.apply(lambda x: x.month)

## Fill nulls

In [9]:
df_sf.cancellation_policy.fillna(value='none', inplace=True)
df_sf.cleaning_fee.fillna(value='$0.00', inplace=True)

#do host fill out information?
df_sf.loc[df_sf.host_about.notnull(), 'host_about_filled'] = True
df_sf.loc[df_sf.host_about.isnull(), 'host_about_filled'] = False
df_sf.loc[df_sf.host_picture_url.notnull(), 'host_picture_url_filled'] = True
df_sf.loc[df_sf.host_picture_url.isnull(), 'host_picture_url_filled'] = False
df_sf.loc[df_sf.access.notnull(), 'access_filled'] = True
df_sf.loc[df_sf.access.isnull(), 'access_filled'] = False
df_sf.loc[df_sf.house_rules.notnull(), 'house_rules_filled'] = True
df_sf.loc[df_sf.house_rules.isnull(), 'house_rules_filled'] = False
df_sf.loc[df_sf.space.notnull(), 'space_filled'] = True
df_sf.loc[df_sf.space.isnull(), 'space_filled'] = False

df_sf['extra_people'].replace('[\$,]', '', regex=True, inplace=True)

df_sf.property_type.fillna(value = 'Other', inplace=True) ##

#bedroom average is 1. 83 nulls. 
df_sf.bedrooms.fillna(value = 1, inplace=True) ##
df_sf.bedrooms.replace(0, 1, inplace=True) ##

df_sf.beds.fillna(value = 1, inplace=True) ##

df_sf['log_price'] = df_sf['price'].apply(np.log)

#value of the place you are booking. take log to get 
df_sf['price_per_bedroom'] = df_sf['log_price']/df_sf['bedrooms'] ##

#how much each person pays
df_sf['price_per_guest'] = df_sf['log_price']/(df_sf['guests_included']+1) ##adding one to included booker

#do you get your own bed?
df_sf['guest_per_bed'] = df_sf['guests_included']/df_sf['beds'] ##

#do you get your own bedroom?
df_sf['guest_per_bedroom'] = df_sf['guests_included']/df_sf['bedrooms'] ##

In [20]:
df_sf.property_type.value_counts()

Apartment             117960
House                  59201
Condominium            12083
Loft                    3241
Other                   2372
Bed & Breakfast         1922
Townhouse               1445
Dorm                    1385
Boutique hotel           825
In-law                   639
Guest suite              599
Timeshare                560
Guesthouse               388
Camper/RV                282
Bungalow                 267
Hostel                   227
Cabin                    214
Boat                     141
Entire Floor              90
Villa                     60
Treehouse                 58
Castle                    46
Yurt                      28
Tent                      27
Cave                      24
Lighthouse                21
Serviced apartment        21
Floor                     20
Earth House                8
Hotel                      8
Island                     3
Pension (Korea)            3
Casa particular            2
Plane                      2
Chalet        

In [10]:
df_sf['price_per_guest'].isnull().sum()

0

## 2017 dataframe

In [11]:
df_sf_2017 = df_sf[df_sf['year']==2017]

## 2.57 is the cutoff reviews per month to be in the top 20% of listings for all of SF in 2017

In [10]:
# df_sf.loc[df_sf.reviews_per_month >= 2.54, 'popular'] = True
# df_sf.loc[df_sf.reviews_per_month < 2.54, 'popular'] = False

In [12]:
df_sf_2017.loc[df_sf_2017.reviews_per_month >= 2.57, 'popular'] = True
df_sf_2017.loc[df_sf_2017.reviews_per_month < 2.57, 'popular'] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [11]:
df_sf_2017.popular.value_counts(1)

False    0.792064
True     0.207936
Name: popular, dtype: float64

## pickle the two dataframes

In [71]:
# df_sf.to_pickle('data_sf.p')
# df_sf_2017.to_pickle('data_sf_2017.p')

## Should instead save to json file instead of pickle flie

In [13]:
df_sf_2017.to_json('df_sf_2017',orient='columns')

In [14]:
!ls -l

total 747768
-rw-rw-r-- 1 ubuntu ubuntu     14960 Aug  2 20:18 data_cleaning.ipynb
-rw-rw-r-- 1 ubuntu ubuntu 650806844 Aug  2 20:19 df_sf_2017
-rw-rw-r-- 1 ubuntu ubuntu    363588 Aug  2 00:33 EDA_SF_2017.ipynb
-rw-rw-r-- 1 ubuntu ubuntu   4797091 Aug  2 18:26 EDA_top_20.ipynb
-rw-rw-r-- 1 ubuntu ubuntu    410364 Aug  2 19:08 nb_model0.p
-rw-rw-r-- 1 ubuntu ubuntu    401756 Aug  2 19:11 nb_model1.p
-rw-rw-r-- 1 ubuntu ubuntu    410364 Aug  1 20:24 nb_model_2.p
-rw-rw-r-- 1 ubuntu ubuntu    403388 Aug  2 19:14 nb_model2.p
-rw-rw-r-- 1 ubuntu ubuntu    401148 Aug  2 19:16 nb_model3.p
-rw-rw-r-- 1 ubuntu ubuntu    406524 Aug  2 19:19 nb_model4.p
-rw-rw-r-- 1 ubuntu ubuntu    410716 Aug  2 19:21 nb_model5.p
-rw-rw-r-- 1 ubuntu ubuntu    404540 Aug  2 19:24 nb_model6.p
-rw-rw-r-- 1 ubuntu ubuntu    409372 Aug  2 19:27 nb_model7.p
-rw-rw-r-- 1 ubuntu ubuntu    410204 Aug  2 19:29 nb_model8.p
-rw-rw-r-- 1 ubuntu ubuntu    410295 Aug  1 20:44 nb_model_guassian.p
-rw-rw-r-- 1 u

In [12]:
df_sf_2017 = pd.read_json('df_sf_2017')

In [13]:
df_sf_2017.price_per_bedroom.isnull().sum()

0