# Data Pipeline for PriceMyRental

Import pandas and numpy, along with the `get_data` function from `get_data.py`, which retrieves and parses the data.

In [20]:
import pandas as pd
import numpy as np
from get_data import get_data

Set the start and end dates that we want to pull data between, and call `get_data`, which saves the result to a csv file that we then load into a pandas dataframe. We pass `one_city=True` so that we only retrieve data for San Francisco. This should take roughly 30 minutes. Pass `print_urls=True` if you want to see the URL of each JSON file as it is being parsed.

In [81]:
# These are commented out so they're not run inadvertently.
# NOTE(review): `date` is not imported by the import cell above; presumably
# `from datetime import date` is needed before uncommenting -- confirm.
# start_date = date(2014, 9, 29)
# end_date = date(2015, 6, 17)
# get_data(start_date, end_date, one_city=True, print_urls=False)


In [None]:
# Load the csv saved by get_data.
# header=0 reads the column names from the first row of the file. A boolean
# `header` (the previous `header = False`) is rejected with a TypeError by
# current pandas releases.
raw_df = pd.read_csv('data/USA-SFO-SNF_USA-CA.csv', header=0)

In [5]:
# Keep only the listings whose region code is 'sfc' (the City of San Francisco)
is_sf_city = raw_df['region'] == 'sfc'
sfdf = raw_df[is_sf_city]

# Write the filtered listings to a csv so they can be reloaded conveniently
sfdf.to_csv('data/sf_raw.csv', encoding='utf-8', index=False)

In [2]:
# Load the sf_raw csv.
# header=0 reads the column names from the first row of the file. A boolean
# `header` (the previous `header=False`) is rejected with a TypeError by
# current pandas releases.
sfdf = pd.read_csv('data/sf_raw.csv', header=0)

In [14]:
# Collect the 39 most frequent neighborhood labels among the SF listings
# (value_counts orders labels from most to least frequent)
nhood_counts = sfdf['neighborhood'].value_counts()
nhoods = list(nhood_counts.index[:39])

In [15]:
# Remove erroneous neighborhood labels.
# Filtering instead of calling list.remove keeps this cell safe to re-run:
# list.remove raises ValueError once a label has already been removed.
bad_labels = ('San Francisco', 'San Francisco, CA', 'all neighborhoods')
nhoods = [label for label in nhoods if label not in bad_labels]

In [16]:
# Display the neighborhood labels that remain after the cleanup above
nhoods

['SOMA / south beach',
 'pacific heights',
 'mission district',
 'nob hill',
 'downtown / civic / van ness',
 'marina / cow hollow',
 'lower nob hill',
 'russian hill',
 'ingleside / SFSU / CCSF',
 'sunset / parkside',
 'lower pac hts',
 'castro / upper market',
 'richmond / seacliff',
 'noe valley',
 'inner sunset / UCSF',
 'potrero hill',
 'inner richmond',
 'north beach / telegraph hill',
 'financial district',
 'hayes valley',
 'tenderloin',
 'alamo square / nopa',
 'bernal heights',
 'laurel hts / presidio',
 'glen park',
 'twin peaks / diamond hts',
 'excelsior / outer mission',
 'cole valley / ashbury hts',
 'USF / panhandle',
 'bayview',
 'haight ashbury',
 'west portal / forest hill',
 'portola district',
 'lower haight',
 'western addition',
 'visitacion valley']

In [22]:
# Boolean mask: True for each listing whose neighborhood is one of the kept labels
condition = sfdf.neighborhood.isin(nhoods)

In [23]:
# Inspect the boolean mask (one True/False entry per row of sfdf)
condition

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
...
208572     True
208573     True
208574     True
208575     True
208576     True
208577     True
208578     True
208579     True
208580     True
208581    False
208582     True
208583     True
208584     True
208585     True
208586     True
Name: neighborhood, Length: 208587, dtype: bool

In [24]:
# Keep only the rows where the mask is True, i.e. listings in the kept neighborhoods.
# Note: this rebinds sfdf, so the unfiltered frame is no longer available under this name.
sfdf = sfdf[condition]

In [59]:
# Collect the listings whose id repeats that of an earlier row, ordered by id.
# (duplicated() flags every occurrence after the first by default.)
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
dup_condition = sfdf.duplicated('id')
dup_df = sfdf[dup_condition].sort_values('id')

In [61]:
# Show the duplicated listings, most recent date first.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
dup_df.sort_values('date', ascending=False)

Unnamed: 0,baths,beds,body,date,heading,id,lat,long,neighborhood,parking,price,region,washer_dryer
149192,,0,Rentsfnow | Megan\n It doesn`t get much better...,2015-02-22 07:45:18,Grand Studio in Coveted Neighborhood Pets Wel...,1849941780,37.791097,-122.408913,nob hill,3,2695,sfc,1
149193,,1,Megan\n 1 BR | $0 Deposit\n $2995 | 969 Bush #...,2015-02-22 07:44:59,Unique 1 BR in Nob Hill Still NO Deposit,1849941641,37.789594,-122.413222,nob hill,0,2995,sfc,0
149194,,1,Megan\n 1 BR | $0 Deposit\n $2995 | 969 Bush #...,2015-02-22 07:44:16,HUGE 1 BR - can be used as 2 Available NOW ...,1849941730,37.789594,-122.413222,nob hill,0,2995,sfc,0
149195,1,6,Coming available March 15th to April 1st is a ...,2015-02-22 07:36:47,"Newly remodeled, spacious, view, laundry, hard...",1849942009,37.777753,-122.438872,western addition,1,8450,sfc,1
149196,1,1,"Coming available February 1st is a large, rece...",2015-02-22 07:36:40,"LOCATION: laundry, large, heart of Castro:",1849941940,37.759536,-122.434795,castro / upper market,1,3250,sfc,1
149197,1,3,2nd floor remodeled unit located in one of the...,2015-02-22 07:27:36,Remodeled 2bedroomsSunroom unit Facing GG park,1849941606,37.771542,-122.506685,richmond / seacliff,4,3500,sfc,0
149198,,1,Furnished OR Unfurnished - Ideal for 1 only - ...,2015-02-22 07:21:06,1br.-- TOP FLOOR BAY VIEWS,1849927257,37.800200,-122.409100,north beach / telegraph hill,0,3300,sfc,0
149199,,1,"FOR RENT-Furnished OR Unfurnished, a One Bedro...",2015-02-22 07:14:30,TELEGRAPH HILL GRAND VIEW,1849916301,37.800200,-122.409100,north beach / telegraph hill,0,3300,sfc,0
149200,1,2,"This is a cozy and bright two bedroom, one bat...",2015-02-22 06:53:22,Charming two bedroom with panoramic ocean view,1849890831,37.744100,-122.486300,sunset / parkside,4,3200,sfc,0
149201,1,2,No Fee Brokers\n San Francisco - Pacific Heigh...,2015-02-22 06:43:20,"2br Pacific Heights Classic Flat, deck, pkg - ...",1849863858,37.796063,-122.428538,pacific heights,0,5500,sfc,0


In [62]:
# De-duplicate the listings. Rows are ordered by date first so that
# drop_duplicates (which keeps the first occurrence by default) retains the
# first row in date order for each id, and then for each body text.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
# sort_values returns a new frame, so sfdf itself is left unmodified and no
# explicit copy is needed.
sf_deduped = (
    sfdf
    .sort_values('date')
    .drop_duplicates('id')
    .drop_duplicates('body')
)

In [63]:
# Number of listings left after de-duplication
sf_deduped.shape[0]

81087

In [64]:
# Listings per neighborhood after de-duplication, most frequent first
sf_deduped['neighborhood'].value_counts()

SOMA / south beach              11664
mission district                 5189
lower nob hill                   4084
pacific heights                  3968
nob hill                         3941
downtown / civic / van ness      3750
marina / cow hollow              3727
sunset / parkside                3267
noe valley                       2740
ingleside / SFSU / CCSF          2639
inner sunset / UCSF              2541
castro / upper market            2525
lower pac hts                    2502
richmond / seacliff              2494
russian hill                     2448
potrero hill                     2173
inner richmond                   2002
financial district               1977
hayes valley                     1944
north beach / telegraph hill     1886
alamo square / nopa              1473
bernal heights                   1208
excelsior / outer mission        1128
tenderloin                       1089
bayview                          1055
twin peaks / diamond hts          982
laurel hts /