# Data Pipeline for PriceMyRental

Import pandas and NumPy, along with the `get_data` function from `get_data.py`, which retrieves and parses the data.

In [20]:
import pandas as pd
import numpy as np
from get_data import get_data

Set the start and end dates that we want to pull data between, and call `get_data`, which saves the result to a CSV file that we then load into a pandas DataFrame. We set `one_city=True` so that we'll only retrieve data for San Francisco. This should take roughly 30 minutes. Set `print_urls=True` if you want to see the URL of each JSON as it's being parsed.

In [81]:
# These lines are commented out so the slow download (roughly 30 minutes) is not run inadvertently.
# NOTE(review): `date` is not imported in the import cell; uncommenting this
# presumably also requires `from datetime import date` -- confirm.
# start_date = date(2014, 9, 29)
# end_date = date(2015, 6, 17)
# get_data(start_date, end_date, one_city=True, print_urls=False)


In [None]:
# Load the csv saved by get_data.
# header=0 takes the column names from the first row of the file. The previous
# header=False only worked on old pandas (where False was treated as 0);
# current pandas rejects a boolean header with a TypeError.
raw_df = pd.read_csv('data/USA-SFO-SNF_USA-CA.csv', header=0)

In [5]:
# Keep only the rows whose region code is 'sfc', i.e. listings from the
# City of San Francisco itself.
sfdf = raw_df.loc[raw_df['region'].eq('sfc')]

# Write the filtered listings to disk (without the index column) so they can
# be reloaded directly later on.
sfdf.to_csv('data/sf_raw.csv', encoding='utf-8', index=False)

In [2]:
# Load the San Francisco-only csv written by the previous cell.
# header=0 takes the column names from the first row of the file. The previous
# header=False only worked on old pandas (where False was treated as 0);
# current pandas rejects a boolean header with a TypeError.
sfdf = pd.read_csv('data/sf_raw.csv', header=0)

In [14]:
# Collect the 39 most frequent neighborhood labels in the San Francisco
# listings, ordered from most to least common.
nhoods = sfdf['neighborhood'].value_counts().head(39).index.tolist()

In [15]:
# Remove erroneous neighborhood labels (entries that are not real neighborhoods).
# A filter is used instead of list.remove so the cell can be re-run safely:
# list.remove raises ValueError once a label has already been removed.
erroneous_labels = {'San Francisco', 'San Francisco, CA', 'all neighborhoods'}
nhoods = [hood for hood in nhoods if hood not in erroneous_labels]

In [16]:
# Display the neighborhood labels that remain after the cleanup above.
nhoods

['SOMA / south beach',
 'pacific heights',
 'mission district',
 'nob hill',
 'downtown / civic / van ness',
 'marina / cow hollow',
 'lower nob hill',
 'russian hill',
 'ingleside / SFSU / CCSF',
 'sunset / parkside',
 'lower pac hts',
 'castro / upper market',
 'richmond / seacliff',
 'noe valley',
 'inner sunset / UCSF',
 'potrero hill',
 'inner richmond',
 'north beach / telegraph hill',
 'financial district',
 'hayes valley',
 'tenderloin',
 'alamo square / nopa',
 'bernal heights',
 'laurel hts / presidio',
 'glen park',
 'twin peaks / diamond hts',
 'excelsior / outer mission',
 'cole valley / ashbury hts',
 'USF / panhandle',
 'bayview',
 'haight ashbury',
 'west portal / forest hill',
 'portola district',
 'lower haight',
 'western addition',
 'visitacion valley']

In [22]:
# Boolean mask over the listings: True where the listing's neighborhood is one
# of the labels kept in nhoods.
condition = sfdf.neighborhood.isin(nhoods)

In [23]:
# Display the mask (True where the listing's neighborhood is in nhoods).
condition

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
...
208572     True
208573     True
208574     True
208575     True
208576     True
208577     True
208578     True
208579     True
208580     True
208581    False
208582     True
208583     True
208584     True
208585     True
208586     True
Name: neighborhood, Length: 208587, dtype: bool

In [24]:
# Keep only the listings whose neighborhood is in nhoods (rows where the mask is True).
# Note: this rebinds sfdf, so the unfiltered frame is no longer available afterwards.
sfdf = sfdf[condition]

In [62]:
# De-duplicate the listings.
# Rows are ordered by the 'date' column first so that drop_duplicates (which
# keeps the first occurrence) retains the earliest row of each duplicate group.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
# sort_values returns a new frame, so sfdf itself is left untouched.
sf_deduped = (
    sfdf
    .sort_values('date')
    .drop_duplicates(subset='id')    # same listing id appearing more than once
    .drop_duplicates(subset='body')  # different ids but identical body text
)

In [63]:
# Number of listings left after de-duplication.
len(sf_deduped)

81087

In [65]:
# Preview the first rows of the de-duplicated listings.
sf_deduped.head()

Unnamed: 0,baths,beds,body,date,heading,id,lat,long,neighborhood,parking,price,region,washer_dryer
0,1.0,1,"\n We need to move out, but you can take over ...",2014-09-29 04:26:15,1 Bedroom Den Sublet Below Market Rate,1417995376,37.766746,-122.420654,mission district,4,3522,sfc,1
10,,0,\n **Open House on Friday 10/3 3pm to 5pm **\n...,2014-09-29 04:40:43,SRO Type Room In Chinatown,1418103620,37.7954,-122.406086,financial district,1,675,sfc,0
2,2.0,1,\n A spacious room on the top floor in a quiet...,2014-09-29 04:43:03,"Spacious furnished room, safe neighborhood, AV...",1418011844,37.775086,-122.444593,USF / panhandle,0,1325,sfc,0
1,3.0,2,\n We are renting our single family contempora...,2014-09-29 04:46:05,$4400 / 2br - 1700ft - Noe/Glen Park 2BR/3BA C...,1417995432,37.74128,-122.431662,noe valley,1,4400,sfc,1
9,1.0,2,\n Elegant and Charming Two Bedroom Corner Uni...,2014-09-29 04:50:50,Elegant and Charming Two Bedroom Corner Unit,1418073533,37.782434,-122.477345,richmond / seacliff,4,3700,sfc,1


In [97]:
# Inspect the most expensive listings to spot implausible prices.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
sf_deduped.sort_values('price', ascending=False)

Unnamed: 0,baths,beds,body,date,heading,id,lat,long,neighborhood,parking,price,region,washer_dryer
53980,2,3,"\n $6000 per month rent available Feb 1, 2015....",2014-12-21 01:06:36,3br 2ba flat in 4 unit Edwardian bldg,1633289837,37.790879,-122.413852,nob hill,1,60000000,sfc,1
193495,1,1,\n Lease take over!\nUnfurnished 1bd 1.5 bath ...,2015-05-24 16:05:33,"Sublet and lease take over Vara 1bd den, vau...",2181097107,37.766627,-122.420547,mission district,4,36004100,sfc,1
72887,1,1,\n This cozy townhome can be yours today! This...,2015-01-21 00:25:50,Charming 2 Bedroom 1 Bath Townhome Apartment n...,1742634601,37.715739,-122.481712,ingleside / SFSU / CCSF,0,25993300,sfc,0
34947,1,2,\n This is a fantastic unit.\n2 beds (Very sma...,2014-11-20 22:33:16,beautiful 2 bed/bath with yard utility cover,1524116046,37.717546,-122.403750,portola district,4,17001800,sfc,0
21290,1,0,"\n $1600 and 1650\nSeparate bath and kitchen, ...",2014-11-01 00:18:22,well located studio,1484395952,37.785368,-122.416685,downtown / civic / van ness,1,16001600,sfc,0
17091,1,0,"\n $1600 and 1650\nSeparate bath and kitchen, ...",2014-10-24 23:57:24,well located studio,1475580185,37.785368,-122.416685,downtown / civic / van ness,1,16001600,sfc,0
60636,2,4,"\n A safe, very quiet, nice, friendly &amp; pr...",2015-01-03 06:12:52,amazing lovely home to reach your dream,1676761615,37.754430,-122.446800,twin peaks / diamond hts,1,14981600,sfc,0
126670,1,3,\n The place is between Mason and Clay. GREAT ...,2015-02-22 02:05:30,Two rooms available to sign a lease,1849373421,37.793747,-122.410365,nob hill,1,14501200,sfc,1
147433,1,3,The place is between Mason and Clay. GREAT LOC...,2015-02-24 21:57:46,Two rooms available to sign a lease,1858824772,37.793747,-122.410365,nob hill,1,14501200,sfc,1
172347,3,4,\n Newly Remodel top unit with over 1600 sqft....,2015-04-24 20:11:18,"Top Unit Newly Remodeled 4 bed, 3 bath renting...",2070216670,37.716553,-122.400141,visitacion valley,1,11001500,sfc,0


In [113]:
# Keep only listings priced strictly below 20000.
# NOTE(review): the cutoff presumably targets the implausibly large prices seen
# when sorting by price (e.g. 16001600) -- confirm the threshold.
sf_deduped = sf_deduped.loc[sf_deduped['price'].lt(20000)]

In [114]:
# Per-neighborhood means of the numeric columns, computed over one-bedroom
# listings only.
# numeric_only=True is required on current pandas: the frame also holds text
# columns (body, date, heading, region), and mean() raises a TypeError on them
# instead of silently dropping them as old pandas did.
df_grouped = (
    sf_deduped[sf_deduped['beds'] == 1]
    .groupby('neighborhood')
    .mean(numeric_only=True)
)

In [115]:
# Display the per-neighborhood means computed from one-bedroom listings.
df_grouped

Unnamed: 0_level_0,baths,beds,id,lat,long,parking,price,washer_dryer
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SOMA / south beach,1.044012,1,1838042000.0,37.779185,-122.398773,1.861633,3530.412312,0.624512
USF / panhandle,1.011142,1,1772204000.0,37.776764,-122.44734,1.277628,2776.730458,0.088949
alamo square / nopa,1.0,1,1812892000.0,37.776124,-122.437073,1.573883,2785.35567,0.140893
bayview,1.006826,1,1802434000.0,37.724423,-122.390984,0.680352,2108.85044,0.519062
bernal heights,1.0,1,1785575000.0,37.741872,-122.417658,1.017804,2617.964243,0.133531
castro / upper market,1.006329,1,1804836000.0,37.761942,-122.4338,1.527981,3283.437956,0.301703
cole valley / ashbury hts,1.003356,1,1785752000.0,37.765437,-122.450181,1.359223,2877.763754,0.110032
downtown / civic / van ness,1.004046,1,1829226000.0,37.782167,-122.418239,1.96081,3163.279556,0.494448
excelsior / outer mission,1.029032,1,1793932000.0,37.72258,-122.436129,1.346821,1723.867052,0.138728
financial district,1.022646,1,1831249000.0,37.790513,-122.399628,0.886926,3416.300353,0.492344


In [117]:
# Index the listings by neighborhood so they can be joined against the
# per-neighborhood averages.
# The guard makes the cell safe to re-run: once 'neighborhood' has become the
# index it is no longer a column, and a second set_index would raise KeyError.
# Reassignment is used instead of inplace=True.
if 'neighborhood' in sf_deduped.columns:
    sf_deduped = sf_deduped.set_index('neighborhood')

In [123]:
# Attach each neighborhood's average one-bedroom price to every listing as a
# 'price_1bdavg' column, matching on the neighborhood index.
# NOTE(review): the joined frame is only displayed here, not stored.
sf_deduped.join(df_grouped[['price']].rename(columns={'price': 'price_1bdavg'}))

Unnamed: 0_level_0,baths,beds,body,date,heading,id,lat,long,parking,price,region,washer_dryer,price_1bdavg
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
SOMA / south beach,2,2,\n Millennium Towers is the Pinnacle of Luxury...,2014-09-29 05:51:39,MILLENNIUM Grand Residence w beautiful views o...,1418064527,37.780670,-122.388140,0,12500,sfc,1,3530.412312
SOMA / south beach,1,1,"\n Property Address:\n400 Beale Street, Unit #...",2014-09-29 06:32:29,Rincon Hill Highrise with Outdoor Patio Space,1418120828,37.786776,-122.390721,4,3500,sfc,1,3530.412312
SOMA / south beach,1,0,\n Open House Sunday 3:00-3:20\nJust Renovated...,2014-09-29 14:22:43,Just Renovated. Small Studio. Across St from T...,1418876409,37.776789,-122.415409,1,1625,sfc,0,3530.412312
SOMA / south beach,2,2,\n Contact info: Mark Anthony Venegas | SF Dre...,2014-09-29 15:05:16,Ready Today 09.29.14- Gorgeous 2BD/2BA Valet...,1419012743,37.791153,-122.395813,0,6950,sfc,1,3530.412312
SOMA / south beach,2,2,\n The Infinity! Spectacular City Views from t...,2014-09-29 15:45:57,The Infinity 301 Main St. 18H City View - Spin...,1419196022,37.789369,-122.391858,0,5700,sfc,0,3530.412312
SOMA / south beach,1,1,"\n 1 bed, 1 bath - FURNISHED Condo @ Arterra -...",2014-09-29 15:46:23,"1 bed, 1 bath - FURNISHED Condo Arterra - Prk...",1419237475,37.774031,-122.395577,4,3750,sfc,1,3530.412312
SOMA / south beach,1,1,\n Dogpatch LIVE/WORK LOFT - Direct Street Acc...,2014-09-29 15:49:54,Dogpatch LIVE/WORK LOFT - Direct Street Access...,1419159386,37.752481,-122.390666,4,3450,sfc,1,3530.412312
SOMA / south beach,1,1,"\n 1 Bed, 1.5 Bath - LIVE/WORK LOFT + 1 Car Pr...",2014-09-29 16:00:05,"1 Bed, 1.5 Bath - LIVE/WORK LOFT 1 Car Prkg ...",1419199508,37.780670,-122.388140,4,3950,sfc,1,3530.412312
SOMA / south beach,2,2,"\n 2 Bed, 2 Bath - Luxury SoMa Condo @ 140 Sou...",2014-09-29 16:02:47,"2 Bed, 2 Bath - Luxury SoMa Condo 140 South V...",1419533595,37.772449,-122.418425,4,3895,sfc,1,3530.412312
SOMA / south beach,2,2,\n Chic Mission Bay 2BR/2BA Condo w/Balcony! P...,2014-09-29 16:06:22,Chic Mission Bay 2BR/2BA Condo w/Balcony Pet O...,1419284474,37.780670,-122.388140,4,4500,sfc,1,3530.412312
