In [1]:
import numpy as np 
import pandas as pd 
import requests 
import matplotlib.pyplot as plt 
from tqdm import tqdm
%matplotlib inline 

In [2]:
r = requests.get('https://www.renthop.com/listings/e-39th-st/19c/14892083') 

In [3]:
from bs4 import BeautifulSoup 

In [4]:
soup = BeautifulSoup(r.content, "html.parser") 

In [5]:
listing_divs = soup.select('div[class*=search-info]')

In [6]:
len(listing_divs)

13

In [7]:
def parse_data(listing_divs):
    listing_list = [] 
    for idx in range(len(listing_divs)):
        indv_listing = []
        current_listing = listing_divs[idx]
        href = current_listing.select('a[id*=title]')[0]['href'] 
        addy = current_listing.select('a[id*=title]')[0].string 
        hood = current_listing.select('div[id*=hood]')[0].string.replace('\n','') 
        indv_listing.append(href)
        indv_listing.append(addy)
        indv_listing.append(hood)

        listing_specs = current_listing.select('table[id*=info] tr') 

        for spec in listing_specs:
            try: 
                #print(spec.text.replace(' ', '_').split())
                parts =spec.text.replace(' ', '').split()
                if len(parts)==4:
                    parts[3],parts[2] =parts[2],parts[3]
                #print(parts)
                indv_listing.extend(parts) 
            except: 
                indv_listing.extend(np.nan) 
        listing_list.append(indv_listing)  
    return listing_list

In [8]:
listing_list = parse_data(listing_divs)

In [9]:
url_prefix = "https://www.renthop.com/search/nyc?max_price=50000&min_price=0&page=" 
page_no = 1 
url_suffix = "&sort=hopscore&q=&search=0" 

In [11]:
all_pages_parsed = [] 
for i in tqdm(range(1,101)): 
    page_no = i 
    target_page = url_prefix + str(page_no) + url_suffix 
     
    r = requests.get(target_page) 
     
    soup = BeautifulSoup(r.content, 'html.parser') 
     
    listing_divs = soup.select('div[class*=search-info]') 
     
    one_page_parsed = parse_data(listing_divs) 
     
    all_pages_parsed.extend(one_page_parsed) 

    

100%|█████████████████████████████████████████████████████████████████████████████████| 100/100 [02:29<00:00,  1.46s/it]


In [12]:
all_pages_parsed[0]

['https://www.renthop.com/listings/332-e-109th-street/5/15169991',
 '331 E 109th St, Apt 5',
 'East Harlem, Upper Manhattan, Manhattan',
 '$2,250',
 '2Bed',
 '1Bath']

In [13]:
rent_listing_df  = pd.DataFrame(all_pages_parsed,columns=['url', 'address', 'neighborhood', 'rent','bed','baths','Additional'])
rent_listing_df.head()

Unnamed: 0,url,address,neighborhood,rent,bed,baths,Additional
0,https://www.renthop.com/listings/332-e-109th-s...,"331 E 109th St, Apt 5","East Harlem, Upper Manhattan, Manhattan","$2,250",2Bed,1Bath,
1,https://www.renthop.com/listings/74-macdonough...,"74 Macdonough St, Apt 4","Bedford-Stuyvesant, Northern Brooklyn, Brooklyn","$2,750",2Bed,2Bath,
2,https://www.renthop.com/listings/167-west-10th...,"167 West 10th Street, Apt TOWN...","West Village, Downtown Manhattan, Manhattan","$9,995",3Bed,2Bath,
3,https://www.renthop.com/listings/w-191-street/...,W 191 Street,"Fort George, Washington Heights, Upper Manhatt...","$1,568",1Bed,1Bath,
4,https://www.renthop.com/listings/1385-york-ave...,"1385 York Ave, Apt G","Upper East Side, Upper Manhattan, Manhattan","$6,995",2Bed,3Bath,/Flex3


In [14]:
rent_listing_df[rent_listing_df['Additional'].notnull()].head()

Unnamed: 0,url,address,neighborhood,rent,bed,baths,Additional
4,https://www.renthop.com/listings/1385-york-ave...,"1385 York Ave, Apt G","Upper East Side, Upper Manhattan, Manhattan","$6,995",2Bed,3Bath,/Flex3
11,https://www.renthop.com/listings/82-76-116th-s...,"82-76 116th Street, Kew Garden...","Kew Gardens, Northeastern Queens, Queens","$2,756",2Bed,2Bath,/Flex3
24,https://www.renthop.com/listings/82-76-116th-s...,"82-76 116th Street, Kew Garden...","Kew Gardens, Northeastern Queens, Queens","$2,756",2Bed,2Bath,/Flex3
75,https://www.renthop.com/listings/550-west-140t...,"550 West 140th Street, Apt 2","Hamilton Heights, West Harlem, Upper Manhattan...","$1,984",Studio,1Bath,/Flex1
85,https://www.renthop.com/listings/550-west-140t...,"550 West 140th Street, Apt 2","Hamilton Heights, West Harlem, Upper Manhattan...","$1,984",Studio,1Bath,/Flex1


In [15]:
rent_listing_df['bed'].value_counts()

1Bed      688
2Bed      591
Studio    322
3Bed      281
4Bed       87
Room       13
5Bed       10
Loft        6
7Bed        1
6Bed        1
Name: bed, dtype: int64

In [16]:
rent_listing_df['baths'].value_counts()

1Bath      1485
2Bath       424
3Bath        53
1.5Bath      18
2.5Bath      10
4Bath         6
7Bath         2
5Bath         1
3.5Bath       1
Name: baths, dtype: int64

In [17]:
rent_listing_df['rent'] = rent_listing_df['rent'].map(lambda x: str(x).replace('$','').replace(',','')).astype('int')

In [18]:
rent_listing_df['bed'] = rent_listing_df['bed'].map(lambda x: x.replace('Bed',''))
rent_listing_df['bed'] = rent_listing_df['bed'].map(lambda x: x.replace('Studio','0'))
rent_listing_df['bed'] = rent_listing_df['bed'].map(lambda x: x.replace('Room','0'))
rent_listing_df['bed'] = rent_listing_df['bed'].map(lambda x: x.replace('Loft','0')).astype(int)
 

In [19]:
rent_listing_df['baths'] = rent_listing_df['baths'].map(lambda x: x.replace('Bath','')).astype(float)

In [20]:
rent_listing_df.describe()

Unnamed: 0,rent,bed,baths
count,2000.0,2000.0,2000.0
mean,4073.345,1.562,1.29525
std,2423.338656,1.102163,0.564457
min,880.0,0.0,1.0
25%,2750.0,1.0,1.0
50%,3543.0,1.0,1.0
75%,4727.5,2.0,1.5
max,49500.0,7.0,7.0


In [21]:
rent_listing_df['neighborhood'] = rent_listing_df['neighborhood'].map(lambda x: x.strip()) 

In [22]:
rent_listing_df.groupby('neighborhood')['rent'].count().to_frame('count').sort_values(by='count', ascending=False) 

Unnamed: 0_level_0,count
neighborhood,Unnamed: 1_level_1
"Hell's Kitchen, Midtown Manhattan, Manhattan",155
"Upper East Side, Upper Manhattan, Manhattan",108
"Yorkville, Upper East Side, Upper Manhattan, Manhattan",100
"Murray Hill, Midtown Manhattan, Manhattan",96
"Financial District, Downtown Manhattan, Manhattan",93
"Lincoln Square, Upper West Side, Upper Manhattan, Manhattan",75
"Bedford-Stuyvesant, Northern Brooklyn, Brooklyn",66
"Upper West Side, Upper Manhattan, Manhattan",64
"West Village, Downtown Manhattan, Manhattan",56
"East Village, Downtown Manhattan, Manhattan",51


In [23]:
rent_listing_df.groupby('neighborhood')['rent'].mean().to_frame('mean').sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,mean
neighborhood,Unnamed: 1_level_1
"Lenox Hill, Upper East Side, Upper Manhattan, Manhattan",14310.714286
"Hudson Square, SoHo, Downtown Manhattan, Manhattan",9550.000000
"Midtown East, Midtown Manhattan, Manhattan",7914.400000
"Flatiron District, Midtown Manhattan, Manhattan",7273.400000
"Central Park, Upper Manhattan, Manhattan",6590.000000
"Lincoln Square, Upper West Side, Upper Manhattan, Manhattan",6468.333333
"Battery Park City, Downtown Manhattan, Manhattan",6278.583333
"NoMad, Midtown Manhattan, Manhattan",6185.375000
"SoHo, Downtown Manhattan, Manhattan",6085.833333
"Lower East Side, Downtown Manhattan, Manhattan",5982.550000


In [24]:
from geopy import ArcGIS

In [27]:
nom = ArcGIS()

In [30]:
from geopy.exc import GeocoderTimedOut

def do_geocode(address):
    try:
        return nom.geocode(address)
    except GeocoderTimedOut:
        time.sleep(1.25)
        return do_geocode(address)

In [31]:
rent_listing_df['location'] = rent_listing_df['neighborhood'].apply(do_geocode)

KeyboardInterrupt: 

In [25]:
rent_listing_df['latitude'] = rent_listing_df['location'].apply(lambda x : x.latitude if x else None)

AttributeError: 'str' object has no attribute 'latitude'

In [None]:
rent_listing_df['longitude'] = rent_listing_df['location'].apply(lambda x : x.longitude if x else None)