## Get house listings in NYC through Craiglist HTML

In [1]:
import requests
import urllib
from bs4 import BeautifulSoup

In [2]:
city='New York City'
url='https://newyork.craigslist.org/search/hhh?sort=rel&query={}'.format(city.replace(' ','+'))
url

'https://newyork.craigslist.org/search/hhh?sort=rel&query=New+York+City'

In [None]:
https://newyork.craigslist.org/search/rea?query=long+island+city+apartment&availabilityMode=0&sale_date=all+dates

### Use requests to call the html webpage, and beautifulsoup to parse the text

In [3]:
source=requests.get(url).text
soup=BeautifulSoup(source,'lxml')
print(soup.prettify())

<html>
 <body>
  <p>
   ﻿
   <!DOCTYPE html>
  </p>
  <title>
   new york housing "New York City" - craigslist
  </title>
  <meta content='new york housing "New York City" - craigslist' name="description"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <link href="https://newyork.craigslist.org/search/hhh" rel="canonical"/>
  <link href="https://newyork.craigslist.org/search/hhh?format=rss&amp;query=New%20York%20City&amp;sort=rel" rel="alternate" title='RSS feed for craigslist | new york housing "New York City" - craigslist' type="application/rss+xml"/>
  <link href="https://newyork.craigslist.org/search/hhh?s=120&amp;query=New%20York%20City&amp;sort=rel" rel="next"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <link href="//www.craigslist.org/styles/cl.css?v=38ee99f34dcd77a615cd2d3c32114559" media="all" rel="stylesheet" type="text/css"/>
  <link href="//www.craigslist.org/styles/search.css?v=84cf86bc094026e12fa066bbbab154ac" media="all" rel="s

#### check a single listing

In [4]:
listings=soup.find('li',class_='result-row')

In [5]:
print(listings.prettify())

<li class="result-row" data-pid="6640183142" data-repost-of="4723117692">
 <a class="result-image gallery" data-ids="1:00m0m_M22co7Crkb,1:00808_c6Uc0vzB4pZ,1:00k0k_dif4AM2ez0B,1:00D0D_2AvPMhidzaI,1:00303_eCo6bYe5G2W,1:00b0b_g64EAtIXGja,1:01313_aK46xxMBHx,1:00H0H_gLzUJj4LXpQ" href="https://newyork.craigslist.org/mnh/abo/d/real-2-bds-1baths-wd-in-the/6640183142.html">
  <span class="result-price">
   $4600
  </span>
 </a>
 <p class="result-info">
  <span class="icon icon-star" role="button">
   <span class="screen-reader-text">
    favorite this post
   </span>
  </span>
  <time class="result-date" datetime="2018-07-31 19:18" title="Tue 31 Jul 07:18:29 PM">
   Jul 31
  </time>
  <a class="result-title hdrlnk" data-id="6640183142" href="https://newyork.craigslist.org/mnh/abo/d/real-2-bds-1baths-wd-in-the/6640183142.html">
   ► REAL 2 BDS/ 1BATHS  ___ W.D IN THE UNIT ____ 3RD AVE   ___NO FEE
  </a>
  <span class="result-meta">
   <span class="result-price">
    $4600
   </span>
   <span cl

In [6]:
price=listings.find('span',class_='result-price')
price.text

'$4600'

In [8]:
#listings.find('a',class_="result-image gallery")
link=listings.a['href']
#link=link.split(' ')
link

'https://newyork.craigslist.org/mnh/abo/d/real-2-bds-1baths-wd-in-the/6640183142.html'

In [9]:
description=listings.find('span',class_='housing')
description.text.strip().replace(' ','').replace('-\n',' ').replace('-','')

'2br'

In [10]:
title=listings.find('a',class_="result-title hdrlnk").text
title

'► REAL 2 BDS/ 1BATHS  ___ W.D IN THE UNIT ____ 3RD AVE   ___NO FEE'

In [12]:
post_time=listings.time.text
post_time

'Jul 31'

### Create a loop to get all the price, description, link and post time of the house listings

In [19]:
import numpy as np
price=[]
website=[]
description=[]
title=[]
post_time=[]

city=['New York City','White Plains','Queens','Yonkers']
for city in city:
    #url='https://newyork.craigslist.org/search/rea?query=long+island+city+apartment&availabilityMode=0&sale_date=all+dates
    url='https://newyork.craigslist.org/search/hhh?sort=rel&query={}'.format(city.replace(' ','+'))
    source=requests.get(url).text
    soup=BeautifulSoup(source,'lxml')
    for listings in soup.find_all('li',class_='result-row'):
        #price.append(listings.find('span',class_='result-price').text)
        if listings.find('span',class_='result-price') is not None:
            price.append(listings.find('span',class_='result-price').text)
        else:
            price.append(np.NaN)
        if listings.find('span',class_='result-price') is not None:
            title.append(listings.find('a',class_="result-title hdrlnk").text)
        else:
            title.append(np.NaN)
        website.append(listings.a['href'])
        desc=listings.find('span',class_='housing')
        if desc is not None:
            description.append(desc.text.strip().replace(' ','').replace('-\n',' ').replace('-',''))
        else:
            description.append(np.NaN)
        post_time.append(listings.time.text)


### output to a pandas dataframe

In [20]:
import pandas as pd
pd.options.display.max_colwidth=1000 # to display full url link
#pd.set_option('display.width', 1000)
dataframe=pd.DataFrame({'description':description, 'website':website, 'price':price, 'post_time':post_time,'Title':title})

In [21]:
dataframe

Unnamed: 0,Title,description,post_time,price,website
0,"Private Offices - NEW, Clean, Fully Serviced, Furnished, AFFORDABLE",200ft2,Jul 31,$650,https://newyork.craigslist.org/mnh/off/d/private-offices-new-clean/6648681600.html
1,Victorian T-House on upper 5th Ave. gutted*CAIR*WD*BALC*FP* E+W TRAIN,2br,Jul 31,$2995,https://newyork.craigslist.org/mnh/abo/d/victorian-house-on-upper-5th/6658650735.html
2,Queen-sized bedroom in newly renovated 2br/1 bath in Lincoln Square,650ft2,Jul 31,$2057,https://newyork.craigslist.org/mnh/roo/d/queen-sized-bedroom-in-newly/6658741162.html
3,,,Jul 31,,https://newyork.craigslist.org/mnh/prk/d/blow/6658663912.html
4,Furnished Upper West Side 1 Bedroom Apt in Luxury Building,1br 800ft2,Jul 31,$4500,https://newyork.craigslist.org/mnh/sub/d/furnished-upper-west-side-1/6658739832.html
5,"n,,1 br apt located on corner of 105 st and Amsterdam ave,laundry",1br,Jul 31,$2399,https://newyork.craigslist.org/mnh/reo/d/n1-br-apt-located-on-corner/6658740808.html
6,Looking for an enjoyable roommate for single bedroom,,Jul 31,$1293,https://newyork.craigslist.org/que/roo/d/looking-for-an-enjoyable/6658694848.html
7,*NO FEE* Real Three Bedroom-Doorman-GYM-Laundry-Prime Fidi,,Jul 31,$4100,https://newyork.craigslist.org/mnh/abo/d/no-fee-real-three-bedroom/6658709603.html
8,NO FEE Attended Doorman Located on the Corner of Madison Ave,560ft2,Jul 31,$2250,https://newyork.craigslist.org/mnh/off/d/no-fee-attended-doorman/6658738413.html
9,Windows Throughout Combined Units Madison & 33 st NO FEE,560ft2,Jul 31,$2250,https://newyork.craigslist.org/mnh/off/d/windows-throughout-combined/6658738679.html


In [22]:
dataframe['website']

0           https://newyork.craigslist.org/mnh/off/d/private-offices-new-clean/6648681600.html
1        https://newyork.craigslist.org/mnh/abo/d/victorian-house-on-upper-5th/6658650735.html
2        https://newyork.craigslist.org/mnh/roo/d/queen-sized-bedroom-in-newly/6658741162.html
3                                https://newyork.craigslist.org/mnh/prk/d/blow/6658663912.html
4         https://newyork.craigslist.org/mnh/sub/d/furnished-upper-west-side-1/6658739832.html
5         https://newyork.craigslist.org/mnh/reo/d/n1-br-apt-located-on-corner/6658740808.html
6            https://newyork.craigslist.org/que/roo/d/looking-for-an-enjoyable/6658694848.html
7           https://newyork.craigslist.org/mnh/abo/d/no-fee-real-three-bedroom/6658709603.html
8             https://newyork.craigslist.org/mnh/off/d/no-fee-attended-doorman/6658738413.html
9         https://newyork.craigslist.org/mnh/off/d/windows-throughout-combined/6658738679.html
10       https://newyork.craigslist.org/mnh/off/d/