### Get raw data from craiglist's website
###### By: Nick and Omer

In [None]:
import requests

In [2]:
craigslist_request = requests.get('https://newyork.craigslist.org/search/brk/aap')

In [3]:
craigslist_request

<Response [200]>

### Look what a wonderful piece we got here

In [4]:
craigslist_request.content

b'\xef\xbb\xbf<!DOCTYPE html>\n<html class="no-js"><head>\n    <title>new york all apartments  - craigslist</title>\n\n    <meta name="description" content="new york all apartments  - craigslist">\n    <meta http-equiv="X-UA-Compatible" content="IE=Edge"/>\n    <link rel="canonical" href="https://newyork.craigslist.org/search/brk/aap">\n    <link rel="alternate" type="application/rss+xml" href="https://newyork.craigslist.org/search/brk/aap?format=rss" title="RSS feed for craigslist | new york all apartments  - craigslist">\n        <link rel="next" href="https://newyork.craigslist.org/search/brk/aap?s=120">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <link type="text/css" rel="stylesheet" media="all" href="//www.craigslist.org/styles/cl.css?v=281764e2707bd58e05233e4b7df36df8">\n    <link type="text/css" rel="stylesheet" media="all" href="//www.craigslist.org/styles/search.css?v=2d972697128e23b83898e73d71ad0d39">\n    <link type="text/css" rel="styleshe

###  Now let's look what beautiful soup helps us with

In [5]:
from bs4 import BeautifulSoup

In [6]:
craigslist_soup = BeautifulSoup(craigslist_request.text, 'lxml')

In [7]:
craigslist_soup

<html><body><p>﻿<!DOCTYPE html>

</p>
<title>new york all apartments  - craigslist</title>
<meta content="new york all apartments  - craigslist" name="description"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<link href="https://newyork.craigslist.org/search/brk/aap" rel="canonical"/>
<link href="https://newyork.craigslist.org/search/brk/aap?format=rss" rel="alternate" title="RSS feed for craigslist | new york all apartments  - craigslist" type="application/rss+xml"/>
<link href="https://newyork.craigslist.org/search/brk/aap?s=120" rel="next"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="//www.craigslist.org/styles/cl.css?v=281764e2707bd58e05233e4b7df36df8" media="all" rel="stylesheet" type="text/css"/>
<link href="//www.craigslist.org/styles/search.css?v=2d972697128e23b83898e73d71ad0d39" media="all" rel="stylesheet" type="text/css"/>
<link href="//www.craigslist.org/styles/jquery-ui-clcustom.css?v=3b05ddffb7c7f5b62066deff2dda9339" media

### What's the gain? 

 It's now human readable html code.
 
 But that's not all - beautiful soup has many methods to easily access the html code and grab the data we are after

 The idea is to build a dictionary containing details 
 about different apartments so that later we can use it as a 
 data frame for analysis. id will help us to uniquely 
 identify apartments (dict key) and add info to that key.


So firstly, we are interested in the id of an apartment

We will use beautiful soup to find the associated html tags with ID 

Where can we find that? 

1. go to website 
2. open developer console 
3. inspect the elements that you are targeting, and identify the structure leading there and the tags / attributes of those elements

In [8]:
apartments = craigslist_soup.findAll('li', {'class':"result-row"})

In [9]:
apartments

[<li class="result-row" data-pid="6890894440">
 <a class="result-image gallery" data-ids="1:01717_ksQo2cuASlJ,1:00M0M_9WEMnM2Rj6W,1:00q0q_bVTm8OV0B2s,1:00q0q_dIfEBSl33mW,1:00707_7YGlM2TUZLV,1:00T0T_6jBc1d3Rsus" href="https://newyork.craigslist.org/brk/abo/d/brooklyn-supper-sunny-3br-2bath-wd-in/6890894440.html">
 <span class="result-price">$2796</span>
 </a>
 <p class="result-info">
 <span class="icon icon-star" role="button">
 <span class="screen-reader-text">favorite this post</span>
 </span>
 <time class="result-date" datetime="2019-05-17 10:39" title="Fri 17 May 10:39:15 AM">May 17</time>
 <a class="result-title hdrlnk" data-id="6890894440" href="https://newyork.craigslist.org/brk/abo/d/brooklyn-supper-sunny-3br-2bath-wd-in/6890894440.html">*SUPPER SUNNY 3BR 2BATH W\D IN UNIT*BIG PVT DECK*NOFEE*BROWNSTONE*DEAL</a>
 <span class="result-meta">
 <span class="result-price">$2796</span>
 <span class="housing">
                     3br -
                 </span>
 <span class="result-hood

In [10]:
len(apartments)

120

In [11]:
apartments[0].find('span', attrs = {'class' : 'result-price'})

<span class="result-price">$2796</span>

In [12]:
apartments[0].find('span', attrs = {'class' : 'result-price'}).text

'$2796'

### top level html attributes can be indexed directly in BeautifulSoup

In [13]:
apartments[0]

<li class="result-row" data-pid="6890894440">
<a class="result-image gallery" data-ids="1:01717_ksQo2cuASlJ,1:00M0M_9WEMnM2Rj6W,1:00q0q_bVTm8OV0B2s,1:00q0q_dIfEBSl33mW,1:00707_7YGlM2TUZLV,1:00T0T_6jBc1d3Rsus" href="https://newyork.craigslist.org/brk/abo/d/brooklyn-supper-sunny-3br-2bath-wd-in/6890894440.html">
<span class="result-price">$2796</span>
</a>
<p class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2019-05-17 10:39" title="Fri 17 May 10:39:15 AM">May 17</time>
<a class="result-title hdrlnk" data-id="6890894440" href="https://newyork.craigslist.org/brk/abo/d/brooklyn-supper-sunny-3br-2bath-wd-in/6890894440.html">*SUPPER SUNNY 3BR 2BATH W\D IN UNIT*BIG PVT DECK*NOFEE*BROWNSTONE*DEAL</a>
<span class="result-meta">
<span class="result-price">$2796</span>
<span class="housing">
                    3br -
                </span>
<span class="result-hood"> (BUSHWICK)</s

In [14]:
apartments[0]['data-pid']

'6890894440'

Now that we have the unique listing id per apartment (data-pid), we can write a loop to grab them for every apartment

In [15]:
for apartment in apartments:
    print(apartment['data-pid'])

6890894440
6890886789
6890867351
6890894334
6890889936
6890891706
6890896934
6890928405
6878958987
6890875307
6890863402
6884186947
6889041188
6889044883
6890860045
6890894905
6890856200
6890858074
6890867488
6890866766
6890887723
6888549988
6888549531
6888549595
6888549619
6888549808
6886036317
6890857219
6890857806
6890847979
6890859215
6886893614
6890861549
6890858273
6890916059
6890915824
6878453131
6890915539
6884518991
6878575121
6888824986
6890915234
6890871770
6884184086
6889038708
6883627418
6878447758
6890911816
6886500127
6890908149
6889095041
6889288118
6889281993
6889284269
6890869453
6890909538
6890895244
6890842069
6881923166
6890863567
6890854106
6890859919
6887675626
6890849223
6890842984
6879287138
6890856493
6890853095
6888082319
6890845799
6890894970
6890842504
6890894181
6890893536
6890852036
6890893149
6890891226
6890843258
6890845376
6885701068
6886965664
6886958821
6886961091
6885769614
6885779200
6890886521
6890844867
6879605981
6880889195
6879594671
6879899997

In [16]:
# apartments[0].find('span', {'class' : 'housing'}).text.strip()

### Let's loop through the properties that we want and add them to a dictionary so we can insert it in a DataFrame

In [17]:
apartment_dict = {}
col_names = []
for apartment in apartments:
    pid = apartment['data-pid']
    
    price = apartment.find('span', {'class' : 'result-price'}).text
    
    title = apartment.find('a', {'class' : 'result-title hdrlnk'}).text
    
    apartment_dict[pid] = [title, price]
    
col_names.append(['title', 'price'])

In [18]:
apartment_dict

{'6890894440': ['*SUPPER SUNNY 3BR 2BATH W\\D IN UNIT*BIG PVT DECK*NOFEE*BROWNSTONE*DEAL',
  '$2796'],
 '6890886789': ['No Fee 2BR - 1 Stop to the City! Super Nice Bldg!', '$2750'],
 '6890867351': ['~~^1 Bedroom BUSHWICK Spacious APT~~ Must See!', '$2000'],
 '6890894334': ['Newly renovated Large 3 Bedroom w/ Open kitchen design',
  '$2500'],
 '6890889936': ['Very convenient location. Lots of character and sunlight!',
  '$2700'],
 '6890891706': ['Woohoo! Great 1BR Available! W/D on site, dishwasher, Storage!',
  '$1950'],
 '6890896934': ['Lovely 4bed 1.5ba wPRIV YARD>Brand new reno>Clean>Bright w/LAUNDRY',
  '$4500'],
 '6890928405': ['BROOKLYN HEIGHTS Luxury Penthouse- 2BR - Private rooftop - No FEE',
  '$6190'],
 '6878958987': ['1 BR-$1675 Immediately Available', '$1675'],
 '6890875307': ['3 Bedrooms, 1.5 Bathroom Duplex', '$11385'],
 '6890863402': ['~~4 Bedroom 1.5 Bath Ridgewood Luxury APT~', '$3600'],
 '6884186947': ['New reno/2 bd/Heat included/ Priv Backyard /G train!',
  '$2090']

In [19]:
import pandas as pd

In [20]:
pd.DataFrame(apartment_dict).T

Unnamed: 0,0,1
6890894440,*SUPPER SUNNY 3BR 2BATH W\D IN UNIT*BIG PVT DE...,$2796
6890886789,No Fee 2BR - 1 Stop to the City! Super Nice Bldg!,$2750
6890867351,~~^1 Bedroom BUSHWICK Spacious APT~~ Must See!,$2000
6890894334,Newly renovated Large 3 Bedroom w/ Open kitche...,$2500
6890889936,Very convenient location. Lots of character an...,$2700
6890891706,"Woohoo! Great 1BR Available! W/D on site, dish...",$1950
6890896934,Lovely 4bed 1.5ba wPRIV YARD>Brand new reno>Cl...,$4500
6890928405,BROOKLYN HEIGHTS Luxury Penthouse- 2BR - Priva...,$6190
6878958987,1 BR-$1675 Immediately Available,$1675
6890875307,"3 Bedrooms, 1.5 Bathroom Duplex",$11385


### Let's look at what happens when we try to grab bedrooms? 

In [21]:
for apartment in apartments:
    print(apartment.find('span', {'class' : 'housing'}))

<span class="housing">
                    3br -
                </span>
<span class="housing">
                    2br -
                </span>
<span class="housing">
                    1br -
                </span>
<span class="housing">
                    3br -
                    1200ft<sup>2</sup> -
                </span>
<span class="housing">
                    3br -
                </span>
<span class="housing">
                    1br -
                </span>
<span class="housing">
                    4br -
                </span>
<span class="housing">
                    2br -
                </span>
<span class="housing">
                    1br -
                </span>
None
<span class="housing">
                    4br -
                </span>
<span class="housing">
                    2br -
                </span>
<span class="housing">
                    2br -
                </span>
<span class="housing">
                    3br -
                </span>
<span

In [22]:
# We hit an error when an expected value is missing (enters as none, but None has no .text attribute)!

for apartment in apartments:
    print(apartment.find('span', {'class' : 'housing'}).text)


                    3br -
                

                    2br -
                

                    1br -
                

                    3br -
                    1200ft2 -
                

                    3br -
                

                    1br -
                

                    4br -
                

                    2br -
                

                    1br -
                


AttributeError: 'NoneType' object has no attribute 'text'

### How do we handle errors without throwing all useful info? 

In [23]:
apartment_dict = {}
col_names = []
for apartment in apartments:
    pid = apartment['data-pid']
    price = apartment.find('span', {'class' : 'result-price'}).text
    title = apartment.find('a', {'class' : 'result-title hdrlnk'}).text
    
    try: 
        bdr = apartment.find('span', {'class' : 'housing'}).text.strip()
        
    except:
        bdr = 'None -'
    
    time_posted = apartment.find('time', {'class' : 'result-date'})['datetime']
    apartment_dict[pid] = [title, price, time_posted]
    
    apartment_dict[pid] = [title, price, bdr, time_posted]
col_names.append(['title', 'price', 'bdr', 'time_posted'])

In [24]:
import pandas as pd
### .T means transpose columns to rows!

apartment_df = pd.DataFrame(apartment_dict).T
apartment_df.columns = col_names

In [25]:
apartment_df = apartment_df.reset_index()

In [26]:
apartment_df

Unnamed: 0,index,title,price,bdr,time_posted
0,6890894440,*SUPPER SUNNY 3BR 2BATH W\D IN UNIT*BIG PVT DE...,$2796,3br -,2019-05-17 10:39
1,6890886789,No Fee 2BR - 1 Stop to the City! Super Nice Bldg!,$2750,2br -,2019-05-17 10:39
2,6890867351,~~^1 Bedroom BUSHWICK Spacious APT~~ Must See!,$2000,1br -,2019-05-17 10:37
3,6890894334,Newly renovated Large 3 Bedroom w/ Open kitche...,$2500,3br -\n 1200ft2 -,2019-05-17 10:37
4,6890889936,Very convenient location. Lots of character an...,$2700,3br -,2019-05-17 10:36
5,6890891706,"Woohoo! Great 1BR Available! W/D on site, dish...",$1950,1br -,2019-05-17 10:36
6,6890896934,Lovely 4bed 1.5ba wPRIV YARD>Brand new reno>Cl...,$4500,4br -,2019-05-17 10:34
7,6890928405,BROOKLYN HEIGHTS Luxury Penthouse- 2BR - Priva...,$6190,2br -,2019-05-17 10:30
8,6878958987,1 BR-$1675 Immediately Available,$1675,1br -,2019-05-17 10:26
9,6890875307,"3 Bedrooms, 1.5 Bathroom Duplex",$11385,None -,2019-05-17 10:24


### Function

In [27]:
craigslist_request = requests.get('https://newyork.craigslist.org/search/brk/aap')

In [28]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def cl_scrape(url):
        
    craigslist_request = requests.get(url)
    craigslist_soup = BeautifulSoup(craigslist_request.text)
    
    apartments =  craigslist_soup.findAll('li', {'class':"result-row"})
    
    apartment_dict = {}
    col_names = []
    
    for apartment in apartments:
        pid = apartment['data-pid']
        
        price = apartment.find('span', {'class' : 'result-price'}).text
        
        title = apartment.find('a', {'class' : 'result-title hdrlnk'}).text

        
        try: 
            bdr = apartment.find('span', {'class' : 'housing'}).text.strip()

        except:
            bdr = 'None -'

        time_posted = apartment.find('time', {'class' : 'result-date'})['datetime']
        apartment_dict[pid] = [title, price, time_posted]

        apartment_dict[pid] = [title, price, bdr, time_posted]
    col_names.append(['title', 'price', 'bdr', 'time_posted'])
    
    apartment_df = pd.DataFrame(apartment_dict).T
    
    apartment_df.columns = col_names
    
    return apartment_df

In [29]:
df = cl_scrape('https://newyork.craigslist.org/search/brk/aap')

df

Unnamed: 0,title,price,bdr,time_posted
6890894440,*SUPPER SUNNY 3BR 2BATH W\D IN UNIT*BIG PVT DE...,$2796,3br -,2019-05-17 10:39
6890886789,No Fee 2BR - 1 Stop to the City! Super Nice Bldg!,$2750,2br -,2019-05-17 10:39
6890867351,~~^1 Bedroom BUSHWICK Spacious APT~~ Must See!,$2000,1br -,2019-05-17 10:37
6890894334,Newly renovated Large 3 Bedroom w/ Open kitche...,$2500,3br -\n 1200ft2 -,2019-05-17 10:37
6890889936,Very convenient location. Lots of character an...,$2700,3br -,2019-05-17 10:36
6890891706,"Woohoo! Great 1BR Available! W/D on site, dish...",$1950,1br -,2019-05-17 10:36
6890896934,Lovely 4bed 1.5ba wPRIV YARD>Brand new reno>Cl...,$4500,4br -,2019-05-17 10:34
6890928405,BROOKLYN HEIGHTS Luxury Penthouse- 2BR - Priva...,$6190,2br -,2019-05-17 10:30
6878958987,1 BR-$1675 Immediately Available,$1675,1br -,2019-05-17 10:26
6890875307,"3 Bedrooms, 1.5 Bathroom Duplex",$11385,None -,2019-05-17 10:24
