### Get raw data from Craigslist's website
###### By: Nick and Omer

In [None]:
import requests

In [None]:
craigslist_request = requests.get('https://newyork.craigslist.org/search/brk/aap')

In [None]:
craigslist_request

### Look what a wonderful piece we got here

In [None]:
craigslist_request.content

### Now let's look at what Beautiful Soup helps us with

In [None]:
from bs4 import BeautifulSoup

In [None]:
craigslist_soup = BeautifulSoup(craigslist_request.text, 'lxml')

In [None]:
craigslist_soup

### What's the gain? 

 It's now human readable html code.
 
 But that's not all - beautiful soup has many methods to easily access the html code and grab the data we are after

 The idea is to build a dictionary containing details 
 about different apartments so that later we can turn it into a 
 data frame for analysis. The listing id lets us uniquely 
 identify each apartment: it becomes the dict key, and we attach the rest of the info to that key.


So firstly, we are interested in the id of an apartment

We will use Beautiful Soup to find the HTML tags that carry the listing ID 

Where can we find that? 

1. go to website 
2. open developer console 
3. inspect the elements that you are targeting, and identify the structure leading there and the tags / attributes of those elements

In [None]:
# Every search result is an <li class="result-row"> element; collect them all.
# find_all is the current name of this method (findAll is a deprecated alias).
apartments = craigslist_soup.find_all('li', {'class': "result-row"})

In [None]:
apartments

In [None]:
len(apartments)

In [None]:
apartments[0].find('span', attrs = {'class' : 'result-price'})

In [None]:
apartments[0].find('span', attrs = {'class' : 'result-price'}).text

### top level html attributes can be indexed directly in BeautifulSoup

In [None]:
apartments[0]

In [None]:
apartments[0]['data-pid']

Now that we have the unique listing id per apartment (data-pid), we can write a loop to grab them for every apartment

In [None]:
for apartment in apartments:
    print(apartment['data-pid'])

In [None]:
# apartments[0].find('span', {'class' : 'housing'}).text.strip()

### Let's loop through the properties that we want and add them to a dictionary so we can insert it in a DataFrame

In [None]:
# Map each listing id (data-pid) to the fields scraped from its result row.
apartment_dict = {}
for apartment in apartments:
    pid = apartment['data-pid']
    price = apartment.find('span', {'class' : 'result-price'}).text
    title = apartment.find('a', {'class' : 'result-title hdrlnk'}).text
    apartment_dict[pid] = [title, price]

# Column labels, in the same order as the values stored for each listing.
# Kept as a flat list so it can be assigned directly to DataFrame.columns.
col_names = ['title', 'price']

In [None]:
apartment_dict

In [None]:
import pandas as pd

In [None]:
pd.DataFrame(apartment_dict).T

### Let's look at what happens when we try to grab bedrooms

In [None]:
for apartment in apartments:
    print(apartment.find('span', {'class' : 'housing'}))

In [None]:
# This loop fails as soon as it reaches a listing with no "housing" span:
# find() returns None for that listing, and None has no .text attribute.

for apartment in apartments:
    print(apartment.find('span', {'class' : 'housing'}).text)

### How do we handle errors without throwing away all the useful info?

In [None]:
# Rebuild the listing table, this time tolerating rows that have no
# "housing" span (find() returns None for those, and None has no .text).
apartment_dict = {}
for apartment in apartments:
    pid = apartment['data-pid']
    price = apartment.find('span', {'class' : 'result-price'}).text
    title = apartment.find('a', {'class' : 'result-title hdrlnk'}).text

    try:
        bdr = apartment.find('span', {'class' : 'housing'}).text.strip()
    except AttributeError:
        # Only the "missing tag" case is caught, so unrelated errors still
        # surface. The row is kept and gets a placeholder value instead.
        bdr = 'None -'

    time_posted = apartment.find('time', {'class' : 'result-date'})['datetime']
    apartment_dict[pid] = [title, price, bdr, time_posted]

# Column labels, in the same order as the values stored for each listing.
# Kept as a flat list so assigning it to DataFrame.columns gives plain columns.
col_names = ['title', 'price', 'bdr', 'time_posted']

In [None]:
import pandas as pd
# apartment_dict maps each listing id to its list of values, so
# pd.DataFrame(apartment_dict) has one *column* per listing.
# .T transposes the frame so that each listing becomes a row instead.

apartment_df = pd.DataFrame(apartment_dict).T
apartment_df.columns = col_names

In [None]:
apartment_df = apartment_df.reset_index()

In [None]:
apartment_df

### Function

In [None]:
craigslist_request = requests.get('https://newyork.craigslist.org/search/brk/aap')

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

def cl_scrape(url, timeout=30):
    """Scrape one Craigslist search-results page into a DataFrame.

    Each ``<li class="result-row">`` element on the page becomes one row,
    indexed by the listing's ``data-pid`` attribute.

    Parameters
    ----------
    url : str
        Address of the search-results page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``price``, ``bdr`` and ``time_posted``.
        ``price`` is None when a listing has no price span, and ``bdr`` is
        the placeholder ``'None -'`` when it has no housing span. A page
        with no result rows yields an empty frame with the same columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    craigslist_request = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    craigslist_request.raise_for_status()

    # Name the parser explicitly so every machine parses the page the same way
    # (matches the 'lxml' parser used earlier in this notebook).
    craigslist_soup = BeautifulSoup(craigslist_request.text, 'lxml')
    apartments = craigslist_soup.find_all('li', {'class': "result-row"})

    col_names = ['title', 'price', 'bdr', 'time_posted']
    apartment_dict = {}

    for apartment in apartments:
        pid = apartment['data-pid']

        # Title and timestamp are looked up without a guard, so a row
        # lacking either one raises rather than being silently skipped.
        title = apartment.find('a', {'class': 'result-title hdrlnk'}).text
        time_posted = apartment.find('time', {'class': 'result-date'})['datetime']

        # Price and housing info are optional: find() returns None when the
        # span is absent, so test for that instead of catching every error.
        price_tag = apartment.find('span', {'class': 'result-price'})
        price = None if price_tag is None else price_tag.text

        housing_tag = apartment.find('span', {'class': 'housing'})
        bdr = 'None -' if housing_tag is None else housing_tag.text.strip()

        apartment_dict[pid] = [title, price, bdr, time_posted]

    # orient='index' makes each dict key a row label, and passing the column
    # names here keeps the frame well-formed even when no rows were found.
    return pd.DataFrame.from_dict(apartment_dict, orient='index', columns=col_names)

In [None]:
df = cl_scrape('https://newyork.craigslist.org/search/brk/aap')

df