In [33]:
import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import json
from bs4 import BeautifulSoup as bs4
from IPython.display import display, HTML

# Use seaborn's 'notebook' plotting context with fonts scaled by 1.6
sns.set_context('notebook', font_scale=1.6)
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [20]:
def scrape_craiglist_page(page, timeout=30):
    """Fetch one page of Chicago apartment search results from Craigslist.

    The search is sent with ``bedrooms=2`` and ``s=page``.

    Parameters
    ----------
    page : int
        Value sent as the ``s`` query parameter (presumably the result
        offset -- the caller steps it by 100; confirm against the site).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block forever on an unresponsive connection.

    Returns
    -------
    list-like
        Every ``<p class="row">`` element found in the returned HTML.

    Raises
    ------
    requests.exceptions.RequestException
        On timeout, connection failure, or a non-success HTTP status.
    """
    url = 'http://chicago.craigslist.org/search/chc/apa'
    params = {'bedrooms': 2, 's': page}
    r = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than parsing an error page as listings
    r.raise_for_status()
    html = bs4(r.text, 'html.parser')
    return html.find_all('p', attrs={'class': 'row'})

    
def find_size_and_brs(size):
    """Parse a listing's housing text into ``(size, n_bedrooms)``.

    The text is stripped of surrounding ``/``, ``-`` and spaces, then split
    on ``' - '``. Two pieces are read as ``<n>br`` followed by ``<n>ft2``;
    a single piece is read as whichever of the two markers it contains.

    Parameters
    ----------
    size : str
        Raw housing text, e.g. ``'/ 2br - 900ft2 - '``.

    Returns
    -------
    tuple of float
        ``(size, n_bedrooms)``. A field that is absent from the text is
        ``nan``; text carrying neither marker yields ``(nan, nan)``.

    Raises
    ------
    ValueError
        If a recognised piece does not reduce to a number.
    """
    # Start from "missing" so text with neither marker still returns cleanly
    # instead of hitting unassigned locals at the return statement.
    this_size = np.nan
    n_brs = np.nan

    split = size.strip('/- ').split(' - ')
    if len(split) == 2:
        n_brs = split[0].replace('br', '')
        this_size = split[1].replace('ft2', '')
    elif 'br' in split[0]:
        # It's the n_bedrooms
        n_brs = split[0].replace('br', '')
    elif 'ft2' in split[0]:
        # It's the size
        this_size = split[0].replace('ft2', '')
    return float(this_size), float(n_brs)


def find_prices(results):
    """Extract the price of each listing row.

    Parameters
    ----------
    results : iterable
        Listing rows; each must support ``find('span', {'class': 'price'})``
        and return either ``None`` or an object with a ``text`` attribute.

    Returns
    -------
    list of float
        One entry per row, in order. Rows without a price span give ``nan``.

    Raises
    ------
    ValueError
        If a price span's text is not numeric after clean-up.
    """
    prices = []
    for rw in results:
        price = rw.find('span', {'class': 'price'})
        if price is not None:
            # Remove surrounding whitespace, the currency sign and thousands
            # separators so values such as ' $1,250 ' convert cleanly.
            cleaned = price.text.strip().strip('$').replace(',', '')
            price = float(cleaned)
        else:
            price = np.nan
        prices.append(price)
    return prices


def find_times(results):
    """Extract the timestamp of each listing row.

    Parameters
    ----------
    results : iterable
        Listing rows; each must support ``find('time')`` and return either
        ``None`` or an object indexable by ``'datetime'``.

    Returns
    -------
    list
        One entry per row, in order: the ``datetime`` attribute parsed with
        ``pd.to_datetime``, or ``nan`` for rows without a ``<time>`` tag.
    """
    times = []
    # Iterate the argument itself, not whatever `apts` happens to be global
    for rw in results:
        tag = rw.find('time')
        if tag is not None:
            stamp = pd.to_datetime(tag['datetime'])
        else:
            stamp = np.nan
        times.append(stamp)
    return times

In [21]:
# Scrape several result pages, build one DataFrame per page, then combine
# them and save the combined table as JSON.
page_frames = []
pages = np.arange(0, 300, 100)  # values 0, 100, 200 passed as `page`
col_names = ['time', 'price', 'size', 'brs', 'title', 'link']

for page in pages:
    apts = scrape_craiglist_page(page)
    if not apts:
        # An empty page would break the two-name unpacking of zip(*...) below
        continue

    # Find the size of all entries; rows without a housing tag get NaNs
    sizes_brs = []
    for rw in apts:
        housing = rw.find(attrs={'class': 'housing'})
        if housing is None:
            sizes_brs.append((np.nan, np.nan))
        else:
            sizes_brs.append(find_size_and_brs(housing.text))
    sizes, n_brs = zip(*sizes_brs)  # This unzips into 2 vectors

    # Find the title and link (both come from the same anchor tag)
    headers = [rw.find('a', attrs={'class': 'hdrlnk'}) for rw in apts]
    title = [hdr.text for hdr in headers]
    links = [hdr['href'] for hdr in headers]

    # Find the time and price
    time = [pd.to_datetime(rw.find('time')['datetime']) for rw in apts]
    price = find_prices(apts)

    # Build the frame from a column mapping so each column keeps its own
    # dtype; routing everything through one mixed numpy array would make
    # every column `object`.
    df = pd.DataFrame(
        {'time': time, 'price': price, 'size': sizes,
         'brs': n_brs, 'title': title, 'link': links},
        columns=col_names)
    df = df.set_index('time')

    # Collect this page's listings
    page_frames.append(df)

results = pd.concat(page_frames, axis=0)
# orient='records' does not write the index, so move `time` back into a
# regular column for the saved file; `results` itself stays time-indexed.
results.reset_index().to_json('./data/chicago_craiglist.json',
                              orient='records')