In [1]:
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

from random import randint
from time import sleep

from scrape_cl import *

In [2]:
start_url = 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates'


In [None]:
def get_results_urls(start_url):
    '''
    Function to get a list of all possible URLs based on total search results
    
    Input:  start_url = first page of listings to be scraped
    Output: results_urls = list of results urls with listings to scrape.
    '''
    
    response = requests.get(start_url)
    page = response.text
    soup = BeautifulSoup(page, 'html.parser')

    total_listings = int(soup.find('span', class_='totalcount').text)
    total_listings

    pages = np.arange(0, total_listings+1, 120)
    pages = pages[:len(pages)-1]

    results_urls = []

    for page in pages:
    
        url_prefix = start_url
        suffix = '&s='

        url = url_prefix + suffix + str(page)
    
        results_urls.append(url)
        
    return results_urls 

In [3]:
get_results_urls(start_url)

['https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=0',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=120',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=240',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=360',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=480',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=600',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=720',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all+dates&s=840',
 'https://sacramento.craigslist.org/search/apa?bundleDuplicates=1&availabilityMode=0&sale_date=all

In [4]:
def full_listings_scrape(start_url):
    '''
    Function to fully scrape Craigslist results of apartments/housing search. 
    
    Process: (1) Loops through results URLs
             (2) Scrapes listings page into dataframe
             (3) Scrapes individual postings from listings page and add to df
             (4) Adds df to a list of dataframes
             (5) Moves to next URL in results_urls, repeat setps 2-4 until all pages scraped
             (6) Compiles list of dfs into single dataframe
    
    Input:   start_urls
    Output:  dataframe of entire search results
    '''
    
    df_list = []
    page_counter = 1
    
    results_urls = get_results_urls(start_url)
    
    total_pages = len(results_urls)

    for url in results_urls:
    
        response = requests.get(url)
        
        # Set sleep timer to avoid request overloads
        sleep(randint(2,4))
        
        # Status updates while scraping: 
        print("Scraping page {} of {}...".format(page_counter, total_pages))
        print("")
        df = full_page_scrape(url)
        df_list.append(df)
    
        print("")
        print("Page {} of {} scrape complete!".format(page_counter, total_pages))
        print("")
    
        page_counter += 1
    
    compiled_df = pd.concat(df_list).reset_index()
    compiled_df.to_csv('Sacramento_CL_scrape.csv', index=False)
    
    return compiled_df

In [5]:
full_listings_scrape(start_url)

Scraping page 1 of 22...

Listing page scrape complete!
Number of postings scraped: 121

Individual posts scrape complete!
Number of posts scraped:  121

Page 1 of 22 scrape complete!

Scraping page 2 of 22...

Listing page scrape complete!
Number of postings scraped: 121

Individual posts scrape complete!
Number of posts scraped:  121

Page 2 of 22 scrape complete!

Scraping page 3 of 22...

Listing page scrape complete!
Number of postings scraped: 125

Individual posts scrape complete!
Number of posts scraped:  125

Page 3 of 22 scrape complete!

Scraping page 4 of 22...

Listing page scrape complete!
Number of postings scraped: 122

Individual posts scrape complete!
Number of posts scraped:  122

Page 4 of 22 scrape complete!

Scraping page 5 of 22...

Listing page scrape complete!
Number of postings scraped: 121

Individual posts scrape complete!
Number of posts scraped:  121

Page 5 of 22 scrape complete!

Scraping page 6 of 22...

Listing page scrape complete!
Number of postings 

Unnamed: 0,index,date,title,link,price,brs,sqft,hood,bath,amenities
0,0,Jan 13,Rent 3 bad. 1 . 5. Bathroom,https://sacramento.craigslist.org/apa/d/sacram...,2100,3,1344.0,Sacramento,1.5Ba,"[flooring: other, townhouse, w/d in unit, no s..."
1,1,Jan 13,"3-bedroom, 1-bath farmhouse in Elk Grove",https://sacramento.craigslist.org/apa/d/elk-gr...,2200,3,1130.0,Elk Grove,1Ba,"[cats are OK - purrr, dogs are OK - wooof, flo..."
2,2,Jan 13,"State of the Art Fitness Center, Stainless Ste...",https://sacramento.craigslist.org/apa/d/fair-o...,2116,1,700.0,"Fair Oaks, Carmichael, Orangevale, Folsom, Cit...",1Ba,[application fee details: $49 Application Fee ...
3,3,Jan 13,Huge 2 Bedroom 2 Bath - just renovated with fu...,https://sacramento.craigslist.org/apa/d/west-s...,2308,2,1233.0,,2Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
4,4,Jan 13,1x1 W/ Wrap Around Patio! Fully Upgraded!,https://sacramento.craigslist.org/apa/d/rockli...,1950,1,774.0,Rocklin,1Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
...,...,...,...,...,...,...,...,...,...,...
2737,117,Jan 6,1 Bedroom Apartment available,https://sacramento.craigslist.org/apa/d/davis-...,1300,1,575.0,"617 7th St Davis, CA.",1Ba,"[air conditioning, apartment, laundry on site,..."
2738,118,Jan 6,Land for lease prop 215 friendly,https://sacramento.craigslist.org/apa/d/feathe...,30,,,,1Ba,"[apartment, no laundry on site, off-street par..."
2739,119,Jan 6,"WiFi Available, Ceiling Fan, Fitness Center",https://sacramento.craigslist.org/apa/d/west-s...,1986,1,786.0,,1Ba,"[cats are OK - purrr, dogs are OK - wooof, apa..."
2740,120,Jan 6,1x1 $1475 Available NOW,https://sacramento.craigslist.org/apa/d/sacram...,1325,1,600.0,,1Ba,"[cats are OK - purrr, dogs are OK - wooof, flo..."
