In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager
import numpy as np
import pandas as pd
from time import sleep
from random import randint

In [2]:
# Set up Splinter: ChromeDriverManager().install() supplies the chromedriver
# path, which is forwarded to a non-headless Chrome session.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
# Human-readable city label; reused later to name the exported CSV file
city = "Los Angeles"

In [4]:
# Search-results page to scrape
url = 'https://losangeles.craigslist.org/search/lac/hhh#search=1~thumb~0~0'

# Tunables for this scrape
MAX_LISTINGS = 5     # number of search results to process
PAGE_LOAD_WAIT = 3   # seconds to pause after opening the search page
REQUEST_DELAY = 2    # seconds to pause after opening each listing page

# Exceptions that mean "the expected element is not on the page":
#   .find() returned None        -> AttributeError on .text
#   indexing an empty/short list -> IndexError
MISSING_ELEMENT_ERRORS = (AttributeError, IndexError)


def _extract(getter, default=np.nan):
    """Return ``getter()``, or ``default`` when the element it reads is missing.

    Only the lookup failures in MISSING_ELEMENT_ERRORS are swallowed; anything
    else (including KeyboardInterrupt) propagates to the caller.
    """
    try:
        return getter()
    except MISSING_ELEMENT_ERRORS:
        return default


# Records scraped so far: one dict per listing. Filled incrementally, so a
# failure part-way through still leaves the listings collected up to that point.
datatoget = []

try:
    browser.visit(url)
    # pause before reading the HTML (presumably to let the page finish loading)
    sleep(PAGE_LOAD_WAIT)

    # Parse the HTML of the search-results page
    html = browser.html
    soup = bs(html, 'html.parser')

    # One 'result-info' div per listing on the results page
    search = soup.find_all('div', class_='result-info')

    # Loop through the first MAX_LISTINGS results and extract relevant information
    for counter, link in enumerate(search[:MAX_LISTINGS]):

        # The title anchor carries both the title and the detail-page URL;
        # without it the listing cannot be visited, so skip the result.
        anchor = link.find('a', class_='titlestring', href=True)
        if anchor is None:
            print(f'\nListing Number: {counter+1} skipped (no title link found)')
            continue
        titles = anchor.text
        links = anchor['href']

        # scrape dates
        # NOTE(review): [:3] keeps only the first three characters of the first
        # token, so a date such as "1/10" would be cut to "1/1" -- confirm the
        # exact format of the 'meta' text before changing this.
        dates = _extract(
            lambda: link.find_all('div', class_='meta')[0].text.split()[0][:3])

        # scrape prices (NaN when no price is displayed)
        prices = _extract(
            lambda: link.find_all('span', class_='priceinfo')[0].text)

        # scrape neighborhoods (NaN when not displayed)
        neighborhoods = _extract(
            lambda: link.find('div', class_='supertitle').text)

        # error handling in case there are no bedrooms displayed
        bedrooms = _extract(
            lambda: link.find('span', class_='post-bedrooms').text)

        # error handling in case there are no sqft displayed;
        # [:-3] drops the last three characters of the token (e.g. a unit suffix)
        sqft = _extract(
            lambda: link.find('span', class_='post-sqft').text.split()[0][:-3])

        # visit each link separately
        browser.visit(links)

        # ensures that the server does not get overloaded with requests
        sleep(REQUEST_DELAY)

        # after visiting link, extract bath and amenities data; separate names
        # are used so the search-page `html`/`soup` are not overwritten
        detail_html = browser.html
        detail_soup = bs(detail_html, 'html.parser')
        attr_groups = detail_soup.find_all('p', class_='attrgroup')

        # bathrooms: third token of the second line of the first attribute
        # group, minus its last two characters; NaN if the layout differs
        baths = _extract(
            lambda: attr_groups[0].text.split('\n')[1].split()[2][:-2])

        # amenities: non-empty lines of the second attribute group, or an
        # empty list when the listing has no second group
        amenities = attr_groups[1].text.split('\n') if len(attr_groups) > 1 else []
        all_amenities = [am for am in amenities if am != '']

        # print statements to ensure that the correct data is scraped
        print(f'\nListing Number: {counter+1}')
        print(f'Square Footage: {"not available" if pd.isna(sqft) else sqft}')
        print(f'Price: {prices}')
        print(f'Title: {titles}')
        print(f'Date: {dates}')
        print(f'Number of Bedrooms: {bedrooms}')
        print(f'Neighborhood: {neighborhoods}')
        print(f'Number of Bathrooms: {baths}')
        print(f'Link: {links}')
        print(f'Amenities: {all_amenities}')

        # create dictionary for our DataFrame and append it
        datatoget.append({'date': dates,
                          'title': titles,
                          'link': links,
                          'price': prices,
                          'bedroom': bedrooms,
                          'sqft': sqft,
                          'neighborhood': neighborhoods,
                          'bathroom': baths,
                          'amenities': all_amenities})
finally:
    # Always close the browser, even if a page visit or parse step raised,
    # so no Chrome/chromedriver process is left running.
    browser.quit()


Listing Number: 1
Square Footage: 652
Price: $2,353
Title: Designer Granite Countertops, On Site Maintenance
Date: 1/9
Number of Bedrooms: 1br
Neighborhood: 930 Figuroa Terrace, Los Angeles, CA
Number of Bathrooms: 1
Link: https://losangeles.craigslist.org/lac/apa/d/los-angeles-designer-granite/7576117643.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'apartment', 'w/d in unit', 'attached garage', 'rent period: monthly']

Listing Number: 2
Square Footage: 1004
Price: $3,250
Title: Fully Outfitted and Quality 2+2 in Koreatown ** SS Appliances ** F/P
Date: 1/9
Number of Bedrooms: 2br
Neighborhood: Hancock Park
Number of Bathrooms: 2
Link: https://losangeles.craigslist.org/lac/apa/d/los-angeles-fully-outfitted-and-quality/7576817257.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'no smoking', 'off-street parking', 'rent period: monthly']

Listing Number: 3
Square Footage: not available
Price: $1,750
Title: *

In [None]:
# Inspect the raw scraped records: a list with one dict per listing
datatoget

In [None]:
# Turn the list of per-listing dicts into a table (one row per listing),
# then show it as the cell output
df = pd.DataFrame(data=datatoget)
df

In [None]:
# Save the raw scrape as "<city with underscores>_data_raw.csv",
# leaving the DataFrame index out of the file
csv_name = f"{city.replace(' ', '_')}_data_raw.csv"
df.to_csv(csv_name, index=False)