# Data Pull

## Importing Libraries

In [31]:
import pandas as pd
import requests
from requests import get
from bs4 import BeautifulSoup
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## Perform Web Scrape from Craigslist in Los Angeles

**A Note on the Default Search**

The default starting page will have certain characteristics already standardized for the purposes of this project. I have narrowed the focus to only 1 bedroom, 1 bathroom apartments. There is also a specific map area being used. The map view is an area centered around Santa Monica, and encompasses the West Los Angeles region north of Manahattan Beach and south of Pacific Palisades. This area contains a high density of apartments, giving us a steady supply of data to pull.

This strategy pulls many results and isolates two big cost factors in the forthcoming regression equation. This allows us to better view the effects of engineered features, which we will perform later. 

Our goal here is to cater the data science insights to me (the author) first with an eye towards scaling to a potential use case by anyone. Thus, while we will engineer the data infrastructure with an eye towards scaling the data quantity and features, we want to narrow the insights to be useful to at least one person (myself) before we expand further. This lets us behave pragmatically within the time constraints of a 7-week project. 

#### First, we declare global variables that will be used in our code.

In [2]:
## Global Variables ##

craigslist_base_url = 'https://losangeles.craigslist.org/search/santa-monica-ca/apa?lat=34.0315&lon=-118.461&max_bathrooms=1&max_bedrooms=1&min_bathrooms=1&min_bedrooms=1&postal=90095&search_distance=3.6#search=1'
craigslist_search_first_page_url = 'https://losangeles.craigslist.org/search/santa-monica-ca/apa?lat=34.0315&lon=-118.461&max_bathrooms=1&max_bedrooms=1&min_bathrooms=1&min_bedrooms=1&postal=90095&search_distance=3.6#search=1~list~0~0'
chrome_driver_path = '../Other_Material/chromedriver-mac-arm64/chromedriver'

#### We will be using BeautifulSoup to access URLs in this notebook, so we write a function to perform this operation now.

In [59]:
# Create the access_beautiful_soup function
def access_beautiful_soup(url):
    # Call a get instance with the URL
    response = requests.get(url)

    # Sleep in order to not overwhelm servers
    time.sleep(5 + 10 * random.random())

    # Find all the listings links on the page
    soup = BeautifulSoup(response.text, 'html.parser')

    return soup

#### Before we access the URLs of the listings, we need to find out how many there are in the search query. 

Since this information is not accessible via BeautifulSoup based on the way the Craigslist HTML structure is set up, we need to use Selenium. 

We now proceed with implementing the Selenium code to get the total listings amount. We write a function called "get_postings_count" that takes in the two arguments "website" and "path" and returns the post count. We need to use JavaScript here, because the post count is loaded dynamically into the webpage. 

In [3]:
# Function to return the number of posts at a given time
def get_postings_count(website, path):
    
    # prevent a window from opening in Selenium
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    
    # set up the Chrome driver path for Selenium usage
    service = Service(path)
    driver = webdriver.Chrome(service=service, options=options)

    # Call a "get" instance of the initial Craigslist page to initialize Selenium
    driver.get(website)

    # Use a waiting period to make sure all the elements load for Selenium to inspect
    wait = WebDriverWait(driver, 10)  # Wait for up to 10 seconds

    try:
        # Wait for the specific element to be present before executing the script
        postings_count_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.cl-count-save-bar > div')))

        # Use JavaScript to set up a script to return the postings count
        postings_count_script = """
            var postingsDiv = document.querySelector('.cl-count-save-bar > div');
            return postingsDiv ? postingsDiv.textContent : 'Postings count not found';
        """

        # Execute the script to get the post count and return it
        postings_count = driver.execute_script(postings_count_script)
        return postings_count
    finally:
        # Exits Selenium
        driver.quit()

#### Let's call the function now to get the postings count.

In [8]:
# Call the get_postings_count function
postings_count = get_postings_count(craigslist_search_first_page_url, chrome_driver_path)

#### Finally, we print the amount of posts to see how many we are working with.

In [9]:
# Check to see how many postings there are
print(postings_count)

2,353 postings


#### We can now use the post count to get the number of pages to loop through. 

There are 120 posts per page, so we want to extract the post count and divide it by 120.

In [10]:
# Function to calculate the number of pages for us to loop through
def calculate_pages_from_postings(postings_count_str):
    
    # Remove commas and extract the numerical part of the string
    num_postings = int(postings_count_str.replace(" postings", "").replace(",", ""))
    
    # 120 posts per page
    postings_per_page = 120
    
    # Calculate the number of pages needed to display all postings, accounting for remainder
    num_pages = -(-num_postings // postings_per_page)  
    
    return num_pages

In [11]:
# Call the calculate_pages_from_postings function
number_of_pages = calculate_pages_from_postings(postings_count)
print(f"Number of pages: {number_of_pages}")

Number of pages: 20


#### Next, we write a function to extract the links from each of the pages. We do this by: 

1. initializing a list
2. looping through the number of pages
3. finding all the 'li' tags within page that use the class 'cl-static-search-result'
4. finding the 'a' tags within the 'li' tags, which contain the link to the individual listing
5. appending these links to the list

In [47]:
# Create the extract_listings_links function
def extract_listings_links(number_of_pages):
    
    # Initialize a list to store the links
    listings_links = []

    # Iterate through the number of pages
    for i in range(number_of_pages):  

        # Use the page number to get the webpages containing the listings
        page_number = i
        page_url = f'{craigslist_base_url}~list~{page_number}~0'

        # Access the URL using beautiful soup function
        soup = access_beautiful_soup(page_url)

        # Look for all 'li' tags with the class 'cl-static-search-result'
        listings = soup.find_all('li', class_='cl-static-search-result')

        # Loop through all the listings and append links to the list
        for listing in listings:
            a_tag = listing.find('a', href=True)
            if a_tag:
                listings_links.append(a_tag['href'])

    return listings_links

In [13]:
# Call the extract_listings_links function and store the returned list in the 'all_links' variable
all_links = extract_listings_links(number_of_pages)

Let's check out the "all_links" list to see the extraction was successful.

In [14]:
print(len(all_links))

7200


We see that there is content within the all_links list. Next, let's get a sample of three of the links to ensure we pulled what we wanted.

In [15]:
print(f'Link #1: ',all_links[0])
print(f'Link #2: ',all_links[1])
print(f'Link #3: ',all_links[2])

Link #1:  https://losangeles.craigslist.org/wst/apa/d/venice-bedroom-in-marina-del-rey-quartz/7726576879.html
Link #2:  https://losangeles.craigslist.org/wst/apa/d/los-angeles-bedroom-ba-in-west-la/7726576204.html
Link #3:  https://losangeles.craigslist.org/wst/apa/d/los-angeles-westwood-bedroom-bath/7726575683.html


#### We now dive into working with the data within each listing link.

First, we set up a dataframe containing basic column information.

In [172]:
# Initialize the DataFrame
df_columns = ["Title", "Price", "Bedrooms", "Square Feet", "Full Address"]
listings_df = pd.DataFrame(columns=df_columns)

#### Next, we begin the process of creating boolean values for different attributes.

Each listing contains different attributes. While there is a lot of overlap, we need to see all of the options. To do this, we initialize a dictionary called "global_attribute_counts," then add unique values and count them. Ultimately, we want to create columns with these values and use boolean values "1" or "0" meaning "present" or "not present."

In [173]:
test_list = ['https://losangeles.craigslist.org/wst/apa/d/venice-bedroom-in-marina-del-rey-quartz/7726576879.html',
             'https://losangeles.craigslist.org/wst/apa/d/los-angeles-bedroom-ba-in-west-la/7726576204.html']

In [174]:
links_and_soups = {}

In [175]:
def pair_links_and_soups(list_of_links):
    for link in list_of_links:
        the_soup = access_beautiful_soup(link)
        links_and_soups[link] = the_soup

In [176]:
pair_links_and_soups(test_list)

In [188]:
# Initialize the dictionary for counting attributes across all listings
global_attribute_counts = {}

attributes = []

In [194]:
def collect_basic_information(the_soup):
    title = the_soup.find("span", id="titletextonly").text.strip()
    price = the_soup.find("span", class_="price").text.strip()
    bedroom_info = the_soup.find("span", class_="housing").text.split("/")[1].split("-")[0].strip()
    square_feet = the_soup.find("span", class_="housing").text.split("-")[1].split("ft")[0].strip()
    
    full_address_element = the_soup.find("h2", class_="street-address")
    if full_address_element:
        full_address = full_address_element.text.strip()
    else:
        full_address = "None listed"

    return title, price, bedroom_info, square_feet, full_address

In [189]:
# Define the count_attributes_function to view all the attributes used in apartment listings
def process_attributes(the_soup):
    attribute_search = the_soup.find_all('div', class_='attr')
    attributes = []
    for listing in attribute_search:
        value_span = listing.find('span', class_='valu')
        if value_span:
            attribute = value_span.text.strip()
            attributes.append(attribute)
            # Update global attribute counts
            global_attribute_counts[attribute] = global_attribute_counts.get(attribute, 0) + 1 
            
    return attributes

In [191]:
# Define the count_attributes_function to view all the attributes used in apartment listings
def create_dataframe(links_and_soups, listings_df):
    localized_df = listings_df.copy()
    
    for link, soup in links_and_soups.items():

        title, price, bedroom_info, square_feet, full_address = collect_basic_information(soup)
        attributes = process_attributes(soup)
        
        # Append the information as a new row in the DataFrame
        new_row = pd.Series([title, price, bedroom_info, square_feet, full_address], index=df_columns)
        localized_df = pd.concat([localized_df, new_row.to_frame().T], ignore_index=True)
    
    return localized_df

In [192]:
listings_df = create_dataframe(links_and_soups, listings_df)

In [193]:
global_attribute_counts

{'monthly': 2,
 'air conditioning': 1,
 'cats are OK - purrr': 2,
 'apartment': 2,
 'laundry on site': 2,
 'off-street parking': 2}

In [169]:
listings_df

Unnamed: 0,Title,Price,Bedrooms,Square Feet,Full Address
0,1 Bedroom in Marina Del Rey -Quartz Counters -...,"$3,295",1br,750,"415 Washington Boulevard, Venice, CA 90292"
1,1 Bedroom 1 BA in West L.A. | Hardwood Style F...,"$2,250",1br,700,None listed
