In [2]:
import pandas as pd
import os
import requests
from datetime import datetime
from requests import get
from bs4 import BeautifulSoup
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

In [None]:
def pull_craigslist_links():
    """Collect today's Craigslist listing URLs and save them to ``craigslist_links.csv``.

    Steps:
      1. Open the first results page in headless Chrome and read the postings count.
      2. Derive the number of result pages (120 postings per page).
      3. Open each results page and collect the ``href`` of every listing link.
      4. Write the links to ``craigslist_links.csv`` (single ``URL`` column).

    Returns:
        list[str]: The collected listing URLs, in page order.

    Raises:
        ValueError: If the postings count text cannot be parsed.
        selenium.common.exceptions.TimeoutException: If a results page shows no
            listing elements within 10 seconds.
    """
    # Define the URLs
    craigslist_base_url = 'https://losangeles.craigslist.org/search/santa-monica-ca/apa?lat=34.0315&lon=-118.461&max_bathrooms=1&max_bedrooms=1&min_bathrooms=1&min_bedrooms=1&postedToday=1&search_distance=3.6#search=1'
    craigslist_search_first_page_url = 'https://losangeles.craigslist.org/search/santa-monica-ca/apa?lat=34.0315&lon=-118.461&max_bathrooms=1&max_bedrooms=1&min_bathrooms=1&min_bedrooms=1&postedToday=1&search_distance=3.6#search=1~list~0~0'
    chrome_driver_path = 'Other_Material/chromedriver-mac-arm64/chromedriver'

    def create_headless_driver(path):
        """Start a headless Chrome session using the ChromeDriver binary at ``path``."""
        # Prevent a window from opening in Selenium
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        return webdriver.Chrome(service=Service(path), options=options)

    def get_postings_count(website, path):
        """Return the text of the postings-count element on ``website``.

        The text is 'Postings count not found' when the element is missing.
        Errors propagate to the caller; the browser is always closed.
        """
        driver = create_headless_driver(path)
        try:
            driver.get(website)

            # Fixed pause to let the page render. If the result comes back as
            # "no results", the first thing to try is increasing this time.
            time.sleep(15)

            postings_count_script = """
                var postingsDiv = document.querySelector('.cl-count-save-bar > div');
                return postingsDiv ? postingsDiv.textContent : 'Postings count not found';
            """
            return driver.execute_script(postings_count_script)
        finally:
            # Quit in every case so a failure does not leave a Chrome process behind
            driver.quit()

    def calculate_pages_from_postings(postings_count_str):
        """Convert text such as '1,234 postings' or '1 posting' into a page count."""
        match = re.fullmatch(r'\s*(\d[\d,]*)(?:\s+postings?)?\s*', postings_count_str or '')
        if match is None:
            raise ValueError(f"Could not parse a postings count from {postings_count_str!r}")

        num_postings = int(match.group(1).replace(",", ""))

        # 120 posts per page
        postings_per_page = 120

        # Ceiling division: a partial last page still counts as a page
        return -(-num_postings // postings_per_page)

    def extract_listing_links(path, base_url, number_of_pages):
        """Return the listing hrefs found on each of the ``number_of_pages`` result pages."""
        listing_selector = "li.cl-search-result.cl-search-view-mode-list"
        all_listing_links = []

        for page_number in range(number_of_pages):
            page_url = f'{base_url}~list~{page_number}~0'

            # A fresh browser per page, as before, so every page is loaded from scratch
            driver = create_headless_driver(path)
            try:
                driver.get(page_url)

                # Wait for the listings to be present
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, listing_selector))
                )

                # Now that the page is loaded, find all the `a` tags within the listings
                anchors = driver.find_elements(By.CSS_SELECTOR, f"{listing_selector} a")
                hrefs = (anchor.get_attribute('href') for anchor in anchors)

                # Anchors without an href yield None; skip them so only real URLs are saved
                all_listing_links.extend(href for href in hrefs if href)
            finally:
                driver.quit()

        return all_listing_links

    postings_count = get_postings_count(craigslist_search_first_page_url, chrome_driver_path)
    number_of_pages = calculate_pages_from_postings(postings_count)
    all_links = extract_listing_links(chrome_driver_path, craigslist_base_url, number_of_pages)

    # Convert the list of links to a DataFrame and save it to a CSV file
    links_df = pd.DataFrame(all_links, columns=['URL'])
    links_df.to_csv('craigslist_links.csv', index=False)

    return all_links

In [13]:
def pull_craigslist_rates_from_links(links=None):
    """Fetch each listing page and export one row per listing to CSV.

    Args:
        links: Iterable of listing URLs. When ``None`` (the default) the URLs are
            read from the ``URL`` column of ``craigslist_links.csv``, the file
            written by ``pull_craigslist_links``.

    Returns:
        pandas.DataFrame: One row per distinct URL with the columns listed in
        ``master_attributes``. The same frame is written to
        ``files/raw_daily_craigslist_listings.csv``.
    """
    # Master list of output columns, in output order
    master_attributes = [
        "Title", "Price", "Bedrooms", "Square Feet", "Full Address", "monthly",
        "apartment", "cats are OK - purrr", "dogs are OK - wooof", "laundry on site",
        "air conditioning", "off-street parking", "EV charging", "w/d in unit",
        "carport", "no smoking", "attached garage", "detached garage", "laundry in bldg",
        "Fee Needed To Apply", "wheelchair accessible", "no parking", "furnished",
        "street parking", "no laundry on site", "house", "w/d hookups", "date_added"
    ]

    # An attribute containing a standalone number (e.g. '$52', '50.00') is treated as a fee
    fee_pattern = re.compile(r'\b\d+\b')

    if links is None:
        links = pd.read_csv('craigslist_links.csv')['URL'].dropna().tolist()

    def access_beautiful_soup(url):
        """Download ``url`` and return its parsed HTML."""
        # A timeout keeps one stalled connection from hanging the whole run
        response = requests.get(url, timeout=30)

        # Sleep in order to not overwhelm servers
        time.sleep(5 + 10 * random.random())

        return BeautifulSoup(response.text, 'html.parser')

    def process_attributes(the_soup):
        """Return (non-fee attribute labels, fee flag) for one listing page."""
        attributes = []
        fee_needed = 0

        for listing in the_soup.find_all('div', class_='attr'):
            value_span = listing.find('span', class_='valu')
            if not value_span:
                continue
            attribute = value_span.text.strip()
            if fee_pattern.search(attribute):
                fee_needed = 1
            else:
                attributes.append(attribute)

        return attributes, fee_needed

    def collect_basic_information(the_soup):
        """Return title, price, bedrooms, square feet and address for one listing page."""
        def text_or_default(element, default):
            return element.text.strip() if element else default

        title = text_or_default(the_soup.find("span", id="titletextonly"), "Title Not Found")
        price = text_or_default(the_soup.find("span", class_="price"), "Price Not Found")

        bedroom_info = "Bedrooms Info Not Found"
        square_feet = "Square Feet Not Found"

        housing_element = the_soup.find("span", class_="housing")
        if housing_element:
            housing_text = housing_element.text

            # Parse the two fields independently so one missing value
            # does not discard the other.
            bedroom_match = re.search(r'\d+\s*br\b', housing_text, flags=re.IGNORECASE)
            if bedroom_match:
                bedroom_info = bedroom_match.group(0).strip()
            else:
                # Fall back to the text between "/" and the first "-"
                fallback = housing_text.partition("/")[2].partition("-")[0].strip()
                if fallback:
                    bedroom_info = fallback

            sqft_match = re.search(r'(\d[\d,]*)\s*ft', housing_text, flags=re.IGNORECASE)
            if sqft_match:
                square_feet = sqft_match.group(1)

        full_address = text_or_default(
            the_soup.find("h2", class_="street-address"), "None listed"
        )

        return {
            "Title": title,
            "Price": price,
            "Bedrooms": bedroom_info,
            "Square Feet": square_feet,
            "Full Address": full_address
        }

    date_added = datetime.now().strftime('%Y-%m-%d')

    def build_row(the_soup):
        """Build one output row containing every master attribute."""
        listing_attributes, fee_needed = process_attributes(the_soup)
        present = set(listing_attributes)

        # Start with a 0/1 flag for every column, then overwrite the non-flag columns
        row = {attr: 1 if attr in present else 0 for attr in master_attributes}
        row.update(collect_basic_information(the_soup))
        row["Fee Needed To Apply"] = fee_needed
        row["date_added"] = date_added
        return row

    # Keyed by URL, so a link that appears twice is fetched into a single entry
    links_and_soups = {link: access_beautiful_soup(link) for link in links}

    todays_listings_df = pd.DataFrame(
        [build_row(soup) for soup in links_and_soups.values()],
        columns=master_attributes,
    )

    # Export data (create the output folder first so to_csv cannot fail on a missing directory)
    os.makedirs('files', exist_ok=True)
    todays_listings_df.to_csv('files/raw_daily_craigslist_listings.csv', index = False)

    return todays_listings_df

In [14]:
# NOTE(review): pull_craigslist_rates_main is not defined in any cell shown here; the
# functions defined above are pull_craigslist_links and pull_craigslist_rates_from_links.
# Confirm which entry point this cell is meant to call.
pull_craigslist_rates_main()

In [2]:
# Search URLs (1 bed / 1 bath, posted today, santa-monica-ca search) and the ChromeDriver path.
craigslist_base_url = (
    'https://losangeles.craigslist.org/search/santa-monica-ca/apa'
    '?lat=34.0315&lon=-118.461'
    '&max_bathrooms=1&max_bedrooms=1&min_bathrooms=1&min_bedrooms=1'
    '&postedToday=1&search_distance=3.6'
    '#search=1'
)
# The first results page is the base URL plus the "~list~<page>~0" suffix for page 0
craigslist_search_first_page_url = craigslist_base_url + '~list~0~0'
chrome_driver_path = '../Other_Material/chromedriver-mac-arm64/chromedriver'

In [3]:
# Create the access_beautiful_soup function
def access_beautiful_soup(url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        BeautifulSoup: The response body parsed with ``html.parser``. The HTTP
        status is not checked, so an error page is parsed like any other page.
    """
    # Call a get instance with the URL
    response = requests.get(url, timeout=timeout)

    # Sleep 5-15 seconds in order to not overwhelm servers
    time.sleep(5 + 10 * random.random())

    # Parse the downloaded HTML
    return BeautifulSoup(response.text, 'html.parser')

In [4]:
# Function to return the number of posts at a given time
def get_postings_count(website, path):
    """Read the postings-count text from a Craigslist results page.

    Args:
        website: URL of the results page to open.
        path: Filesystem path of the ChromeDriver executable.

    Returns:
        The text content of the ``.cl-count-save-bar > div`` element,
        'Postings count not found' when that element is missing, or ``None``
        when running the script raised (the error is printed).
    """
    # Prevent a window from opening in Selenium
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')

    # Set up the Chrome driver path for Selenium usage
    service = Service(path)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        # Call a "get" instance of the initial Craigslist page to initialize Selenium
        driver.get(website)

        # Put in a function stopper to let the page render.
        # 15 seconds should be plenty, but if the result is coming back as
        # "no results," the first troubleshoot would be to increase this time
        # to see if that fixes it.
        time.sleep(15)

        # Use JavaScript to set up a script to return the postings count
        postings_count_script = """
            var postingsDiv = document.querySelector('.cl-count-save-bar > div');
            return postingsDiv ? postingsDiv.textContent : 'Postings count not found';
        """

        try:
            # Execute the script to get the post count and return it
            return driver.execute_script(postings_count_script)
        except Exception as e:
            print(f"Error encountered: {e}")
            return None
    finally:
        # Quit Selenium on every path so a failure does not leave Chrome running
        driver.quit()

In [5]:
# Call the get_postings_count function.
# The result is the count element's text (e.g. "93 postings"), the sentinel
# 'Postings count not found' when the element is missing, or None when the
# script call raised and the error was only printed.
postings_count = get_postings_count(craigslist_search_first_page_url, chrome_driver_path)

In [6]:
# Check to see how many postings there are (prints the raw text from get_postings_count)
print(postings_count)

93 postings


In [7]:
# Function to calculate the number of pages for us to loop through
def calculate_pages_from_postings(postings_count_str, postings_per_page=120):
    """Convert the postings-count text into the number of result pages.

    Args:
        postings_count_str: Text such as "93 postings", "1,234 postings",
            "1 posting" or a bare number. Surrounding whitespace is ignored.
        postings_per_page: Listings shown on each results page (120 by default).

    Returns:
        int: Pages needed to show every posting; 0 when there are no postings.

    Raises:
        ValueError: If the text is ``None``, empty, or holds no recognizable
            count (for example the 'Postings count not found' sentinel).
    """
    # Accept thousands separators and both the singular and the plural suffix
    match = re.fullmatch(r'\s*(\d[\d,]*)(?:\s+postings?)?\s*', postings_count_str or '')
    if match is None:
        raise ValueError(f"Could not parse a postings count from {postings_count_str!r}")

    # Remove commas before converting the numerical part
    num_postings = int(match.group(1).replace(",", ""))

    # Ceiling division: a partially filled last page still counts as a page
    return -(-num_postings // postings_per_page)

In [8]:
# Call the calculate_pages_from_postings function on the text scraped from the
# first results page, then show the resulting page count.
number_of_pages = calculate_pages_from_postings(postings_count)
print(f"Number of pages: {number_of_pages}")

Number of pages: 1


In [9]:
# Extract the links from each of the pages
def extract_listing_links(path, base_url, number_of_pages):
    """Collect the listing URLs from every results page.

    Args:
        path: Filesystem path of the ChromeDriver executable.
        base_url: Search URL; ``~list~<page>~0`` is appended for each page.
        number_of_pages: How many result pages to visit (0 visits none).

    Returns:
        list[str]: The ``href`` of every anchor inside a list-mode search result,
        in page order. Anchors without an ``href`` are skipped.

    Raises:
        selenium.common.exceptions.TimeoutException: If a page shows no listing
            elements within 10 seconds. The browser is closed before the error
            propagates.
    """
    listing_selector = "li.cl-search-result.cl-search-view-mode-list"
    all_listing_links = []

    for page_number in range(number_of_pages):
        page_url = f'{base_url}~list~{page_number}~0'

        # Prevent a window from opening in Selenium
        options = Options()
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')

        # A fresh browser per page, as before, so every page is loaded from scratch
        driver = webdriver.Chrome(service=Service(path), options=options)
        try:
            driver.get(page_url)

            # Wait for the listings to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, listing_selector))
            )

            # Now that the page is loaded, find all the `a` tags within the listings
            anchors = driver.find_elements(By.CSS_SELECTOR, f"{listing_selector} a")
            hrefs = (anchor.get_attribute('href') for anchor in anchors)

            # get_attribute returns None for an anchor without href; keep real URLs only
            all_listing_links.extend(href for href in hrefs if href)
        finally:
            # Always close the browser, including when the wait above times out
            driver.quit()

    return all_listing_links

In [10]:
# Scrape the listing URLs from every results page (one headless Chrome session per page)
all_links = extract_listing_links(chrome_driver_path, craigslist_base_url, number_of_pages)

# Sanity check: how many links were collected
print(len(all_links))

93


In [11]:
# Preview the first three collected links
for position in range(3):
    print(f'Link #{position + 1}: ', all_links[position])

Link #1:  https://losangeles.craigslist.org/wst/apa/d/los-angeles-renovated-1br-apartment-in/7747671752.html
Link #2:  https://losangeles.craigslist.org/wst/apa/d/santa-monica-situated-in-santa-monica/7747671511.html
Link #3:  https://losangeles.craigslist.org/wst/apa/d/los-angeles-beautiful-one-bedroom/7747664102.html


In [12]:
# Initialize an empty DataFrame holding only the basic listing columns.
# NOTE(review): this frame is replaced by the result of create_dataframe() later in the notebook.
df_columns = ["Title", "Price", "Bedrooms", "Square Feet", "Full Address"]
todays_listings_df = pd.DataFrame(columns=df_columns)

# Set the max columns to infinite so that we may view all of them
pd.set_option('display.max_columns', None)

In [13]:
# Maps each listing URL to its parsed page; filled in by pair_links_and_soups()
links_and_soups = {}

In [14]:
# Function to pair links with soup content
def pair_links_and_soups(list_of_links):
    """Fetch every link and store its parsed page in the module-level ``links_and_soups`` dict.

    Each URL becomes a key, so a URL that appears more than once is fetched each
    time but keeps only the most recent page.
    """
    for listing_url in list_of_links:
        links_and_soups[listing_url] = access_beautiful_soup(listing_url)

In [15]:
# Download and parse every listing page (each request is followed by a 5-15 second pause)
pair_links_and_soups(all_links)

In [16]:
# Number of distinct listing URLs that now have a parsed page
print(len(links_and_soups))

93


In [17]:
# Master list of output columns, in the order they appear in the exported rows
master_attributes = [
    # Basic listing fields
    "Title",
    "Price",
    "Bedrooms",
    "Square Feet",
    "Full Address",
    # Listing attribute flags
    "monthly",
    "apartment",
    "cats are OK - purrr",
    "dogs are OK - wooof",
    "laundry on site",
    "air conditioning",
    "off-street parking",
    "EV charging",
    "w/d in unit",
    "carport",
    "no smoking",
    "attached garage",
    "detached garage",
    "laundry in bldg",
    "Fee Needed To Apply",
    "wheelchair accessible",
    "no parking",
    "furnished",
    "street parking",
    "no laundry on site",
    "house",
    "w/d hookups",
    # Bookkeeping
    "date_added",
]

In [18]:
# Function to initialize the DataFrame with all required columns
def initialize_dataframe():
    """Return an empty DataFrame whose columns are the ``master_attributes`` list."""
    empty_frame = pd.DataFrame(columns=master_attributes)
    return empty_frame

In [19]:
# Function to create a new entry for the DataFrame
def create_new_entry(data):
    """Project ``data`` onto ``master_attributes``, using 0 for any key it lacks.

    Keys of ``data`` that are not master attributes are dropped; the result
    always has exactly the master attributes, in master order.
    """
    entry = {}
    for attr in master_attributes:
        entry[attr] = data.get(attr, 0)
    return entry

In [20]:
# Running tally of attribute label -> occurrences; incremented by every process_attributes() call
global_attribute_counts = {}

In [21]:
# Extract one listing's attribute labels and tally them across listings
def process_attributes(the_soup):
    """Return ``(attributes, fee_needed)`` for a single listing page.

    Each ``div.attr`` that holds a ``span.valu`` contributes one label. Every
    label is counted in the module-level ``global_attribute_counts``. A label
    containing a standalone number sets ``fee_needed`` to 1 and is left out of
    the returned list; every other label is returned in page order.
    """
    contains_number = re.compile(r'\b\d+\b')  # Labels with a standalone number count as fees
    kept_labels = []
    fee_flag = 0

    for attr_div in the_soup.find_all('div', class_='attr'):
        label_span = attr_div.find('span', class_='valu')
        if not label_span:
            continue

        label = label_span.text.strip()
        global_attribute_counts[label] = global_attribute_counts.get(label, 0) + 1

        if contains_number.search(label):
            fee_flag = 1
            continue
        kept_labels.append(label)

    return kept_labels, fee_flag

In [22]:
# Run process_attributes using the info in the links_and_soups dictionary.
# Only the side effect matters here: each call adds to global_attribute_counts.
# `attributes` ends up holding the (attributes, fee_needed) tuple of the last listing.
for link, soup in links_and_soups.items():
    attributes = process_attributes(soup)

In [23]:
# Storing the object results in a variable called "raw_attributes".
# This is an alias, not a copy: later updates to global_attribute_counts show up here too.
raw_attributes = global_attribute_counts

In [24]:
# Display the attribute label -> occurrence count mapping
raw_attributes

{'monthly': 93,
 'cats are OK - purrr': 85,
 'apartment': 92,
 'dogs are OK - wooof': 66,
 'laundry in bldg': 32,
 'off-street parking': 27,
 'no smoking': 34,
 'laundry on site': 35,
 'attached garage': 27,
 'air conditioning': 65,
 'EV charging': 43,
 '$52': 10,
 'carport': 26,
 'wheelchair accessible': 27,
 'w/d in unit': 26,
 '50.00': 2,
 '5%': 2,
 'Shores Barrington, LLC': 2,
 '$40 per adult': 1,
 'no parking': 5,
 'furnished': 5,
 'valet parking': 1,
 'detached garage': 7,
 '35': 1,
 'cottage/cabin': 1,
 'Apply for rentals application forms': 1}

In [25]:
# Function to group together fee related attributes
def clean_up_the_fees(attributes_dictionary):
    """Merge all fee-like attribute counts into one "Fee Needed To Apply" entry.

    Args:
        attributes_dictionary: Mapping of attribute label -> occurrence count.
            It is not modified.

    Returns:
        dict: The labels without a standalone number, with their counts
        unchanged, plus "Fee Needed To Apply" holding the summed count of the
        labels that do contain one (e.g. '$52', '50.00', '35').
    """
    # Regex to identify keys containing a standalone integer
    fee_pattern = re.compile(r'\b\d+\b')

    cleaned_attributes = {}
    fees_needed_to_apply = 0

    # Use the argument rather than the global raw_attributes so the function
    # works on whatever dictionary the caller passes in.
    for key, value in attributes_dictionary.items():
        if fee_pattern.search(key):
            fees_needed_to_apply += value
        else:
            cleaned_attributes[key] = value

    cleaned_attributes["Fee Needed To Apply"] = fees_needed_to_apply

    return cleaned_attributes

In [26]:
# Run the clean_up_the_fees function using the raw_attributes as input:
# fee-like labels are folded into a single "Fee Needed To Apply" count.
cleaned_attributes = clean_up_the_fees(raw_attributes)

In [27]:
# Re-order cleaned_attributes from the most to the least frequent attribute
cleaned_attributes = {
    label: count
    for label, count in sorted(
        cleaned_attributes.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
}

In [31]:
def collect_basic_information(the_soup):
    """Extract the basic fields of one listing page.

    Args:
        the_soup: Parsed listing page exposing ``find(name, **attrs)``.

    Returns:
        dict: "Title", "Price", "Bedrooms", "Square Feet" and "Full Address".
        Each field falls back to its own placeholder text when it cannot be
        found, so one missing value never discards another.
    """
    def text_or_default(element, default):
        """Return the element's stripped text, or ``default`` when it is missing."""
        return element.text.strip() if element else default

    title = text_or_default(the_soup.find("span", id="titletextonly"), "Title Not Found")
    price = text_or_default(the_soup.find("span", class_="price"), "Price Not Found")

    bedroom_info = "Bedrooms Info Not Found"
    square_feet = "Square Feet Not Found"

    housing_element = the_soup.find("span", class_="housing")
    if housing_element:
        # Housing text looks like "/ 1br - 668ft2 -"; either part may be absent
        housing_text = housing_element.text

        bedroom_match = re.search(r'\d+\s*br\b', housing_text, flags=re.IGNORECASE)
        if bedroom_match:
            bedroom_info = bedroom_match.group(0).strip()
        else:
            # Fall back to whatever sits between "/" and the first "-"
            fallback = housing_text.partition("/")[2].partition("-")[0].strip()
            if fallback:
                bedroom_info = fallback

        # Only digits directly followed by "ft" count as square footage, so
        # other text in that position is not mistaken for an area.
        sqft_match = re.search(r'(\d[\d,]*)\s*ft', housing_text, flags=re.IGNORECASE)
        if sqft_match:
            square_feet = sqft_match.group(1)

    full_address = text_or_default(
        the_soup.find("h2", class_="street-address"), "None listed"
    )

    return {
        "Title": title,
        "Price": price,
        "Bedrooms": bedroom_info,
        "Square Feet": square_feet,
        "Full Address": full_address
    }

In [32]:
def create_dataframe(links_and_soups):
    """Build one DataFrame row per entry of ``links_and_soups``.

    For every parsed page the basic fields are combined with a 0/1 flag for
    each key of ``cleaned_attributes``, the fee flag and today's date; the
    result is then reduced to the master attributes by ``create_new_entry``.
    """
    def build_entry(page_soup):
        row = dict(collect_basic_information(page_soup))
        present_attributes, needs_fee = process_attributes(page_soup)

        for attr in cleaned_attributes:
            row[attr] = 1 if attr in present_attributes else 0

        row["Fee Needed To Apply"] = needs_fee
        row["date_added"] = datetime.now().strftime('%Y-%m-%d')

        # Keep exactly the master attributes, in master order
        return create_new_entry(row)

    return pd.DataFrame([build_entry(page_soup) for page_soup in links_and_soups.values()])

In [35]:
# Create the dataframe: one row per fetched listing, columns in master_attributes order.
# create_dataframe calls process_attributes again, so global_attribute_counts is
# incremented a second time for every listing.
todays_listings_df = create_dataframe(links_and_soups)

Unnamed: 0,Title,Price,Bedrooms,Square Feet,Full Address,monthly,apartment,cats are OK - purrr,dogs are OK - wooof,laundry on site,air conditioning,off-street parking,EV charging,w/d in unit,carport,no smoking,attached garage,detached garage,laundry in bldg,Fee Needed To Apply,wheelchair accessible,no parking,furnished,street parking,no laundry on site,house,w/d hookups,date_added
0,Renovated 1BR Apartment in West LA - Hardwood ...,"$2,515",1br,668,None listed,1,1,1,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,2024-05-17
1,"Situated in Santa Monica!, 1/BD 1/BA, Hardwood...","$2,471",1br,382,"1447 Lincoln Blvd, Santa Monica, CA 90401",1,1,1,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2024-05-17
2,Beautiful One Bedroom Apartment - The Lifestyl...,"$3,405",1br,870,"550 South Barrington Avenue, Los Angeles, CA 9...",1,1,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2024-05-17
3,"Magnificent luxury building. 1 bed, 1 ba, 496 ...","$2,895",1br,496,"3644 Overland Ave, Los Angeles, CA 90034",1,1,1,1,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,2024-05-17
4,Magnificent amenities. Great location. Secure ...,"$3,255",1br,516,"3644 Overland Ave, Los Angeles, CA 90034",1,1,1,1,0,1,0,1,1,1,1,0,0,0,1,1,0,0,0,0,0,0,2024-05-17


In [1]:
# Export the listings to CSV.
# NOTE(review): '/files/...' is an absolute path at the filesystem root, whereas
# pull_craigslist_rates_from_links writes to the relative 'files/...' - confirm which is intended.
# The recorded NameError shows this cell ran in a kernel where todays_listings_df did not exist yet.
todays_listings_df.to_csv('/files/raw_daily_craigslist_listings.csv', index = False)

NameError: name 'todays_listings_df' is not defined