In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
from webdriver_manager.chrome import ChromeDriverManager
import numpy as np
import pandas as pd
from time import sleep
from random import randint

In [2]:
# Launch a visible (non-headless) Chrome session through Splinter, using the
# driver binary that webdriver_manager installs/locates.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

In [9]:
# City label used to name the exported CSV file
city = "Houston"

In [6]:
# Set url
url = 'https://houston.craigslist.org/search/apa#search=1~thumb~0~0'


def _first_text(tag, name, css_class):
    """Return the text of the first `name` element with class `css_class` under `tag`.

    Returns np.nan when no such element exists, so a field that is missing
    from a search result is stored as a missing value instead of raising.
    """
    match = tag.find(name, class_=css_class)
    return np.nan if match is None else match.text


def _parse_listing_page(page_html):
    """Extract the bathroom count and the amenity list from one listing page.

    Parameters
    ----------
    page_html : str
        HTML of an individual listing page.

    Returns
    -------
    tuple
        (baths, amenities). `baths` is np.nan when the first 'attrgroup'
        paragraph is missing or not laid out as expected; `amenities` is an
        empty list when there is no second 'attrgroup' paragraph.
    """
    attr_groups = bs(page_html, 'html.parser').find_all('p', class_='attrgroup')

    # Second line of the first attrgroup, third whitespace-separated token,
    # with its last two characters dropped.
    # NOTE(review): presumably the line looks like '1BR / 1Ba' - confirm
    # against the live page markup.
    try:
        baths = attr_groups[0].text.split('\n')[1].split()[2][:-2]
    except IndexError:
        baths = np.nan

    try:
        amenity_lines = attr_groups[1].text.split('\n')
    except IndexError:
        amenity_lines = []

    # drop the empty strings produced by splitting on newlines
    amenities = [am for am in amenity_lines if len(am) >= 1]
    return baths, amenities


# Create empty list to store scraped data
datatoget = []

# try/finally guarantees the browser is closed even if scraping fails part-way;
# whatever was collected up to that point stays available in `datatoget`.
try:
    browser.visit(url)
    sleep(3)

    # Parse the HTML of the search results page
    html = browser.html
    soup = bs(html, 'html.parser')

    # find and extract relevant data
    search = soup.find_all('div', class_='result-info')

    # Loop through data found and extract relevant information
    for (counter, link) in enumerate(search):

        # scrape dates (text before the first '·' in the meta line)
        dates = link.find_all('div', class_='meta')[0].text.split('·')[0]

        # scrape titles
        titles = link.find_all('a', class_='titlestring')[0].text

        # scrape links
        links = link.find_all('a', class_='titlestring', href=True)[0]['href']

        # optional fields: NaN when the element is not present
        prices = _first_text(link, 'span', 'priceinfo')
        neighborhoods = _first_text(link, 'div', 'supertitle')
        bedrooms = _first_text(link, 'span', 'post-bedrooms')

        # error handling in case there are no sqft displayed
        # (first token of the text with its last three characters dropped)
        try:
            sqft = link.find('span', class_='post-sqft').text.split()[0][:-3]
        except (AttributeError, IndexError):
            sqft = np.nan

        # visit each link separately
        browser.visit(links)

        # ensures that the server does not get overloaded with requests
        sleep(randint(2, 5))

        # after visiting link, extract bath and amenities data
        baths, all_amenities = _parse_listing_page(browser.html)

        # print statements to ensure that the correct data is scraped
        # (`x == x` is False only for NaN, i.e. for a missing value)
        print(f'\nListing Number: {counter+1}')
        print(f'Square Footage: {sqft if sqft == sqft else "not available"}')
        print(f'Price: {prices if prices == prices else "not available"}')
        print(f'Title: {titles}')
        print(f'Date: {dates}')
        print(f'Number of Bedrooms: {bedrooms if bedrooms == bedrooms else "not available"}')
        print(f'Neighborhood: {neighborhoods if neighborhoods == neighborhoods else "not available"}')
        print(f'Number of Bathrooms: {baths if baths == baths else "not available"}')
        print(f'Link: {links}')
        print(f'Amenities: {all_amenities}')

        # create dictionary for our DataFrame
        data = {'date': dates,
                'title': titles,
                'link': links,
                'price': prices,
                'bedroom': bedrooms,
                'sqft': sqft,
                'neighborhood': neighborhoods,
                'bathroom': baths,
                'amenities': all_amenities}

        # append the data
        datatoget.append(data)
finally:
    browser.quit()


Listing Number: 1
Square Footage: 722
Price: $822
Title: Gated Community, Pet Park, Black Appliance Packages Available
Date: 1/11
Number of Bedrooms: 1br
Neighborhood: 8323 Cinnamon Ln, Houston, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-gated-community-pet-park-black/7568580097.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'carport', 'rent period: monthly', 'wheelchair accessible']

Listing Number: 2
Square Footage: 1076
Price: $1,212
Title: All New Energy Efficient A/C Units, Playground, Laundry Facilities
Date: 1/11
Number of Bedrooms: 2br
Neighborhood: 7000 Fonvilla St, Houston, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-all-new-energy-efficient-c/7573016764.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'carport', 'rent period: monthly', 'wheelchair accessible']

Listing Number:


Listing Number: 20
Square Footage: 640
Price: $850
Title: Covered Parking, Black appliances, New Energy Efficient Windows
Date: 1/10
Number of Bedrooms: 1br
Neighborhood: 7511 Beechnut Street, Houston, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-covered-parking-black/7570031549.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 'carport', 'rent period: monthly', 'wheelchair accessible']

Listing Number: 21
Square Footage: 756
Price: $1,059
Title: Pool, Fitness Center, Covered Parking
Date: 1/10
Number of Bedrooms: 1br
Neighborhood: Creekwood Landing Dr, Richwood, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/clute-pool-fitness-center-covered/7568670236.html
Amenities: ['open house dates', 'thursday 2022-12-15', 'friday 2022-12-16', 'saturday 2022-12-17']

Listing Number: 22
Square Footage: 900
Price: $1,370
Title: Private Patios, Reserved Parking, Tiled


Listing Number: 39
Square Footage: 637
Price: $899
Title: On Site Maintenance, On Site Laundry Coming Soon! Smoke Free Building
Date: 1/10
Number of Bedrooms: not available
Neighborhood: 14703 Park Row, Houston, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-on-site-maintenance-on-site/7575770915.html
Amenities: ['apartment', 'no laundry on site', 'off-street parking', 'rent period: monthly']

Listing Number: 40
Square Footage: 860
Price: $1,080
Title: Courtyard Views, Black appliances, Gated Community
Date: 1/10
Number of Bedrooms: 2br
Neighborhood: 7511 Beechnut Street, Houston, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-courtyard-views-black/7572353297.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 'carport', 'rent period: monthly', 'wheelchair accessible']

Listing Number: 41
Square Footage: 1075
Price: $1,288
Title: Black Appliance Pack


Listing Number: 57
Square Footage: 1025
Price: $1,465
Title: Spacious Walk-in Closets, Online Payments Available, Movie Theater
Date: 1/10
Number of Bedrooms: 1br
Neighborhood: not available
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-spacious-walk-in-closets-online/7574996837.html
Amenities: ['apartment', 'w/d in unit', 'detached garage', 'rent period: daily']

Listing Number: 58
Square Footage: 722
Price: $822
Title: Courtesy Patrol, Hardwood Style Flooring, Metro Bus Line Stop Nearby
Date: 1/10
Number of Bedrooms: 1br
Neighborhood: 8323 Cinnamon Ln, Houston, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-courtesy-patrol-hardwood-style/7568901355.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'carport', 'rent period: monthly', 'wheelchair accessible']

Listing Number: 59
Square Footage: not available
Price: $1,199
Title: Luxury At the Best Price!
Date: 1/10


Listing Number: 75
Square Footage: 647
Price: $1,941
Title: Ahh, the comfortable life... Rentals in Houston. 1 Beds, 1 Baths
Date: 1/10
Number of Bedrooms: 1br
Neighborhood: not available
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-ahh-the-comfortable-life/7577165583.html
Amenities: ['apartment', 'w/d hookups', 'attached garage', 'rent period: monthly']

Listing Number: 76
Square Footage: not available
Price: $2,135
Title: We Work With All Credit Types - Call For More Options In Cypress
Date: 1/10
Number of Bedrooms: 4br
Neighborhood: Cypress
Number of Bathrooms: 2
Link: https://houston.craigslist.org/apa/d/katy-we-work-with-all-credit-types-call/7577166287.html
Amenities: ['cats are OK - purrr', 'dogs are OK - wooof', 'house', 'w/d hookups', 'attached garage', 'rent period: monthly']

Listing Number: 77
Square Footage: not available
Price: $2,100
Title: Working With All Credit/Background Types - 4 bedroom 21/2
Date: 1/10
Number of Bedrooms: 4br
Neighborh


Listing Number: 94
Square Footage: 1203
Price: $1,362
Title: Metro Bus Line Stop Nearby, Quaint Neighborhood, Swimming Pool
Date: 1/10
Number of Bedrooms: 3br
Neighborhood: 7900 Bellaire Blvd, Houston, TX
Number of Bathrooms: 2
Link: https://houston.craigslist.org/apa/d/houston-metro-bus-line-stop-nearby/7572947097.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry on site', 'carport', 'rent period: monthly', 'wheelchair accessible']

Listing Number: 95
Square Footage: 800
Price: $1,000
Title: FREE Apartment Locating
Date: 1/10
Number of Bedrooms: 2br
Neighborhood: Houston
Number of Bathrooms: 2
Link: https://houston.craigslist.org/apa/d/houston-free-apartment-locating/7577149242.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'w/d in unit', 'attached garage', 'rent period: monthly', 'wheelchair accessible']

Listing Number: 96
Square Footage: 1103
Price: $1,250
Title: ONLY 1 Left - P


Listing Number: 114
Square Footage: 1025
Price: $1,465
Title: Built-In Wine Racks, Limited Access Gates, Stainless Steel Appliances
Date: 1/10
Number of Bedrooms: 1br
Neighborhood: not available
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-built-in-wine-racks-limited/7574883721.html
Amenities: ['apartment', 'w/d in unit', 'detached garage', 'rent period: daily']

Listing Number: 115
Square Footage: 860
Price: $1,055
Title: Reserved Parking, New Energy Efficient Windows, Home Office
Date: 1/10
Number of Bedrooms: 2br
Neighborhood: 7511 Beechnut Street, Houston, TX
Number of Bathrooms: 1
Link: https://houston.craigslist.org/apa/d/houston-reserved-parking-new-energy/7572652009.html
Amenities: ['air conditioning', 'cats are OK - purrr', 'dogs are OK - wooof', 'apartment', 'laundry in bldg', 'no smoking', 'carport', 'rent period: monthly', 'wheelchair accessible']

Listing Number: 116
Square Footage: 1076
Price: $1,142
Title: Granite Counters, Two Sparkling Swi

In [7]:
# Preview only the first few scraped records; displaying the whole list
# floods the notebook output with one dict per listing.
datatoget[:3]

[{'date': '1/11',
  'title': 'Gated Community, Pet Park, Black Appliance Packages Available',
  'link': 'https://houston.craigslist.org/apa/d/houston-gated-community-pet-park-black/7568580097.html',
  'price': '$822',
  'bedroom': '1br',
  'sqft': '722',
  'neighborhood': '8323 Cinnamon Ln, Houston, TX',
  'bathroom': '1',
  'amenities': ['air conditioning',
   'cats are OK - purrr',
   'dogs are OK - wooof',
   'apartment',
   'laundry on site',
   'carport',
   'rent period: monthly',
   'wheelchair accessible']},
 {'date': '1/11',
  'title': 'All New Energy Efficient A/C Units, Playground, Laundry Facilities',
  'link': 'https://houston.craigslist.org/apa/d/houston-all-new-energy-efficient-c/7573016764.html',
  'price': '$1,212',
  'bedroom': '2br',
  'sqft': '1076',
  'neighborhood': '7000 Fonvilla St, Houston, TX',
  'bathroom': '1',
  'amenities': ['air conditioning',
   'cats are OK - purrr',
   'dogs are OK - wooof',
   'apartment',
   'laundry on site',
   'carport',
   'rent 

In [8]:
# Build the DataFrame from the scraped records: one row per listing dict,
# one column per dict key.
df = pd.DataFrame(data=datatoget)
df

Unnamed: 0,date,title,link,price,bedroom,sqft,neighborhood,bathroom,amenities
0,1/11,"Gated Community, Pet Park, Black Appliance Pac...",https://houston.craigslist.org/apa/d/houston-g...,$822,1br,722,"8323 Cinnamon Ln, Houston, TX",1,"[air conditioning, cats are OK - purrr, dogs a..."
1,1/11,"All New Energy Efficient A/C Units, Playground...",https://houston.craigslist.org/apa/d/houston-a...,"$1,212",2br,1076,"7000 Fonvilla St, Houston, TX",1,"[air conditioning, cats are OK - purrr, dogs a..."
2,1/11,"Energy-efficient appliances, Elevators, Elevat...",https://houston.craigslist.org/apa/d/houston-e...,"$1,390",1br,850,,1,"[apartment, w/d in unit, detached garage, rent..."
3,1/11,"Reserved Parking, Covered Parking, Courtyard V...",https://houston.craigslist.org/apa/d/houston-r...,"$1,055",2br,860,"7511 Beechnut Street, Houston, TX",1,"[air conditioning, cats are OK - purrr, dogs a..."
4,1/11,"Air Conditioning, New Bathtubs and Surrounds, ...",https://houston.craigslist.org/apa/d/houston-a...,$795,1br,684,"6833 Beechnut Street, Houston, TX",1,"[air conditioning, cats are OK - purrr, dogs a..."
...,...,...,...,...,...,...,...,...,...
115,1/10,"Granite Counters, Two Sparkling Swimming Pools...",https://houston.craigslist.org/apa/d/houston-g...,"$1,142",2br,1076,"7000 Fonvilla St, Houston, TX",1,"[air conditioning, cats are OK - purrr, dogs a..."
116,1/10,Save Money,https://houston.craigslist.org/apa/d/houston-s...,$999,1br,716,Houston,1,"[cats are OK - purrr, dogs are OK - wooof, apa..."
117,1/10,Great Deals,https://houston.craigslist.org/apa/d/houston-g...,$864,1br,700,Houston,1,"[cats are OK - purrr, dogs are OK - wooof, apa..."
118,1/10,"Courtesy Patrol, Gated Community, Home Office",https://houston.craigslist.org/apa/d/houston-c...,"$1,100",2br,900,"9550 Long Point Road, Houston, TX",1,"[air conditioning, cats are OK - purrr, dogs a..."


In [10]:
# Write the raw scraped table to '<city>_data_raw.csv', omitting the index column
output_path = f'{city}_data_raw.csv'
df.to_csv(output_path, index=False)