In [3]:
# Dependencies
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Smoke test: fetch the unfiltered California listing page and show the
# listing cards (div.infinite-item) that the scraper below will parse.
linked = 'https://www.century21.com/real-estate/california/LSCA/'

# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(linked, timeout=30)
soup = BeautifulSoup(response.text, 'html.parser')
result = soup.find_all('div', class_='infinite-item')
result

[<div class="infinite-item property-card clearfix property-card-C2180910323" data-brand-cd="C21" data-id="C2180910323" data-latitude="38.02535" data-link="/property/22216-parrotts-ferry-13-sonora-ca-95370-C2180910323" data-listing-id="" data-longitude="-120.40381" data-mls="20181467" data-source-id="56dafcea-766d-461e-97bc-6ee9e5eb9983" data-zip="95370">
 <div class="property-card-clip"> <div class="property-card-image" style="background-image: url(https://www2.century21.com/c21/photo/maxxmax/c21.azureedge.net/308i0/0r6e05264hncmdte4j3a11qnf4i);">
 <div class="property-image-flag newly-listed">Newly Listed</div>
 <div class="property-image-count">
 <div class="image-count-left">&lt;</div>
 <div class="image-count-current">1</div>
 <div class="image-count-total">12</div>
 <div class="image-count-right">&gt;</div>
 </div>
 </div>
 </div>
 <div class="property-card-primary-info">
 <div class="pdp-listing-type sale">FOR SALE</div>
 <a class="listing-price" href="/property/22216-parrotts-fe

In [4]:
# Accumulator for the scraped listings: a list that receives one dict per house
house_dict = list()

In [5]:
# Crawl the paginated Century 21 California search results (URL carries the
# Single House filter) and append one dict per usable listing to house_dict.
page_link = 'https://www.century21.com/real-estate/california/LSCA/?sn=5&sk=Y&pt=1&p={}'


def _clean(text):
    """Return text stripped of surrounding whitespace with newlines removed."""
    return text.strip().replace("\n", "")


def _strong_text(card, css_class):
    """Return the text of the <strong> tag inside the card's div of the given class.

    Raises AttributeError when the div or its <strong> child is missing,
    because BeautifulSoup's find() returns None in that case.
    """
    return card.find('div', class_=css_class).find('strong').text


# 5000 is only a safety cap on the page number; the loop normally ends earlier.
for page in range(1, 5000):
    link = page_link.format(page)
    # Without a timeout, requests waits indefinitely on a stalled connection.
    res = requests.get(link, timeout=30)
    new_soup = BeautifulSoup(res.text, 'html.parser')
    new_results = new_soup.find_all('div', class_='infinite-item')

    # A page with no listing cards presumably means we are past the last page
    # of results, so stop rather than requesting every page up to the cap.
    if not new_results:
        break

    # Loop through returned results
    for result in new_results:
        try:
            # Listing type, e.g. "FOR SALE"
            listing_type = result.find('div', class_="pdp-listing-type").text
            # Asking price
            price = result.find('a', class_="listing-price").text
            # Number and street address
            street = result.find('div', class_="property-address").text
            # Combined city, state, and zip code text
            city = result.find('div', class_="property-city").text
            # Bedroom count, full-bath count, and square footage
            bed = _strong_text(result, "property-beds")
            bath = _strong_text(result, "property-baths")
            sqft = _strong_text(result, "property-sqft")
        except AttributeError:
            # A required element is missing from this card; skip the listing.
            continue

        # Half-baths are optional: a card without that element counts as 0.
        try:
            half_bath = _strong_text(result, "property-half-baths")
        except AttributeError:
            half_bath = "0"

        # Coordinates are stored as attributes on the card element itself
        latitude = result.get("data-latitude")
        longitude = result.get("data-longitude")

        # Keep the listing only if every required field is present and non-empty
        if not (listing_type and price and street and city and bed and bath
                and sqft and latitude and longitude):
            continue

        # Fixed-position slicing: assumes the cleaned text ends with
        # "<city> <2-letter state> <5-digit zip>".
        location = _clean(city)
        city_state = location[:-6]

        # Continue to add data into collection house_dict
        house_dict.append({
            "Listing_type": listing_type,
            "Price": _clean(price),
            "Street": _clean(street),
            "City": city_state[:-3],
            "State": city_state[-2:],
            "Zip_code": location[-5:],
            "Bed": _clean(bed),
            "Half-bath": _clean(half_bath),
            "Bath": _clean(bath),
            "Square_Feet": _clean(sqft),
            "House_type": "Single Family",
            "Latitude": latitude,
            "Longitude": longitude
        })

print(house_dict)

[{'Listing_type': 'FOR SALE', 'Price': '$1,023,000', 'Street': '2959 Reservoir Dr', 'City': 'Simi Valley', 'State': 'CA', 'Zip_code': '93065', 'Bed': '4', 'Half-bath': '1', 'Bath': '3', 'Square_Feet': '3,964', 'House_type': 'Single Family', 'Latitude': '34.288917', 'Longitude': '-118.739461'}, {'Listing_type': 'FOR SALE', 'Price': '$979,000', 'Street': '1038 Allen Street', 'City': 'Petaluma', 'State': 'CA', 'Zip_code': '94954', 'Bed': '5', 'Half-bath': '1', 'Bath': '3', 'Square_Feet': '2,858', 'House_type': 'Single Family', 'Latitude': '38.247959', 'Longitude': '-122.594025'}, {'Listing_type': 'FOR SALE', 'Price': '$777,000', 'Street': '25123 Belmont Ave', 'City': 'Hayward', 'State': 'CA', 'Zip_code': '94542', 'Bed': '4', 'Half-bath': '0', 'Bath': '2', 'Square_Feet': '1,287', 'House_type': 'Single Family', 'Latitude': '37.65842', 'Longitude': '-122.0718'}, {'Listing_type': 'FOR SALE', 'Price': '$659,500', 'Street': '5320 Queens St', 'City': 'Ventura', 'State': 'CA', 'Zip_code': '93003'

In [6]:
# Build a table from the scraped records: one row per listing, one column per key
house_data = pd.DataFrame(data=house_dict)
house_data

Unnamed: 0,Bath,Bed,City,Half-bath,House_type,Latitude,Listing_type,Longitude,Price,Square_Feet,State,Street,Zip_code
0,3,4,Simi Valley,1,Single Family,34.288917,FOR SALE,-118.739461,"$1,023,000",3964,CA,2959 Reservoir Dr,93065
1,3,5,Petaluma,1,Single Family,38.247959,FOR SALE,-122.594025,"$979,000",2858,CA,1038 Allen Street,94954
2,2,4,Hayward,0,Single Family,37.65842,FOR SALE,-122.0718,"$777,000",1287,CA,25123 Belmont Ave,94542
3,2,3,Ventura,0,Single Family,34.27213,FOR SALE,-119.22216,"$659,500",1478,CA,5320 Queens St,93003
4,2,3,La Mirada,0,Single Family,33.91568,FOR SALE,-118.0154,"$615,000",1452,CA,14828 Ramhurst Drive,90638
5,2,2,Felton,0,Single Family,37.111,FOR SALE,-122.04515,"$565,000",1324,CA,12376 Coleman Avenue,95018
6,2,3,Los Osos,0,Single Family,35.32908,FOR SALE,-120.82576,"$545,000",1590,CA,1251 16th St.,93402
7,3,4,Cotati,0,Single Family,38.32911,FOR SALE,-122.69142,"$529,900",1703,CA,6575 Santero Way,94931
8,1,3,Los Angeles,0,Single Family,34.0524874162654,FOR SALE,-118.182296363731,"$525,000",1026,CA,3852 Bostwick St,90063
9,1,3,Oxnard,0,Single Family,34.185,FOR SALE,-119.19289,"$459,000",1050,CA,1305 W Cedar St,93033


In [7]:
# Export file as a CSV, without the Pandas index, but with the header.
# Create the output folder first: to_csv does not create missing directories.
output_path = "Output/Q2-2018_Single_Family_Data.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
house_data.to_csv(output_path, index=False, header=True)