In [1]:
# Dependencies
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Smoke test: fetch the California listings page (no query filters) and check
# that the 'infinite-item' listing cards can be located in the returned HTML.
linked = 'https://www.century21.com/real-estate/california/LSCA/'

# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
response = requests.get(linked, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
result = soup.find_all('div', class_='infinite-item')
result

[<div class="infinite-item property-card clearfix property-card-C2180910323" data-brand-cd="C21" data-id="C2180910323" data-latitude="38.02535" data-link="/property/22216-parrotts-ferry-13-sonora-ca-95370-C2180910323" data-listing-id="" data-longitude="-120.40381" data-mls="20181467" data-source-id="56dafcea-766d-461e-97bc-6ee9e5eb9983" data-zip="95370">
 <div class="property-card-clip"> <div class="property-card-image" style="background-image: url(https://www2.century21.com/c21/photo/maxxmax/c21.azureedge.net/308i0/0r6e05264hncmdte4j3a11qnf4i);">
 <div class="property-image-flag newly-listed">Newly Listed</div>
 <div class="property-image-count">
 <div class="image-count-left">&lt;</div>
 <div class="image-count-current">1</div>
 <div class="image-count-total">12</div>
 <div class="image-count-right">&gt;</div>
 </div>
 </div>
 </div>
 <div class="property-card-primary-info">
 <div class="pdp-listing-type sale">FOR SALE</div>
 <a class="listing-price" href="/property/22216-parrotts-fe

In [3]:
# Collection of scraped listings: despite the name this is a list, to which
# one dict per listing is appended by the scraping loop.
house_dict = list()

In [4]:
# Scrape listing cards from result pages 1-49 of the filtered Century 21 search
# and append one dict per complete listing to house_dict.
# NOTE(review): the previous comment described the filter as "Single House",
# but every record is tagged "Co op" below - confirm what `pt=6` selects.
page_link = 'https://www.century21.com/real-estate/california/LSCA/?sn=5&sk=Y&pt=6&p={}'


def _clean_text(text):
    """Strip surrounding whitespace and remove embedded newlines from scraped text."""
    return text.strip().replace("\n", "")


# Start from an empty collection so re-running this cell does not append
# duplicate records to the ones gathered by a previous run.
house_dict.clear()

for link in (page_link.format(page) for page in range(1, 50)):
    # A timeout keeps one stalled connection from hanging the whole scrape.
    res = requests.get(link, timeout=30)
    new_soup = BeautifulSoup(res.text, 'html.parser')
    new_results = new_soup.find_all('div', class_='infinite-item')

    # Each 'infinite-item' div is one listing card.
    for result in new_results:
        try:
            # Listing type, e.g. "FOR SALE" (stored as scraped, not cleaned).
            listing_type = result.find('div', class_="pdp-listing-type").text
            price = _clean_text(result.find('a', class_="listing-price").text)
            street = _clean_text(result.find('div', class_="property-address").text)
            # Combined location text; the slicing below relies on it ending
            # with a 2-letter state, a space and a 5-digit zip ("<city> ST 12345").
            location = _clean_text(result.find('div', class_="property-city").text)
            bed = _clean_text(result.find('div', class_="property-beds").find('strong').text)
            bath = _clean_text(result.find('div', class_="property-baths").find('strong').text)
            sqft = _clean_text(result.find('div', class_="property-sqft").find('strong').text)
        except AttributeError:
            # find() returned None for a required element: skip this card.
            continue

        # Half-bath is optional: default to "0" when the element is absent.
        # Only AttributeError (find() returning None) is expected here.
        try:
            half_bath = _clean_text(
                result.find('div', class_="property-half-baths").find('strong').text
            )
        except AttributeError:
            half_bath = "0"

        # Coordinates are attributes of the card's root div.
        latitude = result.get("data-latitude")
        longitude = result.get("data-longitude")

        # Keep the listing only when every required field is non-empty
        # (text fields are checked after cleaning, so whitespace-only values
        # are rejected too).
        required = (listing_type, price, street, location, bed, bath, sqft,
                    latitude, longitude)
        if not all(required):
            continue

        # "<city> ST" once the trailing " 12345" has been removed.
        city_state = location[:-6]
        house_obj = {
            "Listing_type": listing_type,
            "Price": price,
            "Street": street,
            "City": city_state[:-3],
            "State": city_state[-2:],
            "Zip_code": location[-5:],
            "Bed": bed,
            "Half-bath": half_bath,
            "Bath": bath,
            "Square_Feet": sqft,
            "House_type": "Co op",
            "Latitude": latitude,
            "Longitude": longitude
        }

        # Continue to add data into collection house_dict
        house_dict.append(house_obj)

print(house_dict)

[{'Listing_type': 'FOR SALE', 'Price': '$329,000', 'Street': '84-H Calle Aragon', 'City': 'Laguna Woods', 'State': 'CA', 'Zip_code': '92637', 'Bed': '2', 'Half-bath': '0', 'Bath': '2', 'Square_Feet': '1,009', 'House_type': 'Co op', 'Latitude': '33.6049602349011', 'Longitude': '-117.710865510578'}, {'Listing_type': 'FOR SALE', 'Price': '$249,500', 'Street': '389-O Avenida Castilla', 'City': 'Laguna Woods', 'State': 'CA', 'Zip_code': '92637', 'Bed': '2', 'Half-bath': '0', 'Bath': '2', 'Square_Feet': '1,040', 'House_type': 'Co op', 'Latitude': '33.607484967', 'Longitude': '-117.716356003'}, {'Listing_type': 'FOR SALE', 'Price': '$375,000', 'Street': '180-B Avenida Majorca', 'City': 'Laguna Woods', 'State': 'CA', 'Zip_code': '92637', 'Bed': '2', 'Half-bath': '1', 'Bath': '1', 'Square_Feet': '953', 'House_type': 'Co op', 'Latitude': '33.602855704', 'Longitude': '-117.705952577'}, {'Listing_type': 'FOR SALE', 'Price': '$250,000', 'Street': '2282-A Via Mariposa', 'City': 'Laguna Woods', 'Stat

In [5]:
# Build a DataFrame from the scraped listing dicts (one row per listing)
# and display it as the cell output.
house_data = pd.DataFrame(data=house_dict)
house_data

Unnamed: 0,Bath,Bed,City,Half-bath,House_type,Latitude,Listing_type,Longitude,Price,Square_Feet,State,Street,Zip_code
0,2,2,Laguna Woods,0,Co op,33.6049602349011,FOR SALE,-117.710865510578,"$329,000",1009,CA,84-H Calle Aragon,92637
1,2,2,Laguna Woods,0,Co op,33.607484967,FOR SALE,-117.716356003,"$249,500",1040,CA,389-O Avenida Castilla,92637
2,1,2,Laguna Woods,1,Co op,33.602855704,FOR SALE,-117.705952577,"$375,000",953,CA,180-B Avenida Majorca,92637
3,2,2,Laguna Woods,0,Co op,33.619265,FOR SALE,-117.720671,"$250,000",1009,CA,2282-A Via Mariposa,92637
4,2,2,Laguna Woods,0,Co op,33.607957098,FOR SALE,-117.714182613,"$220,000",1040,CA,29-O Calle Aragon,92637
5,2,2,Torrance,0,Co op,33.82979,FOR SALE,-118.34198,"$485,000",1300,CA,3110 Merrill Drive 85,90503
6,2,2,Laguna Woods,0,Co op,33.596903069,FOR SALE,-117.704510971,"$240,000",1057,CA,870-Q Avenida Sevilla,92637
7,1,2,Laguna Woods,0,Co op,33.607472182,FOR SALE,-117.717850476,"$273,000",940,CA,308-H Avenida Castilla,92637
8,2,2,Laguna Woods,0,Co op,33.596903069,FOR SALE,-117.704510971,"$339,000",1010,CA,829-A Via Alhambra,92637
9,2,2,Torrance,0,Co op,33.8301750343789,FOR SALE,-118.34265835919,"$485,000",1339,CA,3210 Merrill Dr 40,90503


In [6]:
# Export file as a CSV, without the Pandas index, but with the header
output_path = "Output/Q2-2018_Co-op_Data.csv"
# Create the target directory first: to_csv does not create missing folders
# and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
house_data.to_csv(output_path, index=False, header=True)