In [1]:
# Dependencies
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Testing: fetch the unfiltered California results page and inspect the raw
# listing cards (the <div class="infinite-item"> elements) it contains.
linked = 'https://www.century21.com/real-estate/california/LSCA/'

# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of silently parsing an error page into [].
response = requests.get(linked, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
result = soup.find_all('div', class_='infinite-item')
result

[<div class="infinite-item property-card clearfix property-card-C2180910323" data-brand-cd="C21" data-id="C2180910323" data-latitude="38.02535" data-link="/property/22216-parrotts-ferry-13-sonora-ca-95370-C2180910323" data-listing-id="" data-longitude="-120.40381" data-mls="20181467" data-source-id="56dafcea-766d-461e-97bc-6ee9e5eb9983" data-zip="95370">
 <div class="property-card-clip"> <div class="property-card-image" style="background-image: url(https://www2.century21.com/c21/photo/maxxmax/c21.azureedge.net/308i0/0r6e05264hncmdte4j3a11qnf4i);">
 <div class="property-image-flag newly-listed">Newly Listed</div>
 <div class="property-image-count">
 <div class="image-count-left">&lt;</div>
 <div class="image-count-current">1</div>
 <div class="image-count-total">12</div>
 <div class="image-count-right">&gt;</div>
 </div>
 </div>
 </div>
 <div class="property-card-primary-info">
 <div class="pdp-listing-type sale">FOR SALE</div>
 <a class="listing-price" href="/property/22216-parrotts-fe

In [3]:
# Accumulator for the scraped listings. Despite its name this is a list:
# the scraping cell appends one dict per listing to it.
house_dict = list()

In [4]:
# Loop through the result pages of the filtered search. Every record is labelled
# "Multi-Family" below, so `pt=3` is presumably the multi-family property-type
# filter -- TODO confirm against the site.
page_link = 'https://www.century21.com/real-estate/california/LSCA/?sn=5&sk=Y&pt=3&p={}'


def clean_text(text):
    """Return `text` without surrounding whitespace and without newline characters."""
    return text.strip().replace("\n", "")


def parse_listing(card):
    """Build one listing record from an 'infinite-item' card.

    Parameters
    ----------
    card : element returned by ``find_all('div', class_='infinite-item')``.

    Returns
    -------
    dict or None
        The record to store, or None when a required element (listing type,
        price, street, city line, beds, baths, square feet) is missing, or when
        any of those values or the data-latitude / data-longitude attributes
        is empty.
    """
    try:
        listing_type = card.find('div', class_="pdp-listing-type").text
        price = card.find('a', class_="listing-price").text
        # Number and street address
        street = card.find('div', class_="property-address").text
        # Single line holding city, state and zip code
        city = card.find('div', class_="property-city").text
        bed = card.find('div', class_="property-beds").find('strong').text
        bath = card.find('div', class_="property-baths").find('strong').text
        sqft = card.find('div', class_="property-sqft").find('strong').text
    except AttributeError:
        # find() returned None for a required element: skip this card.
        return None

    # Half-baths are optional and default to "0". Only the "element not found"
    # case is handled, so Ctrl-C / kernel interrupt is no longer swallowed here.
    try:
        half_bath = card.find('div', class_="property-half-baths").find('strong').text
    except AttributeError:
        half_bath = "0"

    # Coordinates are stored as attributes on the card itself
    latitude = card.get("data-latitude")
    longitude = card.get("data-longitude")

    # Keep the listing only if every required field is present and non-empty
    if not all([listing_type, price, street, city, bed, bath, sqft, latitude, longitude]):
        return None

    # The slicing assumes the cleaned line ends with a 2-character state, one
    # separator character and a 5-character zip code -- verify if the site
    # ever returns a different layout (e.g. ZIP+4).
    location = clean_text(city)
    city_state = location[:-6]

    return {
        "Listing_type": listing_type,
        "Price": clean_text(price),
        "Street": clean_text(street),
        "City": city_state[:-3],
        "State": city_state[-2:],
        "Zip_code": location[-5:],
        "Bed": clean_text(bed),
        "Half-bath": clean_text(half_bath),
        "Bath": clean_text(bath),
        "Square_Feet": clean_text(sqft),
        "House_type": "Multi-Family",
        "Latitude": latitude,
        "Longitude": longitude
    }


# Pages 1..4999 are an upper bound; the loop stops earlier once the site runs
# out of listings.
for page in range(1, 5000):
    link = page_link.format(page)
    # A timeout keeps one stalled request from hanging the whole scrape;
    # it raises instead, and everything collected so far stays in house_dict.
    res = requests.get(link, timeout=30)
    new_soup = BeautifulSoup(res.text, 'html.parser')
    new_results = new_soup.find_all('div', class_='infinite-item')

    # A successfully served page without any listing card means we are past the
    # last page. Error responses are only skipped, as before, so a transient
    # failure does not end the scrape early.
    if res.ok and not new_results:
        break

    # Loop through returned results
    for result in new_results:
        house_obj = parse_listing(result)
        if house_obj is not None:
            # Continue to add data into collection house_dict
            house_dict.append(house_obj)

print(house_dict)

[{'Listing_type': 'FOR SALE', 'Price': '$825,000', 'Street': '5417 Lavinia Avenue', 'City': 'Lynwood', 'State': 'CA', 'Zip_code': '90262', 'Bed': '8', 'Half-bath': '0', 'Bath': '4', 'Square_Feet': '3,105', 'House_type': 'Multi-Family', 'Latitude': '33.91195', 'Longitude': '-118.18358'}, {'Listing_type': 'FOR SALE', 'Price': 'Ask', 'Street': '405 Rick #I CT', 'City': 'Ridgecrest', 'State': 'CA', 'Zip_code': '93555', 'Bed': '1', 'Half-bath': '0', 'Bath': '1', 'Square_Feet': '600', 'House_type': 'Multi-Family', 'Latitude': '35.6104618', 'Longitude': '-117.6775826'}, {'Listing_type': 'FOR SALE', 'Price': '$340,000', 'Street': '23201 Fondue Court', 'City': 'Tehachapi', 'State': 'CA', 'Zip_code': '93561', 'Bed': '4', 'Half-bath': '0', 'Bath': '2', 'Square_Feet': '2,001', 'House_type': 'Multi-Family', 'Latitude': '35.08922', 'Longitude': '-118.54579'}, {'Listing_type': 'FOR SALE', 'Price': '$850,000', 'Street': '2368 Kenton Court', 'City': 'Santa Rosa', 'State': 'CA', 'Zip_code': '95407', 'Be

In [5]:
# Turn the collected listing records into a table: one row per listing,
# one column per dictionary key.
house_data = pd.DataFrame(data=house_dict)
house_data

Unnamed: 0,Bath,Bed,City,Half-bath,House_type,Latitude,Listing_type,Longitude,Price,Square_Feet,State,Street,Zip_code
0,4,8,Lynwood,0,Multi-Family,33.91195,FOR SALE,-118.18358,"$825,000",3105,CA,5417 Lavinia Avenue,90262
1,1,1,Ridgecrest,0,Multi-Family,35.6104618,FOR SALE,-117.6775826,Ask,600,CA,405 Rick #I CT,93555
2,2,4,Tehachapi,0,Multi-Family,35.08922,FOR SALE,-118.54579,"$340,000",2001,CA,23201 Fondue Court,93561
3,1,2,Santa Rosa,0,Multi-Family,38.4166,FOR SALE,-122.7191,"$850,000",3156,CA,2368 Kenton Court,95407
4,4,4,North Hollywood,0,Multi-Family,34.192265,FOR SALE,-118.36837,"$899,000",2345,CA,6701 Cleon Avenue,91606
5,4,8,Los Angeles,0,Multi-Family,34.0518,FOR SALE,-118.199,"$839,900",4780,CA,2657 Dobinson Street,90033
6,3,4,Tujunga,0,Multi-Family,34.249823,FOR SALE,-118.27859,"$575,000",1500,CA,9960 Tujunga Canyon Boulevard,91042
7,4,4,Palm Desert,0,Multi-Family,33.725822,FOR SALE,-116.380447,"$479,000",2264,CA,73605 Catalina Way,92260
8,10,20,Culver City,0,Multi-Family,33.9952,FOR SALE,-118.43473,"$5,300,000",7996,CA,4061 WADE Street,90066
9,14,12,Santa Monica,0,Multi-Family,34.04185,FOR SALE,-118.48271,"$4,800,000",9674,CA,2632 MONTANA Avenue,90403


In [6]:
# Export file as a CSV, without the Pandas index, but with the header.
# to_csv does not create missing parent folders, so make sure the target
# folder exists first; otherwise the export fails after the whole scrape.
os.makedirs("Output", exist_ok=True)
house_data.to_csv("Output/Q2-2018_Multi-Family_Data.csv", index=False, header=True)