# Source A

A notebook for scraping, processing, and storing (to Airtable) the listings in [Source B](https://www.mobilehomeparkstore.com/mobile-home-parks-for-sale).

In [46]:
from bs4 import BeautifulSoup
import requests
import time
import re

In [71]:
main_url = 'https://www.mobilehomeparkstore.com'
url = 'https://www.mobilehomeparkstore.com/mobile-home-parks-for-sale'

In [8]:
response = requests.get(url)
soup = BeautifulSoup(response.text)

In [9]:
# Get links to each state
links_to_state = soup.select('.column-list a')
links_to_state = [main_url + l['href'] for l in links_to_state]
links_to_state

['https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/alabama/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/arizona/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/arkansas/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/california/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/colorado/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/florida/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/georgia/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/idaho/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/illinois/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/indiana/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/iowa/all',
 'https://www.mobilehomeparkstore.com//mobile-home-parks-for-sale/kansas/all',
 'https://www.mobilehomeparkstore.com//m

Get state names.

In [10]:
states = [s.split('/')[-2] for s in links_to_state]

Go further and get sublinks of each listing/item per state.

In [33]:
listings_dct = {}
total_items = 0

for s, lnk in zip(states, links_to_state):
    response_state = requests.get(lnk)
    soup_state = BeautifulSoup(response_state.text)
    
    listings = soup_state.select('div.item-listing div.item')
    listings_dct[s] = [main_url + l.select_one('a')['href'] for l in listings]
    
    total_items += len(listings_dct[s])
    print(f'Processed {s}. Currently stored {total_items} listings ...')
    
    # If 50 listings
    if total_items >= 50:
        break
    
    time.sleep(5)

Processed alabama. Currently stored 11 listings ...
Processed arizona. Currently stored 14 listings ...
Processed arkansas. Currently stored 17 listings ...
Processed california. Currently stored 28 listings ...
Processed colorado. Currently stored 35 listings ...
Processed florida. Currently stored 55 listings ...


Generate `BeautifulSoup` object to each listing.

In [45]:
# Create separate BeautifulSoup object per listing so I don't have to make requests every time I try
listings_soup_dct = {}

for state in listings_dct.keys():
    listing_links = listings_dct[state]
    listings_soup_dct[state] = []
    
    for link in listing_links:
        response_per_listing = requests.get(link)
        listings_soup_dct[state].append(BeautifulSoup(response_per_listing.text))
        
        time.sleep(5)
    
    print(f'Finished extracting for {state}')

Finished extracting for alabama
Finished extracting for arizona
Finished extracting for arkansas
Finished extracting for california
Finished extracting for colorado
Finished extracting for florida


In [None]:
# Park name = h1 text > re.split(r'\n+', text) [p for p in re.split(r'\n+', text) if p]
# Street address = Same above but use second part and just parse further
# City = Get from h1 as well
# State = Get from h1 as well
# ZIP = Get from h1 as well
# Phone number = Go to contact broker/seller link
# Website = https://www.mobilehomeparkstore.com/
# Total lots = 
# Rent range = 
# Pet policy = 
# Amenities = 
# Latitude = google-map-link attrs={'data-test-id': 'view-on-map'} its `a` child -> href -> split 'query=' then split at ',' get the first
# Longitude = google-map-link attrs={'data-test-id': 'view-on-map'} its `a` child -> href -> split 'query=' then split at ',' get the second
# Source website URL = lnk
# Listing Price = N/A

'\nSandra Ball . Asso. Broker, Coldwell Banker Mcmillan\n\n\n8374 County Road 222, Trinity, AL 35673\n\n\n\n\n\n'

In [79]:
def get_name_and_address(soup):
    header = soup.select_one('h1').text
    header_split = [p for p in re.split(r'\n+', header) if p]
    park_name = header_split[0]
    
    address_split = header_split[1].split(', ')
    
    if len(address_split) == 3:
        street_address = address_split[0]
        city = address_split[1]
        state = address_split[2].split(' ')[0]
        zipcode = address_split[2].split(' ')[1]
    elif len(address_split) == 2:
        street_address = 'N/A'
        city = address_split[0]
        state_zipcode_split = address_split[1].split(' ')
        
        if len(state_zipcode_split) == 2:
            state = address_split[1].split(' ')[0]
            zipcode = address_split[1].split(' ')[1]
        elif len(state_zipcode_split) == 1:
            state = address_split[1].split(' ')[0]
            zipcode = 'N/A'
        
    return park_name, street_address, city, state, zipcode

In [108]:
def find_phone_number(soup):
    contact_button = soup.select_one('turbo-frame#contact_main_col_regular')
    
    if contact_button:
        contact_link = main_url + contact_button.select_one('a')['href']
        
        requests_contact = requests.get(contact_link)
        soup_contact = BeautifulSoup(requests_contact.text)
        
        phone = soup_contact.select_one('a#dlg_phone_button').text
        
        phone_clean = re.sub(r'[^\d]', '', phone)
        phone_clean = '(' + phone_clean[0:3] + ')' + phone_clean[3:6] + '-' + phone_clean[6:]
        
        time.sleep(5)
        
        return phone_clean
    
    return 'N/A'

In [55]:
def find_total_lots(soup):
    for div in soup.select('div.col-lg-6'):
        h3 = div.find('h3')
        
        # First, find the div with containing park info
        if h3 and 'Park Information' in h3.get_text(strip=True):
            # Find table row containing number of MH lots
            for row in div.select('tr'):
                if 'Number of MH Lots' in row.get_text(strip=True):
                    lots = row.select('td')[1].text
                    return lots
    
    return 'N/A'

In [59]:
def find_lot_rent(soup):
    for div in soup.select('div.col-lg-6'):
        h3 = div.find('h3')
        
        # First, find the div with containing park info
        if h3 and 'Park Information' in h3.get_text(strip=True):
            # Find table row containing number of MH lots
            for row in div.select('tr'):
                if 'Average MH Lot Rent' in row.get_text(strip=True):
                    lots = row.select('td')[1].text
                    return lots
    
    return 'N/A'

In [69]:
def find_amenities(soup):
    for h3 in soup.select('h3.headline'):
        if 'Amenities' in h3.text:
            amenities_ul = h3.find_next_sibling('ul')
            amenities_bullets = [li.get_text(strip=True) for li in amenities_ul.find_all('li')]
            amenities = '\n'.join(amenities_bullets)
            
            return amenities
    
    return 'N/A'

In [90]:
def find_listing_price(soup):
    for h1 in soup.select('h1'):
        if 'For Sale' in h1.text:
            return re.sub(r'[^\d]', '', h1.text)

In [123]:
listings_details = {}

for state in listings_soup_dct.keys():
    listings_soup_lst = listings_soup_dct[state]
    listings_link_lst = listings_dct[state]
    listings_details[state] = []
    
    for sp, lnk in zip(listings_soup_lst, listings_link_lst):
        # Park name and address
        try:
            park_name, street_address, city, state2, zipcode = get_name_and_address(sp)
        except:
            header = soup.select_one('h1').text
            header_split = [p for p in re.split(r'\n+', header) if p]
            park_name = header_split[0]
        
        # phone number
        phone_number = find_phone_number(sp)
        
        # website
        website = main_url
        
        # Total lots
        total_lots = find_total_lots(sp)
        
        # Rent range
        rent_range = find_lot_rent(sp).strip('$')
        
        # Get pet policies
        pet_policy = 'N/A'
        
        # Get amenities
        amenities = find_amenities(sp)
        
        # Get latitude and longitude
        latitude = 'N/A'
        longitude = 'N/A'
        
        # Source website URL
        source_url = lnk
        
        # Listing price
        listing_price = find_listing_price(sp)
        
        listings_details[state].append(
            {
                'Park Name': park_name,
                'Street Address': street_address,
                'City': city,
                'State': state2,
                'ZIP': zipcode,
                'Phone Number': phone_number,
                'Website': website,
                'Total Lots': total_lots,
                'Rent Range': rent_range,
                'Pet Policy': pet_policy,
                'Amenities': amenities,
                'Latitude': latitude,
                'Longitude': longitude,
                'Source Website URL': source_url,
                'Listing Price': listing_price
            }
        )

In [124]:
listings_details

{'alabama': [{'Park Name': 'Sandra Ball . Asso. Broker, Coldwell Banker Mcmillan',
   'Street Address': '8374 County Road 222',
   'City': 'Trinity',
   'State': 'AL',
   'ZIP': '35673',
   'Phone Number': '(256)642-9903',
   'Website': 'https://www.mobilehomeparkstore.com',
   'Total Lots': 'N/A',
   'Rent Range': 'N/A',
   'Pet Policy': 'N/A',
   'Amenities': 'N/A',
   'Latitude': 'N/A',
   'Longitude': 'N/A',
   'Source Website URL': 'https://www.mobilehomeparkstore.com//mobile-home-parks/6957345-sandra-ball-asso-broker-coldwell-banker-mcmillan-for-sale-in-trinity-al',
   'Listing Price': '1'},
  {'Park Name': 'Royal Palm Estates',
   'Street Address': '1435 County Road 73',
   'City': 'Slocomb',
   'State': 'AL',
   'ZIP': '36375',
   'Phone Number': '(316)655-8405',
   'Website': 'https://www.mobilehomeparkstore.com',
   'Total Lots': '77',
   'Rent Range': '459',
   'Pet Policy': 'N/A',
   'Amenities': 'N/A',
   'Latitude': 'N/A',
   'Longitude': 'N/A',
   'Source Website URL': '

## Pass to Airtable

In [125]:
from pyairtable import Api, Base
from dotenv import load_dotenv
import os

In [126]:
load_dotenv()

True

In [127]:
AIRTABLE_PAT = os.getenv('AIRTABLE_PAT')
BASE_ID = os.getenv('BASE_ID')
TABLE_NAME = 'Listings'

In [128]:
api = Api(AIRTABLE_PAT)
base = Base(api, BASE_ID)
table = base.table(TABLE_NAME)

In [129]:
records = []

for city in listings_details.keys():
    listings_details_lst = listings_details[city]
    
    for listing in listings_details_lst:
        records.append(listing)

In [131]:
for r in records:
    table.create(r)

## Export in Pandas

In [132]:
import pandas as pd

In [133]:
source_B_df = pd.DataFrame(records)

In [134]:
source_B_df.to_csv('source_B_listings_only.csv')