# HomeMatch - Using 'CompreOAlquile.com'

Here we demonstrate how to use real listings from a real website. Because the site sits behind a Cloudflare block, we first download the webpages by hand, and then extract the relevant information from those saved pages to build a pandas DataFrame.

First we scrape a single saved webpage step by step, and finally we wrap those steps in a class that does the whole job.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

Explore a single web page first, so that the same steps can later be applied to all of them

In [2]:
# Read one locally saved listing page into memory.
with open("webpages/Apts1.html", "r", encoding='utf-8') as f:
    html_content = f.read()

# Name the parser explicitly: the FrameLoader class written later in this
# notebook parses with 'html.parser', so the exploration here sees the same
# tree that the generated script will see (and bs4 does not have to guess).
soup = BeautifulSoup(html_content, features='html.parser')

List all the image links

In [3]:
# Every <img> tag on the page; `imgs` is reused by the next cell.
imgs = soup.find_all('img')

# Keep only sources ending in 'true' and drop the query string.
# `img.get('src')` returns None for an <img> without a src attribute, so
# test for a value before calling `.endswith` on it.
list_imgs = [
    {'links': src.split('?')[0]}
    for src in (img.get('src') for img in imgs)
    if src and src.endswith('true')
]
pd.json_normalize(list_imgs)

Unnamed: 0,links
0,https://img10.naventcdn.com/avisos/20/00/60/64...
1,https://img10.naventcdn.com/avisos/20/00/63/06...
2,https://img10.naventcdn.com/avisos/20/00/60/64...
3,https://img10.naventcdn.com/avisos/20/00/66/99...
4,https://img10.naventcdn.com/avisos/20/00/90/47...
5,https://img10.naventcdn.com/avisos/20/00/60/35...
6,https://img10.naventcdn.com/avisos/20/00/65/22...
7,https://img10.naventcdn.com/avisos/20/00/54/99...
8,https://img10.naventcdn.com/avisos/20/00/90/79...
9,https://img10.naventcdn.com/avisos/20/00/64/72...


Lists all the descriptions

In [4]:
# The alt text of each image doubles as a short listing description.
# Images without an alt attribute, or with an alt of 20 characters or
# fewer, are left out.
alt_texts = (img.get('alt') for img in imgs)
descriptions = [
    {'description': alt}
    for alt in alt_texts
    if alt is not None and len(alt) > 20
]
pd.json_normalize(descriptions)

Unnamed: 0,description
0,"Proyecto de apartamentos , Panamá · Apartament..."
1,"Proyecto de playa , Chame · En Venta Playa Car..."
2,"Proyecto de apartamentos , Panamá · PH Altamir..."
3,Piscina · Nuevo Proyecto en Santa Maria - Proy...
4,"Proyecto de apartamentos , Panamá · Proyecto d..."
5,"Proyecto de apartamentos , Panamá · En Venta T..."
6,"Proyecto de apartamentos , Panamá · Apartament..."
7,"Proyecto de apartamentos , Panamá · Proyecto É..."
8,"Proyecto de apartamentos , Panamá · Proyecto e..."
9,FACHADA · En Venta PH Armonía - Cinta Costera ...


More about the apartment description

In [5]:
# Anchors whose text is longer than 40 characters are kept as the long
# apartment descriptions; shorter anchor texts are discarded.
anchors = soup.find_all('a')
link_texts = (anchor.text for anchor in anchors)
anchor_texts = [
    {'apartment_description': text}
    for text in link_texts
    if len(text) > 40
]
pd.json_normalize(anchor_texts)

Unnamed: 0,apartment_description
0,"Planifícate y Vive en ph Aurora, ubicado en el..."
1,La mejor y más completa comunidad de playa a t...
2,Ph Altamira Residences es un edificio de entre...
3,Corotú es lo que faltaba en Santa Maria! Edifi...
4,Mucho más que una ubicación privilegiada. La u...
5,The Reserve está localizado en una de las zona...
6,Últimas 3 unidades en el proyecto más elegante...
7,"Época, un proyecto de apartamentos de 55 a 65 ..."
8,Generation Tower es un concepto divertido e in...
9,Balboa District es un proyecto ambicioso y mod...


Get the square meters, rooms, bathrooms and parking spaces of each listing (some listings have no parking, which is why there is a `fillna` at the end)

In [6]:
# Column names, assigned to the spans of each <h3> purely by position.
# NOTE(review): the rendered output has rows such as sqm=5 / bathroom=66,
# which suggests the spans do not always come in this order -- confirm
# against the page markup.
span_type = ['sqm', 'room', 'bathroom', 'parking']

info_texts = soup.find_all('h3')
span_texts = []
for heading in info_texts:
    spans = heading.find_all('span')
    if len(spans) == 0:
        # <h3> without any <span> carries no listing figures
        continue
    # the first space-separated token of each span is read as an integer
    numbers = [int(span.text.split(' ')[0]) for span in spans]
    span_texts.append(dict(zip(span_type, numbers)))

# Listings with fewer spans than column names leave gaps; show them as 0.
pd.json_normalize(span_texts).fillna(0)

Unnamed: 0,sqm,room,bathroom,parking
0,126,2,0.0,0.0
1,5,2,66.0,0.0
2,6,3,0.0,0.0
3,199,2,99.0,0.0
4,134,1,56.0,0.0
5,4,3,202.0,0.0
6,3,4,0.0,0.0
7,130,2,65.0,0.0
8,384,1,40.0,0.0
9,86,1,45.0,0.0


Extract the location from the webpage

In [7]:
# Each <h2> holds a comma-separated location; the part before the first
# comma is kept as the neighborhood.
div_list = soup.find_all('h2')
locations_list = [
    {'neighborhood': heading.text.split(',')[0]}
    for heading in div_list
]
pd.json_normalize(locations_list)

Unnamed: 0,neighborhood
0,San Francisco
1,Punta Chame
2,Bella Vista
3,Santa María
4,Avenida Balboa
5,Santa María
6,Costa Del Este
7,Parque Lefevre
8,Costa Del Este
9,Avenida Balboa


Collect the price

In [8]:
# Amounts such as "USD 255,000": the group captures "USD" plus the integer
# part with thousands separators; an optional decimal part is matched but
# not captured.
price_pattern = re.compile(r'(?<!USD\s)(USD\s*\d{1,3}(?:,\d{3})*)(?:\.\d{1,2})?(?!\d)')

# Prices found in the text of each <div>, in document order.
info_price = (price_pattern.findall(div.text) for div in soup.find_all('div'))

# Only the first <div> that contains any price is used. Fall back to an
# empty list when no <div> has one, instead of failing on `[0]`.
# (re.findall never returns None, so no None check is needed.)
first_hit = next((found for found in info_price if found), [])

# "USD 255,000" -> 255000
info_price = [int(price.split('USD')[1].replace(',', '').strip()) for price in first_hit]

# Amounts of 5000 or less are dropped.
info_price = [{'price': price} for price in info_price if price > 5000]
pd.json_normalize(info_price)

Unnamed: 0,price
0,255000
1,175000
2,255000
3,374595
4,207200
5,706350
6,930800
7,126800
8,178500
9,139534


#### Let's build a class that covers everything above and repeat for all the webpages

Below we write Python code that does everything above and produces one document for each webpage in the directory

In [9]:
%%writefile build_documents.py

from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import argparse
import os
import logging
import sys

# Route log records to stdout through a single root handler. Attaching a
# second StreamHandler to `logger` as well would print every record twice
# once INFO records are enabled (own handler + propagation to the root).
logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger(__name__)
# The root logger defaults to WARNING, which discarded the "processing ..."
# progress messages emitted with logger.info(); enable INFO for this module.
logger.setLevel(logging.INFO)

class FrameLoader:
    """Build one tabular document per saved listing page.

    Every ``build_*`` method takes a parsed page (the object returned by
    ``read_webpage``) and returns a single-topic ``pandas.DataFrame``.
    ``build_documents`` runs all of them for each page and writes their
    column-wise concatenation to disk.

    NOTE(review): the frames are joined by row position only, so a page on
    which the extractors find different numbers of items produces
    misaligned rows -- confirm against real pages.
    """

    # Amounts such as "USD 255,000": the group captures "USD" plus the
    # integer part with thousands separators; an optional decimal part is
    # matched but not captured.
    _PRICE_RE = re.compile(r'(?<!USD\s)(USD\s*\d{1,3}(?:,\d{3})*)(?:\.\d{1,2})?(?!\d)')

    # Output formats that compile_and_save knows how to write.
    SUPPORTED_FORMATS = ('csv', 'xlsx', 'parquet')

    def __init__(self, webdir):
        """Remember the directory that holds the saved web pages."""
        self.webdir = webdir

    def _check_format(self, output_format):
        """Raise ValueError unless ``output_format`` can be written."""
        if output_format not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"unsupported output format {output_format!r}; "
                f"expected one of {', '.join(self.SUPPORTED_FORMATS)}"
            )

    def list_files(self):
        """Return the names of the regular files in ``webdir``, sorted.

        Sub-directories are skipped because ``read_webpage`` can only open
        files, and the names are sorted so that the ``documentNN`` numbering
        does not depend on the arbitrary order of ``os.listdir``.
        """
        return sorted(
            name
            for name in os.listdir(self.webdir)
            if os.path.isfile(os.path.join(self.webdir, name))
        )

    def read_webpage(self, webpage=None):
        """Parse the file ``webpage`` (a name inside ``webdir``) as HTML.

        Returns:
            The BeautifulSoup tree built with the ``html.parser`` backend.
        """
        path = os.path.join(self.webdir, webpage)
        with open(path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        return BeautifulSoup(html_content, features='html.parser')

    def build_imglinks(self, soup):
        """Return a ``links`` column with the listing image URLs.

        Only ``src`` values ending in ``'true'`` are kept, and everything
        from the first ``?`` on is removed. ``<img>`` tags without a ``src``
        attribute are ignored instead of raising AttributeError.
        """
        sources = (img.get('src') for img in soup.find_all('img'))
        records = [
            {'links': src.split('?')[0]}
            for src in sources
            if src and src.endswith('true')
        ]
        return pd.DataFrame(records, columns=['links'])

    def build_descriptions(self, soup):
        """Return a ``description`` column from image ``alt`` texts.

        Images without an ``alt`` attribute, or with an ``alt`` of 20
        characters or fewer, are skipped.
        """
        alts = (img.get('alt') for img in soup.find_all('img'))
        records = [
            {'description': alt}
            for alt in alts
            if alt is not None and len(alt) > 20
        ]
        return pd.DataFrame(records, columns=['description'])

    def build_home_descriptions(self, soup):
        """Return an ``apartment_description`` column from anchor texts.

        Only anchors whose text is longer than 40 characters are kept.
        """
        texts = (anchor.text for anchor in soup.find_all('a'))
        records = [
            {'apartment_description': text}
            for text in texts
            if len(text) > 40
        ]
        return pd.DataFrame(records, columns=['apartment_description'])

    def build_home_locations(self, soup):
        """Return a ``neighborhood`` column from the ``<h2>`` headings.

        The neighborhood is the part of the heading before the first comma.
        """
        records = [
            {'neighborhood': heading.text.split(',')[0]}
            for heading in soup.find_all('h2')
        ]
        return pd.DataFrame(records, columns=['neighborhood'])

    def build_home_info(self, soup):
        """Return ``sqm``, ``room``, ``bathroom`` and ``parking`` columns.

        For every ``<h3>`` that contains ``<span>`` elements, the first
        space-separated token of each span is read as an integer and the
        values are assigned to the column names by position. Missing
        trailing values (e.g. no parking) become 0, and all four columns
        are always present so every document has the same schema.

        NOTE(review): the mapping is purely positional; if a listing omits
        a leading figure the remaining ones shift into the wrong columns --
        confirm the span order against the page markup.

        Raises:
            ValueError: if a span does not start with an integer.
        """
        span_type = ['sqm', 'room', 'bathroom', 'parking']
        records = []
        for heading in soup.find_all('h3'):
            spans = heading.find_all('span')
            if not spans:
                continue
            values = [int(span.text.split(' ')[0]) for span in spans]
            records.append(dict(zip(span_type, values)))
        # sometimes there are no parkings
        return pd.DataFrame(records, columns=span_type).fillna(0)

    def build_home_prices(self, soup, min_price=5000):
        """Return a ``price`` column with the listing prices as integers.

        The prices are taken from the first ``<div>`` (in document order)
        whose text contains at least one "USD ..." amount. Amounts that are
        not greater than ``min_price`` are dropped. A page without any
        price yields an empty frame instead of raising IndexError.
        """
        found_per_div = (
            self._PRICE_RE.findall(div.text) for div in soup.find_all('div')
        )
        # Stop at the first div that has prices; later divs are not needed.
        first_hit = next((found for found in found_per_div if found), [])
        amounts = (
            int(raw.split('USD')[1].replace(',', '').strip())
            for raw in first_hit
        )
        records = [{'price': amount} for amount in amounts if amount > min_price]
        return pd.DataFrame(records, columns=['price'])

    def compile_and_save(self, *args, path, output_format='csv'):
        """Concatenate the frames column-wise and write them to ``path``.

        Args:
            *args: the DataFrames to place side by side.
            path: destination file.
            output_format: one of ``'csv'``, ``'xlsx'`` or ``'parquet'``.

        Raises:
            ValueError: if ``output_format`` is not supported (previously
                nothing was written and no error was reported).
        """
        self._check_format(output_format)
        frame = pd.concat(list(args), axis=1)
        if output_format == 'xlsx':
            frame.to_excel(path, index=False, sheet_name='HomeMatch')
        elif output_format == 'csv':
            frame.to_csv(path, header=True, index=False)
        else:
            frame.to_parquet(path, index=False)

    def build_documents(self, webpages, output_folder, output_format):
        """Write ``documentNN.<output_format>`` for every page in ``webpages``.

        Args:
            webpages: file names inside ``webdir``, processed in order.
            output_folder: destination directory, created (including any
                missing parents) when it does not exist yet.
            output_format: one of ``'csv'``, ``'xlsx'`` or ``'parquet'``.

        Raises:
            ValueError: if ``output_format`` is not supported; raised before
                any folder or file is created.
        """
        self._check_format(output_format)
        os.makedirs(output_folder, exist_ok=True)
        for n, webpage in enumerate(webpages):
            logger.info("processing %s", webpage)
            path = os.path.join(output_folder, f'document{n:02}.{output_format}')

            soup = self.read_webpage(webpage)
            frames = (
                self.build_imglinks(soup),
                self.build_descriptions(soup),
                self.build_home_descriptions(soup),
                self.build_home_info(soup),
                self.build_home_locations(soup),
                self.build_home_prices(soup),
            )
            self.compile_and_save(*frames, path=path, output_format=output_format)
         
def main(args):
    """Convert every page found in ``args.webdir`` into a document.

    Args:
        args: parsed command-line namespace providing ``webdir``,
            ``output_folder`` and ``output_format``.
    """
    source_dir = args.webdir
    target_dir = args.output_folder
    fmt = args.output_format

    loader = FrameLoader(source_dir)
    loader.build_documents(
        webpages=loader.list_files(),
        output_folder=target_dir,
        output_format=fmt,
    )


if __name__ == "__main__":
    # Command-line entry point: parse the options and run the pipeline.
    parser = argparse.ArgumentParser(description='builds a dataframe of the webpages selected')
    parser.add_argument('-w', '--webdir', type=str, default='webpages',
                        help='directory holding the saved web pages')
    parser.add_argument('-o', '--output-folder', type=str, default='documents',
                        help='directory where the documents are written')
    # Restrict the format to the ones FrameLoader.compile_and_save handles,
    # so a typo is rejected by argparse up front.
    parser.add_argument('-f', '--output-format', type=str, default='csv',
                        choices=('csv', 'xlsx', 'parquet'),
                        help='file format of the generated documents')

    args = parser.parse_args()
    main(args)

Overwriting build_documents.py


In [10]:
!python build_documents.py -w webpages -o documents -f xlsx