## ArgenProp Scraping

**Objetivo:** extraer información de departamentos en alquiler en Capital Federal de la página de [ArgenProp](https://www.argenprop.com/) para su posterior carga a Redshift y análisis.

* El script sólo busca departamentos, ordenados por fecha de publicación (los más recientes primero). Como paso adicional, puede editarse la URL para extraer casas, PH, etc.
* También podrán extraerse propiedades en venta. Esto ya se tuvo en cuenta en la estructura al agregar una variable "sell_or_rent_ind" para identificar si se trata de un anuncio de alquiler o de venta.

In [112]:
import re
import requests
import pytz
from bs4 import BeautifulSoup
from datetime import datetime

# Site root and the paginated search URL (the page number gets appended to it)
ap_base_url = 'https://www.argenprop.com'
ap_new_url = f'{ap_base_url}/departamento-alquiler-localidad-capital-federal-orden-masnuevos-pagina-'

# Moment of this run, taken once in UTC and rendered as a date and as a timestamp
utc_timezone = pytz.utc
process_dt_utc = datetime.now(tz=utc_timezone)
process_dt = process_dt_utc.strftime("%Y-%m-%d")
process_dttm = process_dt_utc.strftime("%Y-%m-%d %H:%M:%S %Z")

def ap_request(url, timeout=30):
  """Download `url` and return its HTML parsed as a BeautifulSoup tree.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot hang the whole crawl.

  Returns:
    A BeautifulSoup object built from the response body with the lxml parser.

  Raises:
    requests.HTTPError: If the response status code is not 200.
    requests.RequestException: If the request itself fails (e.g. a timeout).
  """
  response = requests.get(url, timeout=timeout)

  if response.status_code != 200:
    print("Error al obtener la página:", response.status_code)
    # Fail with an explicit, descriptive error: there is no parsed document to
    # return on this path, and callers already wrap this call in try/except.
    raise requests.HTTPError(
        f'Unexpected status code {response.status_code} for {url}',
        response=response,
    )

  # Parse HTML content using lxml
  return BeautifulSoup(response.text, 'lxml')

# Collects the full URL of every listing found on the search result pages
listing_urls = []

In [113]:
# Get the URLs of the most recent listings

# range(1, 25) requests result pages 1 through 24.
for page_number in range(1, 25):

  try:
    page_data = ap_request(ap_new_url + str(page_number))

    for listing_item in page_data.find_all(class_='listing__item'):
      card_link = listing_item.find('a', class_='card')

      # Look for the listing URL inside the "href" attribute
      if card_link and 'href' in card_link.attrs:
        # The href is appended to the site root to build the full URL
        listing_urls.append(ap_base_url + card_link['href'])
  except Exception as e:
    # Best effort: report the failure and move on to the next page. The
    # exception is formatted into the message (str + exception would raise
    # TypeError inside this handler and stop the crawl).
    print(f'Hubo un problema al procesar la página: {e}')

In [114]:
# Create an empty list to store property data and iterate over a list of property listing URLs
listings_data = []

for listing_url in listing_urls:

  try:
    listing_req = ap_request(listing_url)

    # Mandatory details: when one of them is missing, .find() returns None, the
    # .get_text() call raises AttributeError and the listing is skipped by the
    # outer handler below.
    title_address = listing_req.find(class_='titlebar__address').get_text(strip=True)
    title_desc_short = listing_req.find(class_='titlebar__title').get_text(strip=True)
    description_title = listing_req.find(class_='section-description--title').get_text(strip=True)
    listing_price = listing_req.find(class_='titlebar__price').get_text(strip=True)

    # The ID is whatever follows the last "--" in the URL
    listing_id = listing_url.split("--")[-1]

    # Optional details: fall back to None when they are not on the page
    description_node = listing_req.find(class_='section-description--content')
    description_content = description_node.get_text(strip=True) if description_node is not None else None

    # The location container holds the detailed address in its first <p> and
    # the zone in its second one; either may be absent.
    location_node = listing_req.find(class_='location-container')
    location_lines = location_node.find_all('p') if location_node is not None else []
    address_detail = location_lines[0].get_text(strip=True) if len(location_lines) > 0 else None
    address_zone = location_lines[1].get_text(strip=True) if len(location_lines) > 1 else None

    # Extract all property features and keep the most relevant ones
    features_dict = {}

    for features in listing_req.find_all(class_='property-features'):
      for feature in features.find_all('li'):
        feature_text = feature.get_text(strip=True)

        # Split on the FIRST ":" only, so a value that itself contains a colon
        # is still stored as "key: value" instead of being taken for a flag.
        key, separator, value = feature_text.partition(":")

        if separator:
          features_dict[key.strip()] = value.strip()
        else:
          # No ":" at all: assume it's a boolean feature and set it to True
          features_dict[feature_text] = True

    listing_data_temp = {
        'listing_url': listing_url,
        'listing_id': listing_id,
        'title_address': title_address,
        'title_desc_short': title_desc_short,
        'description_title': description_title,
        'description_content': description_content,
        'listing_price': listing_price,
        'address_detail': address_detail,
        'address_zone': address_zone,
        'room_qty': features_dict.get('Cant. Ambientes', None),
        'dorms_qty': features_dict.get('Cant. Dormitorios', None),
        'baths_qty': features_dict.get('Cant. Baños', None),
        'parking_qty': features_dict.get('Cant. Cocheras', None),
        'property_conditions': features_dict.get('Estado', None),
        'building_conditions': features_dict.get('Estado Edificio', None),
        'property_age': features_dict.get('Antiguedad', None),
        'sell_or_rent_ind': features_dict.get('Tipo de operación', None),
        'unit_type': features_dict.get('Tipo de Unidad', None),
        'area_built': features_dict.get('Sup. Cubierta', None),
        'area_not_built': features_dict.get('Sup. Descubierta', None),
        'expenses_amt': features_dict.get('Expensas', None),
        'price_amt': features_dict.get('Precio', None),
        'elevator': features_dict.get('Ascensor', False),
        'pets': features_dict.get('Permite Mascotas', False),
        'gym': features_dict.get('Gimnasio', False),
        'rooftop': features_dict.get('Terraza', False),
        'pool': features_dict.get('Pileta', False),
        'grill': features_dict.get('Parrilla', False),
        'solarium':  features_dict.get('Solarium', False),
        'process_dt': process_dt,
        'process_dttm': process_dttm
    }

    listings_data.append(listing_data_temp)

  except Exception as e:
    # Best effort: a listing that cannot be fetched or lacks a mandatory field
    # is skipped, but the reason is reported instead of being dropped silently.
    print(f'Skipping listing {listing_url}: {e!r}')
    continue

Error al obtener la página: 404
Error al obtener la página: 500


In [141]:
# import copy

# listings_data_copy = copy.deepcopy(listings_data)

In [144]:
# Edit the format of some fields

# Currency prefix followed by an amount. A separator is only consumed when it
# is followed by exactly three digits (a thousands group), so "1.200.000" is
# read in full and a trailing decimal part such as ",50" is left out.
_AMOUNT_RE = re.compile(r'([^\d]+)(\d{1,3}(?:[,.]\d{3})+|\d+)')

# Numeric value optionally followed by its unit of measure
_AREA_RE = re.compile(r'([\d,.]+)\s*(\S*)')


def _to_int_or_none(value):
  """Return `value` as an int, or None when it is missing or not numeric."""
  if value is None:
    return None
  try:
    return int(value)
  except (TypeError, ValueError):
    # One odd value must not abort the formatting of every other listing
    print(f'Error converting to int: {value!r}')
    return None


def _split_amount(raw):
  """Split a money string into (amount, currency); return None if it does not parse."""
  if not isinstance(raw, str):
    return None

  found = _AMOUNT_RE.match(raw)
  if not found:
    return None

  units, amount_str = found.groups()
  amount = int(re.sub(r'[,.]', '', amount_str))
  # NOTE(review): this replaces every "$", so a prefix such as "U$S" would
  # become "UARSS" -- confirm which currency prefixes the site actually uses.
  return amount, units.strip().replace('$', 'ARS')


def _split_area(raw):
  """Split an area string into (int value, unit); (None, None) if it does not match.

  Raises ValueError when the numeric part cannot be converted.
  """
  found = _AREA_RE.match(raw) if isinstance(raw, str) else None
  if not found:
    return None, None

  numeric_value_str, unit_of_measure = found.groups()
  # '.' is dropped as a thousands separator and ',' is read as the decimal mark
  numeric_value = int(float(numeric_value_str.replace('.', '').replace(',', '.')))
  return numeric_value, unit_of_measure or None


for listing in listings_data:

  # 'expenses_ind' is the last key this cell adds, so its presence means the
  # listing was already formatted; skipping it makes re-running the cell safe.
  if 'expenses_ind' in listing:
    continue

  # Convert strings to int
  for qty_field in ('room_qty', 'dorms_qty', 'baths_qty', 'parking_qty', 'property_age'):
    listing[qty_field] = _to_int_or_none(listing[qty_field])

  # Convert areas to int
  for area in ('area_built', 'area_not_built'):
    try:
      area_value, area_units = _split_area(listing[area])
    except ValueError:
      print(f'Error getting the area: {area}')
      print(listing)
      print('')
      area_value, area_units = None, None

    # Both keys are always written so every listing ends up with the same schema
    listing[area] = area_value
    listing[area + '_units'] = area_units

  # Edit Price (if informed)
  price = None
  if listing['listing_price'] != 'Consultar precio':
    price = _split_amount(listing['price_amt'])

  if price is None:
    listing['informs_price_ind'] = False
    listing['price_amt'] = None
    listing['price_amt_units'] = None
  else:
    listing['informs_price_ind'] = True
    listing['price_amt'], listing['price_amt_units'] = price

  # Something similar for expenses (if informed)
  expenses = _split_amount(listing['expenses_amt'])

  if expenses is None:
    listing['expenses_ind'] = False
    listing['expenses_amt'] = None
    listing['expenses_amt_units'] = None
  else:
    listing['expenses_ind'] = True
    listing['expenses_amt'], listing['expenses_amt_units'] = expenses

In [146]:
# Display the first element of listings_data to inspect its contents
listings_data[0]

{'listing_url': 'https://www.argenprop.com/departamento-en-alquiler-en-belgrano-2-ambientes--13608575',
 'listing_id': '13608575',
 'title_address': 'Blanco Encalada  2300',
 'title_desc_short': 'Departamento en Alquiler en Belgrano, Capital Federal',
 'description_title': 'Depto 2 amb con balcon al frente a metros av. Cabildo',
 'description_content': 'Excelente departamento de 2 ambientes apto profesional, a metros de avenida Cabildo, subte D, en plena zona comercial, en el barrio de Belgrano, uno de los más tradicionales de la ciudad de Buenos Aires y uno de los centros comerciales más importantes de la ciudad.Unidad luminosa con balcon corrido al frente, cocina con lavadero incorporado. Cuenta con habitacion con gran placard y baño completo.Refrigeracion por aire acondicionado Split en living.El contrato es comercial con ajuste semestral por ICL.Ingresos demostrables que puedan triplicar el valor de alquiler.Toda la información y las medidas proveídas son sólo estimativas y deberán