In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
from bs4 import BeautifulSoup

### Auxiliary functions

In [7]:
def generate_soup(file_path):
    """Parse the HTML file at ``file_path`` into a BeautifulSoup tree.

    The file is opened as UTF-8 text and parsed with the built-in
    ``'html.parser'`` backend.
    """
    with open(file_path, mode='r', encoding='utf-8') as html_file:
        return BeautifulSoup(html_file, 'html.parser')

In [8]:
def extract_data(soup):
    """Parse every ``<article class="item">`` listing in ``soup`` into a DataFrame.

    Parameters
    ----------
    soup : parsed HTML tree exposing ``find_all`` / ``find`` (e.g. BeautifulSoup).

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``id``, ``title``, ``price``,
        ``price_form``, ``n_rooms``, ``size``, ``floor``, ``description``,
        ``extra_info`` (list of the remaining feature strings), ``image``
        (file name of the first gallery image, or ``None``) and ``img_no``
        (largest numeric counter found in ``<picture>``, or ``0``).
        An empty frame with the same columns is returned when no listing
        is found.
    """
    columns = ['id', 'title', 'price', 'price_form', 'n_rooms', 'size', 'floor',
               'description', 'extra_info', 'image', 'img_no']
    records = []

    for item in soup.find_all("article", {'class':'item'}):
        # A listing may have no gallery / picture counter; fall back to "no image"
        # instead of aborting the whole page.
        try:
            img = item.find("div", {'class':'gallery-boost-first'}).find("img")['src']
            img = img.split('/')[-1]
            counters = [s.get_text(strip=True) for s in item.find("picture").find_all("span")]
            img_no = max(int(c) for c in counters if c.isdigit())
        except (AttributeError, TypeError, KeyError, ValueError):
            img = None
            img_no = 0

        container_info = item.find("div", {'class':'item-info-container--features'})

        heading = container_info.find("a", {"role":"heading"})
        title = heading.get_text(strip=True)
        # The listing id is the second-to-last segment of the heading link.
        listing_id = heading["href"].split('/')[-2]

        # Price text is split at '/': amount before it, payment period after it.
        price_text = container_info.find("span", {'class':'item-price'}).get_text(strip=True)
        price_form = price_text.split('/')[-1]
        price = price_text.split('/')[0]

        container_details = container_info.find("div", {'class':'item-detail-char'})
        n_rooms = size = floor = None
        extra_info = []
        for detail in container_details.find_all("span"):
            text = detail.get_text(strip=True).lower()
            if "hab" in text:
                n_rooms = text.split(' ')[0]
            elif "m²" in text:
                size = text.split(' ')[0]
            elif "planta" in text:
                floor = text.split(' ')[0]
            else:
                extra_info.append(text)

        description = item.find("div", {'class':'item-description'}).get_text(strip=True)
        # Collapse runs of special characters to a single space, keeping
        # alphanumerics and Spanish accented letters.
        description = re.sub('[^A-Za-z0-9áéíóúÁÉÍÓÚüÜñÑ]+', ' ', description)

        # One record per listing: extra_info stays a list inside a single cell,
        # so it can never broadcast the row to zero or several rows.
        records.append({'id':listing_id, 'title':title, 'price':price, 'price_form':price_form,
                        'n_rooms':n_rooms, 'size':size, 'floor':floor, 'description':description,
                        'extra_info':extra_info, 'image':img, 'img_no':img_no})

    # Built once after the loop so every listing is included.
    return pd.DataFrame(records, columns=columns)

In [9]:
def scrap_file(data_folder):
    """Scrape every saved listing page in ``data_folder`` into one DataFrame.

    Each regular file in ``data_folder`` is parsed with ``generate_soup`` and
    every ``<article class="item">`` in it becomes one row.

    Parameters
    ----------
    data_folder : str
        Directory containing the saved HTML pages.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title``, ``n_rooms``, ``size``, ``description``,
        ``extra_info``, ``image``, ``img_no``, ``price``, ``price_form``.
        ``extra_info`` is a space-separated string of the remaining feature
        texts, with ``parking`` appended when the listing has an
        ``item-parking`` span. ``image`` is ``None`` and ``img_no`` is ``0``
        when no image / counter could be read.
    """
    columns = ["id", "title", "n_rooms", "size", "description", "extra_info", "image", "img_no", "price", "price_form"]
    records = []

    # os.listdir order is arbitrary; sort so row order is reproducible.
    for filename in sorted(os.listdir(data_folder)):
        file_path = os.path.join(data_folder, filename)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be opened as pages.
            continue
        soup = generate_soup(file_path)

        for item in soup.find_all("article", {'class':'item'}):

            # Listings without an image (or without a numeric picture counter)
            # are kept, with image=None and img_no=0.
            try:
                img = item.find("div", {'class':'item-gallery'}).find("img")['src']
                img = img.split('/')[-1]
                counters = [s.get_text(strip=True) for s in item.find("picture").find_all("span")]
                img_no = max(int(c) for c in counters if c.isdigit())
            except (AttributeError, TypeError, KeyError, ValueError):
                img = None
                img_no = 0

            container_info = item.find("div", {'class':'item-info-container--features'})

            heading = container_info.find("a", {"role":"heading"})
            title = heading.get_text(strip=True)
            # The listing id is the second-to-last segment of the heading link.
            listing_id = heading["href"].split('/')[-2]

            # Price text is split at '/': amount (euro sign removed) / period.
            price_text = container_info.find("span", {'class':'item-price'}).get_text(strip=True)
            price_form = price_text.split('/')[-1]
            price = price_text.split('/')[0].replace('€', '')

            parking = container_info.find("span", {'class':'item-parking'}) is not None

            container_details = container_info.find("div", {'class':'item-detail-char'})
            n_rooms = size = None
            extras = []
            for detail in container_details.find_all("span"):
                text = detail.get_text(strip=True).lower()
                if "hab" in text:
                    n_rooms = text.split(' ')[0]
                elif "m²" in text:
                    size = text.split(' ')[0]
                elif text:
                    extras.append(text)

            if parking:
                extras.append("parking")
            # Joining the parts avoids a leading space when "parking" is the
            # only extra.
            extra_info = " ".join(extras)

            description = item.find("div", {'class':'item-description'}).get_text(strip=True)
            # Collapse runs of special characters to a single space, keeping
            # alphanumerics and Spanish accented letters.
            description = re.sub('[^A-Za-z0-9áéíóúÁÉÍÓÚüÜñÑ]+', ' ', description)

            records.append([listing_id, title, n_rooms, size, description, extra_info, img, img_no, price, price_form])

    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

### Main code

In [10]:
src = './data_filtered/idealista/'
dest_folder = './data/idealista'
os.makedirs(dest_folder, exist_ok=True)

# os.listdir returns entries in arbitrary order, so dropping "the first one"
# with [1:] may skip a real folder. Instead, walk the entries in sorted order
# and process only those that actually contain an 'html' directory; stray
# files in src are ignored.
for folder in sorted(os.listdir(src)):
    folder_path = os.path.join(src, folder)
    data_folder = os.path.join(folder_path, 'html')
    if not os.path.isdir(data_folder):
        continue
    data = scrap_file(data_folder)
    # Save one CSV per source folder: viviendas_<folder>.csv
    filename = 'viviendas' + '_' + folder + '.csv'
    data.to_csv(os.path.join(dest_folder, filename), index=False)