In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Read the CSV of listing links and keep the 'house link' column as a plain
# Python list (the printed sample shows site-relative paths).
links_data = pd.read_csv('houses_link.csv')
house_links = links_data['house link']
list_urls = house_links.tolist()
print(list_urls[:2])

['/bari/bello/proyecto-nuevo-det-2713018.aspx', '/apartamento-en-venta/bello/paris-det-4629208.aspx']


In [3]:
main_url = 'https://www.fincaraiz.com.co'
# Prefix only the entries that are still site-relative. The list is rebuilt
# from itself, so without this guard re-running the cell would prepend the
# host a second time and corrupt every URL.
list_urls = [
    url if str(url).startswith(main_url) else f'{main_url}{url}'
    for url in list_urls
]
list_urls[:2]

['https://www.fincaraiz.com.co/bari/bello/proyecto-nuevo-det-2713018.aspx',
 'https://www.fincaraiz.com.co/apartamento-en-venta/bello/paris-det-4629208.aspx']

In [19]:
def get_title_and_price(soup):
    """Extract the listing title and price text from a parsed detail page.

    Args:
        soup: Parsed page (a BeautifulSoup object, or any object exposing a
            compatible ``find`` method).

    Returns:
        A ``(title_text, price_text)`` tuple taken from the ``<h1>`` inside
        ``div.detailheader > div.title > div.box`` and the ``<h2>`` inside
        ``div.detailheader > div.price``, or ``None`` when any of those
        elements is missing.
    """
    try:
        detail = soup.find('div', attrs={'class': 'detailheader'})

        # attrs is an attribute -> value mapping. These lookups previously
        # passed set literals such as {'class', 'title'}, unlike the
        # 'detailheader' lookup above.
        title = detail.find('div', attrs={'class': 'title'})
        title_box = title.find('div', attrs={'class': 'box'})
        title_text = title_box.find('h1').text

        price = detail.find('div', attrs={'class': 'price'})
        price_text = price.find('h2').text
    except AttributeError:
        # A find() along the chain returned None: the page does not have the
        # expected layout. Any other exception is a real bug and is not hidden.
        return None

    return title_text, price_text

In [18]:
def get_house_features(soup):
    """Extract surface, rooms, baths and garages from the ``features`` block.

    Args:
        soup: Parsed page (a BeautifulSoup object, or any object exposing a
            compatible ``find`` method).

    Returns:
        Whatever ``clean_features_text`` returns for the four stripped span
        texts, or ``None`` when the ``div.features`` block or one of its
        ``advert*`` spans is missing.
    """
    span_classes = ('advertSurface', 'advertRooms', 'advertBaths', 'advertGarages')
    try:
        features = soup.find('div', attrs={'class': 'features'})
        surface, rooms, baths, garages = (
            features.find('span', attrs={'class': css_class}).text.strip()
            for css_class in span_classes
        )
    except AttributeError:
        # A find() returned None: the page lacks the expected markup.
        return None

    # Called outside the try so that a genuine error (for example the helper
    # cell not having been run yet) surfaces instead of looking like a
    # missing page section.
    return clean_features_text(surface, rooms, baths, garages)

In [20]:
def get_house_features_2(soup):
    """Extract the labelled key/value features from the ``features_2`` block.

    Each ``<li>`` is stored under the text of its ``<b>`` label, with the full
    ``<li>`` text as the value, and the mapping is handed to
    ``clean_features_2``.

    Args:
        soup: Parsed page (a BeautifulSoup object, or any object exposing
            compatible ``find`` / ``find_all`` methods).

    Returns:
        Whatever ``clean_features_2`` returns, or ``None`` when the block or a
        label is missing, or a value is malformed.
    """
    try:
        container = soup.find('div', attrs={'class': 'features_2'})
        # The last <li> is deliberately excluded, as before; presumably it is
        # not a label/value pair -- TODO confirm against the page markup.
        items = container.find_all('li')[:-1]
        raw_features = {item.find('b').text: item.text for item in items}
        return clean_features_2(raw_features)
    except (AttributeError, IndexError):
        # AttributeError: an expected tag was not found (find() gave None).
        # IndexError: a value had nothing to index during cleaning.
        return None

In [21]:
def clean_features_text(surface, rooms, baths, garages):
    """Reduce the rooms/baths/garages texts to their trailing count.

    Taking only the last character (the previous behaviour) truncated
    multi-digit counts ('... 12' became '2') and turned an unspecified rooms
    or baths value into a stray letter. The whole trailing run of digits is
    now extracted instead.

    Args:
        surface: Surface text; returned unchanged.
        rooms: Rooms text expected to end with a number.
        baths: Baths text expected to end with a number.
        garages: Garages text expected to end with a number, or a
            placeholder such as 'Sin especificar'.

    Returns:
        ``(surface, rooms, baths, garages)`` where each of the last three is
        the trailing integer as a string, or ``'nan'`` when the text does not
        end with a number.
    """
    def trailing_count(text):
        # Digits at the very end of the text, ignoring trailing whitespace.
        match = re.search(r'(\d+)\s*$', text)
        return match.group(1) if match else 'nan'

    return surface, trailing_count(rooms), trailing_count(baths), trailing_count(garages)

In [22]:
def clean_features_2(features):
    """Pick the six known labelled features and strip them down to values.

    For every label present with a non-empty text, whitespace is removed via
    ``delete_dirty_chars`` and the part after the first ':' is kept. Estrato
    is further cut to its first character and Admón to its first eight.
    Labels that are absent or empty yield the string 'nan'.

    Args:
        features: Mapping from label text (e.g. 'Estrato:') to the raw text
            of the corresponding list item.

    Returns:
        Tuple ``(private_area, builded_area, estrato, antiquity, price_by_m,
        administration)`` of strings.
    """
    missing = 'nan'

    def field(label, trim=None):
        raw_text = features.get(label)
        if not raw_text:
            return missing
        value = delete_dirty_chars(raw_text).split(':')[1]
        return value if trim is None else trim(value)

    return (
        field('Área privada:'),
        field('Área Const.:'),
        field('Estrato:', trim=lambda value: value[0]),
        field('Antigüedad:'),
        field('Precio m²:'),
        field('Admón:', trim=lambda value: value[0:8]),
    )

In [23]:
def delete_dirty_chars(string):
    """Return ``string`` with every space, newline and carriage return removed."""
    # Mapping a code point to None makes str.translate drop that character.
    removal_table = {ord(char): None for char in ' \n\r'}
    return string.translate(removal_table)

In [24]:
# Seconds to wait for a response; without a timeout requests.get can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30
separator = '---------------------------------------------------------'

# One tuple per successfully scraped listing: title/price, then the basic
# features, then the labelled features.
all_data = []
for index, url in enumerate(list_urls):
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        if r.status_code == 404:
            print(f'{index}, {url} skipped: 404')
            continue
        soup = BeautifulSoup(r.text, 'html.parser')

        headers = get_title_and_price(soup)
        features = get_house_features(soup)
        features_2 = get_house_features_2(soup)

        # The extractors return None when a page section is missing; check
        # that explicitly rather than relying on tuple concatenation failing.
        if headers is None or features is None or features_2 is None:
            print(f'{index}, {url} skipped: page is missing an expected section')
            continue

        house = headers + features + features_2
    except Exception as e:
        # Best-effort scrape: one bad listing must not stop the run, but the
        # reason is reported instead of being silently discarded.
        print(f'{index}, {url} skipped: {e!r}')
        continue

    all_data.append(house)

    print(separator)
    print(f'{index}, {url}')
    print(separator)
    print(house)
    print(separator)

---------------------------------------------------------
0, https://www.fincaraiz.com.co/bari/bello/proyecto-nuevo-det-2713018.aspx
---------------------------------------------------------
('BARI', 'Desde $ 179.808.000 Hasta $ 189.108.000', '64,11 m²', '3', '2', 'nan', '60,90m²', '64,11m²', '3', 'nan', 'nan', 'nan')
---------------------------------------------------------
---------------------------------------------------------
1, https://www.fincaraiz.com.co/apartamento-en-venta/bello/paris-det-4629208.aspx
---------------------------------------------------------
('Apartamento en Venta Bello paris', '$ 83.000.000', '39,00 m²', '2', '1', 'nan', 'nan', '39,00m²', '2', 'Menosde1año', '2.128.205/m²', 'nan')
---------------------------------------------------------
---------------------------------------------------------
2, https://www.fincaraiz.com.co/apartamento-en-venta/bello/cabanas-det-4688002.aspx
---------------------------------------------------------
('Apartamento en Venta 

In [25]:
# Column order mirrors the tuple layout built by the scraping loop:
# title/price, then the basic features, then the labelled features.
columns = [
    'title', 'price',
    'surface', 'rooms', 'baths', 'garages',
    'private area', 'builded area', 'estrato', 'antiquity', 'price m2', 'administration',
]
data_frame = pd.DataFrame(data=all_data, columns=columns)

In [26]:
# Write the scraped table to disk as UTF-8; index=False leaves the DataFrame
# index out of the file.
data_frame.to_csv('bello_houses.csv', index=False, encoding='utf-8')