# Assignment 2

In [1]:
# Import necessary packages
import tarfile
import os
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import locale

import re 
from pathlib import Path
from tqdm import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Extract HTML files from tar file
file_path = "kungalv_slutpriser.tar.gz"

# Pick the tarfile read mode from the file suffix; any other suffix is
# skipped without raising an error.
if file_path.endswith("tar.gz"):
    tar_mode = "r:gz"
elif file_path.endswith("tar"):
    tar_mode = "r:"
else:
    tar_mode = None

if tar_mode is not None:
    # The context manager closes the archive even when extraction raises.
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # for archives from an untrusted source consider an extraction filter
    # (tarfile `filter="data"`, available on recent Python versions).
    with tarfile.open(file_path, tar_mode) as tar:
        tar.extractall()

In [3]:
# Helper Functions
# Convert a Swedish date string to an ISO "YYYY-MM-DD" string
_SWEDISH_MONTHS = {
    'januari': 1, 'februari': 2, 'mars': 3, 'april': 4,
    'maj': 5, 'juni': 6, 'juli': 7, 'augusti': 8,
    'september': 9, 'oktober': 10, 'november': 11, 'december': 12,
}


def clean_date(date):
    """Convert a date such as "23 november 2017" to "2017-11-23".

    The month name is looked up in an explicit Swedish month table, so the
    function neither needs the ``sv_SE`` locale to be installed nor changes
    the process-wide locale as a side effect.

    Parameters
    ----------
    date : str
        Day, full Swedish month name (case-insensitive) and year, separated
        by whitespace.

    Returns
    -------
    str
        The date formatted as ``YYYY-MM-DD``.

    Raises
    ------
    ValueError
        If the string does not have three parts, the month name is unknown,
        or day/year are not valid numbers for a calendar date.
    """
    parts = date.split()
    if len(parts) != 3:
        raise ValueError(f"expected '<day> <month> <year>', got {date!r}")

    day, month_name, year = parts
    month = _SWEDISH_MONTHS.get(month_name.lower())
    if month is None:
        raise ValueError(f"unknown Swedish month name {month_name!r} in {date!r}")

    # int() and datetime() raise ValueError themselves for bad day/year values.
    return datetime(int(year), month, int(day)).date().isoformat()

In [4]:
# Listify all html files from a given root
def get_all_html_files(root_path):
    """Return every ``*.html`` file below ``root_path`` as a sorted list.

    The directory tree is searched recursively. The result is a real list
    (not a one-shot generator) and is sorted by path, so repeated runs visit
    the files in the same order regardless of filesystem ordering.

    Parameters
    ----------
    root_path : str or pathlib.Path
        Directory to search.

    Returns
    -------
    list of pathlib.Path
        Matching paths in ascending path order.
    """
    return sorted(Path(root_path).glob("**/*.html"))

# Set root folder path
html_root = Path("./kungalv_slutpriser/")


def _digits_to_int(text):
    """Concatenate every standalone digit group in ``text`` into one int.

    A value split into groups, such as "3 100 000 kr", becomes 3100000.
    Raises ValueError when ``text`` contains no standalone digit group.
    """
    return int("".join(re.findall(r'\b\d+\b', text)))


def _parse_living_area_and_rooms(area_text):
    """Return ``(livingarea, rooms)`` parsed from a listing's area text.

    Three numbers are read as two areas (summed into the living area) plus
    the room count; two numbers are read as living area plus room count.
    Any other count yields ``(None, None)`` so that a listing never inherits
    the values parsed for the previous listing.

    NOTE(review): numbers are split on every non-digit, so a decimal value
    such as "87,5" would count as two numbers - confirm against the data.
    """
    numbers = [int(token) for token in re.findall(r'\d+', area_text)]
    if len(numbers) == 3:
        boarea, biarea, rooms = numbers
        return boarea + biarea, rooms
    if len(numbers) == 2:
        livingarea, rooms = numbers
        return livingarea, rooms
    return None, None


def _parse_listing(listing):
    """Extract one row of attributes from a single listing element.

    Returns ``[date_of_sale, address, location, livingarea, rooms, plotarea,
    closingprice]``. ``plotarea`` is None when the listing has no land-area
    element; ``livingarea`` and ``rooms`` are None when the area text does
    not contain two or three numbers.
    """
    # Living area and number of rooms
    area_text = listing.find('div', {'class' : 'sold-property-listing__subheading sold-property-listing__area'}).text
    livingarea, rooms = _parse_living_area_and_rooms(area_text)

    # Sale date: drop whatever precedes the first digit, then normalise
    sold_label = listing.find('span', {'class' : 'hcl-label hcl-label--state hcl-label--sold-at'}).text.strip()
    first_digit = re.search(r'\d', sold_label)
    date_of_sale = clean_date(sold_label[first_digit.start():])

    # Address
    address = listing.find('h2', {'class' : 'sold-property-listing__heading qa-selling-price-title hcl-card__title'}).text.strip()

    # Location: keep the text after the "VillaVilla" marker and collapse whitespace
    # NOTE(review): raises IndexError if the marker is absent from the text.
    location = listing.find('div', {'class' : 'sold-property-listing__location'}).text
    location = re.sub(r'\s+', ' ', location.split("VillaVilla")[1].strip())

    # Plot area is optional
    plot_div = listing.find('div', {'class' : 'sold-property-listing__land-area'})
    plotarea = None if plot_div is None else _digits_to_int(plot_div.text.strip())

    # Closing price
    closingprice = _digits_to_int(listing.find('span',{'class' : 'hcl-text hcl-text--medium'}).text.strip())

    return [date_of_sale, address, location, livingarea, rooms, plotarea, closingprice]


# Empty global list to store data
listing_elements = []

# Looping through each HTML file and scraping to create a database of listings
for html_file in tqdm(list(get_all_html_files(html_root)), desc='Pages Done'):
    # Open in binary mode so the parser determines the document encoding
    # itself instead of relying on the platform's default text encoding.
    with open(html_file, 'rb') as fp:
        soup = BeautifulSoup(fp, 'html.parser')

    # Each listing is already a parsed element, so it is searched directly
    # rather than being serialised and parsed a second time.
    for listing in soup.find_all('li', class_ = 'sold-results__normal-hit'):
        listing_elements.append(_parse_listing(listing))

# Creating dataframe with scraped data
df = pd.DataFrame(
    listing_elements,
    columns=['date_of_sale', 'address', 'location', 'livingarea', 'rooms', 'plotarea', 'closingprice'],
)
df


Pages Done: 100%|██████████| 40/40 [00:07<00:00,  5.69it/s]


Unnamed: 0,date_of_sale,address,location,livingarea,rooms,plotarea,closingprice
0,2017-11-23,Sjöhåla 580,"Kovikshamn, Kungälvs kommun",181,5,1068.0,3100000
1,2017-11-18,Galeasgatan 15,"Kungälv, Kungälvs kommun",167,5,610.0,3850000
2,2017-11-17,Västerhöjdsvägen 36,"Kärna, Kungälvs kommun",107,5,258.0,4000000
3,2017-11-16,Gråstensvägen 19,"Kode Halltorp, Kungälvs kommun",94,5,1197.0,3200000
4,2017-11-16,Hägnan 135,"KAREBY, Kungälvs kommun",235,6,104335.0,8800000
...,...,...,...,...,...,...,...
1968,2016-04-14,Olvonvägen 71,"Kungälv - Romelanda, Kungälvs kommun",230,6,2056.0,4725000
1969,2016-04-14,Rättarevägen 16,"Ytterby, Kungälvs kommun",227,6,732.0,4300000
1970,2016-04-12,Ametistvägen 11,"Kode, Kungälvs kommun",107,4,806.0,2850000
1971,2016-04-09,Rådalsvägen 25 Havsutsikt,"Aröd, Kungälvs kommun",88,3,1462.0,2995000


In [5]:
# Write the scraped listings to disk as CSV.
# The row index is written as the first (unnamed) column, which is the
# pandas default and is spelled out here for clarity.
df.to_csv(path_or_buf="problem1.csv", index=True)