# Computing Laboratory
# Nandan Rao 
___

# Scraping Information on the Real Estate Market 
## Submitted by Hannah Busshoff 
## 19.11.2018
___

I scrape approximately 800 pages on
__immobilienscout24.de__ to get information on apartments in Cologne. In particular, I gather information on:
* Rent 
* Number of Rooms
* Size 
* Location
* Pets 
* Bathroom 
* Description of the Apartment 

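A value of -1 in a numeric column (e.g. Bathroom) indicates that the information is missing; text columns such as Pets or Flat Type use the placeholder `Missing` instead.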

In [87]:
#Loading required packages
from bs4 import BeautifulSoup
import requests
import queue
import threading 
import numpy as np
import pandas as pd

In [88]:
def getsoup(url, timeout=30):
    """Download ``url`` and return its HTML parsed as a BeautifulSoup object.

    Args:
        url: Address of the page to download.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled connection would block the scrape forever.

    Returns:
        The response body parsed with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [89]:
def getnextpage(soup):
    """Return the absolute URL of the result page that follows ``soup``.

    The relative link is read from the first anchor inside the pagination
    container and prefixed with the site's base address. An
    ``AttributeError`` is raised when the container or its anchor is not
    present on the page.
    """
    pager = soup.find(
        "div",
        {"class": "grid-item five-twelfths grid-item-fixed-width align-right"},
    )
    relative_link = pager.a.get("href")
    return "https://www.immobilienscout24.de" + relative_link

In [116]:
def getApartmentLinks(soup):
    """Return the absolute URLs of all apartments listed on one result page.

    Every entry container inside the ``resultListItems`` element contributes
    the ``href`` of its first anchor, prefixed with the site's base address
    and suffixed with ``#/``.
    """
    base_url = "https://www.immobilienscout24.de"
    result_list = soup.find(id="resultListItems")
    entries = result_list.find_all(
        "div", {"class": "grid-item result-list-entry__data-container"}
    )
    return [base_url + entry.a.get("href") + "#/" for entry in entries]

In [117]:
def getPageContent(objects, appartment_info):
    """Scrape every apartment URL of one result page with a pool of threads.

    Each URL in ``objects`` is downloaded with ``getsoup`` and parsed with
    ``getInfo``; the resulting row is appended to ``appartment_info`` in
    place. Nothing is returned. Several threads append concurrently, so the
    rows do not necessarily arrive in the order of ``objects``.

    A URL whose download or parsing raises is logged and skipped, so one
    bad page can neither kill a worker nor leave ``q.join()`` waiting forever.

    Args:
        objects: Iterable of apartment page URLs.
        appartment_info: List that receives one row (a list) per apartment.
    """
    import logging  # local import: only needed to report failed URLs

    worker_count = 20

    q = queue.Queue()
    for object_url in objects:
        q.put(object_url)

    def worker():
        """Process queued URLs until the ``None`` sentinel is received."""
        while True:
            item = q.get()
            if item is None:
                break
            try:
                soup = getsoup(item)
                appartment_info.append(getInfo(soup, [], [], [], [], [], [], []))
            except Exception:
                # Thread boundary: report the failure and keep the worker alive.
                logging.exception("Skipping apartment page %s", item)
            finally:
                # Always mark the item as handled, otherwise q.join() blocks.
                q.task_done()

    threads = []
    for _ in range(worker_count):
        t = threading.Thread(target=worker)
        t.start()
        threads.append(t)

    # Block until every queued URL has been processed.
    q.join()

    # One sentinel per worker makes each of them leave its loop.
    for _ in range(worker_count):
        q.put(None)
    for t in threads:
        t.join()

In [118]:
def getInfo(soup, rent_euros, rooms_number, sqrmeters, location, pets, bathrooms_number, flat_type):
    """Extract the fields of interest from one apartment page.

    For every field the matching element is looked up once in ``soup``; its
    text (or the placeholder ``"Missing"`` when the element is absent) is
    appended to the corresponding list argument, which is modified in place.

    Args:
        soup: Parsed apartment page offering ``find(tag, attrs)``.
        rent_euros, rooms_number, sqrmeters, location, pets,
        bathrooms_number, flat_type: Lists receiving the value of the
            respective field.

    Returns:
        A new list: the concatenation of the seven lists in parameter order.
        When all of them are passed in empty, this is one seven-element row.
    """
    # (tag, class attribute, destination list); the order matches the
    # parameter order and therefore the column order of the returned row.
    fields = (
        ("div", "is24qa-kaltmiete is24-value font-semibold", rent_euros),
        ("div", "is24qa-zi is24-value font-semibold", rooms_number),
        ("div", "is24qa-flaeche is24-value font-semibold", sqrmeters),
        ("span", "zip-region-and-country", location),
        ("dd", "is24qa-haustiere grid-item three-fifths", pets),
        ("dd", "is24qa-badezimmer grid-item three-fifths", bathrooms_number),
        ("dd", "is24qa-typ grid-item three-fifths", flat_type),
    )
    for tag, css_class, values in fields:
        node = soup.find(tag, {"class": css_class})
        values.append("Missing" if node is None else node.get_text())

    apartment_info = (
        rent_euros + rooms_number + sqrmeters + location
        + pets + bathrooms_number + flat_type
    )
    return apartment_info

In [119]:
# Start from the first result page with an empty list of rows.
url = "https://www.immobilienscout24.de/Suche/S-T/Wohnung-Miete/Nordrhein-Westfalen/Koeln?enteredFrom=one_step_search"
apartment_info = []

# Walk through the result pages: scrape every apartment of the current page,
# then follow the link to the next page until there is none.
while True:
    soup = getsoup(url)
    objects = getApartmentLinks(soup)
    getPageContent(objects, apartment_info)
    try:
        # Only this lookup signals the last page (no next link raises
        # AttributeError); errors raised while scraping must not be
        # mistaken for the end of the result list.
        url = getnextpage(soup)
    except AttributeError:
        break

In [120]:
# One row per apartment; the column order is the order of the values that
# getInfo returns.
cologne_rents = pd.DataFrame(
    apartment_info,
    columns=["Rent", "Rooms", "Size", "Location", "Pets", "Bathroom", "Flat Type"],
)

# The scraped numbers use the German format: "." groups thousands and "," is
# the decimal mark. The literal replacements pass regex=False so that "." is
# never read as a regular-expression wildcard. In every numeric column the
# placeholder "Missing" becomes -1 before the conversion to float.
cologne_rents["Rent"] = (
    cologne_rents["Rent"]
    .str.replace(".", "", regex=False)
    .str.replace(",", ".", regex=False)
    .str.replace("€", "", regex=False)
    .replace("Missing", "-1")
    .astype(float)
)
cologne_rents["Rooms"] = (
    cologne_rents["Rooms"]
    .str.replace(",", ".", regex=False)
    .replace("Missing", "-1")
    .astype(float)
)
cologne_rents["Bathroom"] = (
    cologne_rents["Bathroom"]
    .str.replace(",", ".", regex=False)
    .replace("Missing", "-1")
    .astype(float)
)
cologne_rents["Size"] = (
    cologne_rents["Size"]
    .str.replace(".", "", regex=False)
    .str.replace(",", ".", regex=False)
    # Regular expression: "m" plus the following character (the unit suffix).
    .str.replace("m.", "", regex=True)
    .replace("Missing", "-1")
    .astype(float)
)

In [121]:
# Display the cleaned table as the output of this notebook cell.
cologne_rents

Unnamed: 0,Rent,Rooms,Size,Location,Pets,Bathroom,Flat Type
0,1550.00,3.0,115.00,"50933 Köln, Müngersdorf",Missing,2.0,Etagenwohnung
1,384.56,2.0,61.53,"50765 Köln, Chorweiler",Nach Vereinbarung,1.0,Etagenwohnung
2,1595.00,5.0,145.00,51067 Köln,Missing,2.0,Maisonette
3,1690.00,3.0,122.00,"50933 Köln, Müngersdorf",Missing,-1.0,Etagenwohnung
4,750.00,2.0,60.00,"50667 Köln, Altstadt & Neustadt-Nord",Nein,1.0,Missing
5,1357.00,3.5,120.06,"51149 Köln, Porz",Missing,-1.0,Etagenwohnung
6,1029.00,4.0,97.20,"50858 Köln, Weiden",Nach Vereinbarung,1.0,Etagenwohnung
7,313.00,2.0,41.70,51147 Köln,Missing,1.0,Sonstige
8,1995.00,3.0,155.00,"50997 Köln, Meschenich",Ja,2.0,Maisonette
9,1830.00,2.0,95.00,"50670 Köln, Altstadt & Neustadt-Nord",Missing,1.0,Etagenwohnung


In [122]:
# Write the cleaned table as comma-separated text to a file named
# "Cologne Rents" (no file extension) relative to the working directory.
cologne_rents.to_csv("Cologne Rents", sep = ",")