In [20]:
from bs4 import BeautifulSoup
from bs4 import ResultSet
import numpy as np
import pandas as pd
import requests
import re
import matplotlib.pyplot as plt
from scipy import stats

In [21]:
# A dictionary consisting of links from which the data would be gathered
urls = {
    'ogloszenia_wtórny': 'https://ogloszenia.trojmiasto.pl/nieruchomosci-rynek-wtorny/f1i,1_2_3,ri,1_,wi,100_200_230_250_260_220_240_210,xi,1900_.html',
    'ogloszenia_pierwotny': 'https://ogloszenia.trojmiasto.pl/nieruchomosci-rynek-pierwotny/f1i,1_2_3,ri,1_,wi,100_200_230_250_260_220_240_210,xi,1900_.html'
}

In [22]:
# Taking data from the main page and returning Sets of values for each attribute
def take_data_from_main(response):
  soup = BeautifulSoup(response.content, 'html.parser')
  title_list = soup.find_all(class_ ='list__item__content__title__name link')
  district_list = soup.select('p[class="list__item__content__subtitle"]')
  area_list = soup.select('li[class = "list__item__details__icons__element details--icons--element--powierzchnia"] p[class = "list__item__details__icons__element__desc"]')
  number_of_rooms_list = soup.select('li[class = "list__item__details__icons__element details--icons--element--l_pokoi"] p[class = "list__item__details__icons__element__desc"]')
  year_of_creation_list = soup.select('li[class = "list__item__details__icons__element details--icons--element--rok_budowy"] p[class = "list__item__details__icons__element__desc"]')
  price_list = soup.select('p[class = "list__item__details__info details--info--price"]')
  href_list = soup.select('a[class="list__item__content__title__name link"]')
  return title_list, district_list, area_list, number_of_rooms_list, year_of_creation_list, price_list, href_list

# Diving into the page of an individual advertisement and returning Sets of values for each attribute which was not shown on a main page
def take_data_from_individual(link):
  soup_for_specific_advertisement = BeautifulSoup(requests.get(link).content, 'html.parser')
  type_of_building = soup_for_specific_advertisement.select('div[class="xogField xogField--rodzaj_nieruchomosci"] span[class="xogField__value"]')[0].get_text().strip()
  additional_list = soup_for_specific_advertisement.select('li[class="oglFieldList__item"] b')

  # Adding a try clause because not every offer has an information of the floor (Not given or an offer is a house)
  try:
    floor = soup_for_specific_advertisement.select('a[class="xogField xogField--pietro xogField--withIcon"] span[class="xogField__value xogField__value--big"]')[0].get_text().strip()
  except:
    floor = "nan"
  return type_of_building, additional_list, floor


# Searching through the content of the page to gather necessary data such as:
- title
- area of the estate
- number of rooms
- district and street where estate is placed
- year of creation

In [23]:
#Initializing a list which would save records with advertisements data
data = []

#Searching data for secondary market (rynek wtórny)
url = urls['ogloszenia_wtórny']


"""
Iterating through each page and gathering necessary data such as:
- title
- district
- area
- number of rooms
- year of creation
- price in PLN (for m2)
- link of individual adv (to dive further)

"""
for page in range(0,160):
    print("Page number: " + str(page))
    response = requests.get(url, params={'strona': f"{page}"})
    print(response)
    if response.ok :

        # Using previously defined function for taking data from a main page
        title_list, district_list, area_list, number_of_rooms_list, year_of_creation_list, price_list, href_list = take_data_from_main(response)

        print(len(title_list))

        # Iterating through lists to create separate records
        for element in range(len(title_list)):
            try:
              price = price_list[element].get_text().replace(" ", "").rstrip("zł/m2")
            except:
              continue
            tytul = title_list[element].get_text().strip().replace("|", " ")
            district = district_list[element].get_text().strip()
            powierzchnia = area_list[element].get_text().strip().rstrip(" m2")
            number_of_rooms = number_of_rooms_list[element].get_text().strip()
            year_of_creation = year_of_creation_list[element].get_text().strip()
            price = price_list[element].get_text().replace(" ", "").rstrip("zł/m2")
            href = href_list[element]
            link = href['href']
            type_of_building, additional_list, floor = take_data_from_individual(link)

            #Converting a Set of values into a list of strings
            additional_list_str = []
            find_string = lambda x : 1 if x.lower() in additional_list_str else 0
            for item in range(len(additional_list)):
              additional_list_str.append(additional_list[item].get_text().strip().lower())

            # Using lambda function to check if an asset for a buyer is in the offer
            garage = find_string('Garaż')
            balcony = find_string('Balkon')
            internet = find_string('Internet')
            parking = find_string('Miejsce Parkingowe')
            elevator = find_string('Winda')
            terrace = find_string('Taras')
            basement = find_string('Piwnica')
            garden = find_string('Ogródek')
            #print("Garage: " + str(garage) + " Balcony: " + str(balcony) + " Internet: " + str(internet) + " Parking: " + str(parking) + " Elevator: " + str(elevator) + " Terrace: " + str(terrace) + " Basement: " + str(basement))

            # Creating a record for a specific offer and adding it to a list
            record = f"{tytul}|{district}|{powierzchnia}|{number_of_rooms}|{year_of_creation}|{price}|{type_of_building}|{garage}|{balcony}|{internet}|{parking}|{elevator}|{floor}|{terrace}|{basement}|{garden}|secondary"
            data.append(record)

Page number: 0
<Response [200]>
30
Page number: 1
<Response [200]>
30
Page number: 2
<Response [200]>
30
Page number: 3
<Response [200]>
30
Page number: 4
<Response [200]>
30
Page number: 5
<Response [200]>
30
Page number: 6
<Response [200]>
30
Page number: 7
<Response [200]>
30
Page number: 8
<Response [200]>
30
Page number: 9
<Response [200]>
30
Page number: 10
<Response [200]>
30
Page number: 11
<Response [200]>
30
Page number: 12
<Response [200]>
30
Page number: 13
<Response [200]>
30
Page number: 14
<Response [200]>
30
Page number: 15
<Response [200]>
30
Page number: 16
<Response [200]>
30
Page number: 17
<Response [200]>
30
Page number: 18
<Response [200]>
30
Page number: 19
<Response [200]>
30
Page number: 20
<Response [200]>
30
Page number: 21
<Response [200]>
30
Page number: 22
<Response [200]>
30
Page number: 23
<Response [200]>
30
Page number: 24
<Response [200]>
30
Page number: 25
<Response [200]>
30
Page number: 26
<Response [200]>
30
Page number: 27
<Response [200]>
30
Pa

In [24]:
# Searching data for primary market (rynek pierwotny)
url = urls['ogloszenia_pierwotny']


"""
Iterating through each page and gathering necessary data such as:
- title
- district
- area
- number of rooms
- year of creation
- price in PLN (for m2)
- link of individual adv (to dive further)

"""
for page in range(0,110):
    print("Page number: " + str(page))
    response = requests.get(url, params={'strona': f"{page}"})
    print(response)
    if response.ok :

        # Using previously defined function for taking data from a main page
        title_list, district_list, area_list, number_of_rooms_list, year_of_creation_list, price_list, href_list = take_data_from_main(response)

        print(len(title_list))

        # Iterating through lists to create separate records
        for element in range(len(title_list)):
            try:
              price = price_list[element].get_text().replace(" ", "").rstrip("zł/m2")
            except:
              continue
            tytul = title_list[element].get_text().strip().replace("|", " ")
            powierzchnia = area_list[element].get_text().strip().rstrip(" m2")
            number_of_rooms = number_of_rooms_list[element].get_text().strip()
            year_of_creation = year_of_creation_list[element].get_text().strip()
            href = href_list[element]
            link = href['href']
            type_of_building, additional_list, floor = take_data_from_individual(link)

            #Converting a Set of values into a list of strings
            additional_list_str = []
            find_string = lambda x : 1 if x.lower() in additional_list_str else 0
            for item in range(len(additional_list)):
              additional_list_str.append(additional_list[item].get_text().strip().lower())

            # Using lambda function to check if an asset for a buyer is in the offer
            garage = find_string('Garaż')
            balcony = find_string('Balkon')
            internet = find_string('Internet')
            parking = find_string('Miejsce Parkingowe')
            elevator = find_string('Winda')
            terrace = find_string('Taras')
            basement = find_string('Piwnica')
            garden = find_string('Ogródek')

            # Creating a record for a specific offer and adding it to a list. To separate data from each other the '=' sign is being used
            record = f"{tytul}|{district}|{powierzchnia}|{number_of_rooms}|{year_of_creation}|{price}|{type_of_building}|{garage}|{balcony}|{internet}|{parking}|{elevator}|{floor}|{terrace}|{basement}|{garden}|primary"
            data.append(record)

Page number: 0
<Response [200]>
30
Page number: 1
<Response [200]>
30
Page number: 2
<Response [200]>
30
Page number: 3
<Response [200]>
30
Page number: 4
<Response [200]>
30
Page number: 5
<Response [200]>
30
Page number: 6
<Response [200]>
30
Page number: 7
<Response [200]>
30
Page number: 8
<Response [200]>
30
Page number: 9
<Response [200]>
30
Page number: 10
<Response [200]>
30
Page number: 11
<Response [200]>
30
Page number: 12
<Response [200]>
30
Page number: 13
<Response [200]>
30
Page number: 14
<Response [200]>
30
Page number: 15
<Response [200]>
30
Page number: 16
<Response [200]>
30
Page number: 17
<Response [200]>
30
Page number: 18
<Response [200]>
30
Page number: 19
<Response [200]>
30
Page number: 20
<Response [200]>
30
Page number: 21
<Response [200]>
30
Page number: 22
<Response [200]>
30
Page number: 23
<Response [200]>
30
Page number: 24
<Response [200]>
30
Page number: 25
<Response [200]>
30
Page number: 26
<Response [200]>
30
Page number: 27
<Response [200]>
30
Pa

In [31]:
df = pd.DataFrame(data)

In [32]:
print(df)

                                                      0
0     3pok 50m2 taras 10m2 blisko morza IIQ24 SE|Gda...
1     Wysoki standard   4 pok   Oliwa   kameralne|Gd...
2     Piękne 3 pokoje po remoncie, przy plaży|Gdańsk...
3     Widok na Jar Wilanowski - blok przy samym Park...
4     Energooszczędne Mieszkanie 75m2 - 4 pokoje|Gda...
...                                                 ...
8002  INPRO S.A. - OPTIMA - mieszkanie 3-pok. 56.33 ...
8003  INPRO S.A. - OPTIMA - mieszkanie 3-pok. 61.44 ...
8004  INPRO S.A. - OPTIMA - mieszkanie 2-pok. 47.22 ...
8005  INPRO S.A. - OPTIMA - mieszkanie 2-pok. 46.30 ...
8006  INPRO S.A. - OPTIMA - mieszkanie 3-pok. 65.17 ...

[8007 rows x 1 columns]


In [38]:
columns = ["title", "district", "area", "number_of_rooms", "year_of_creation", 'price_for_m2_in_PLN', 'type_of_building', 'garage', 'balcony', 'internet', 'parking', 'elevator', 'floor', 'terrace', 'basement', 'garden','type_of_market']
correct_df = df[0].str.split('|', expand=True)
correct_df.columns = columns
display(correct_df)
correct_df.to_csv("ogloszenia.csv", sep = '|')

Unnamed: 0,title,district,area,number_of_rooms,year_of_creation,price_for_m2_in_PLN,type_of_building,garage,balcony,internet,parking,elevator,floor,terrace,basement,garden,type_of_market
0,3pok 50m2 taras 10m2 blisko morza IIQ24 SE,Gdańsk Letnica,59,3,2024,12831,Mieszkanie,1,1,0,0,1,1,1,1,0,secondary
1,Wysoki standard 4 pok Oliwa kameralne,"Gdańsk Oliwa, Opacka",99,4,2020,2424,Mieszkanie,0,1,0,1,1,3,0,0,0,secondary
2,"Piękne 3 pokoje po remoncie, przy plaży","Gdańsk Jelitkowo, ul. Tysiąclecia",41.07,3,1970,19455,Mieszkanie,0,0,0,0,0,4,0,0,0,secondary
3,Widok na Jar Wilanowski - blok przy samym Parku,"Gdańsk Chełm, Prof. Romualda Cebertowicza",63.1,3,1989,9984,Mieszkanie,0,1,1,0,0,4,0,1,0,secondary
4,Energooszczędne Mieszkanie 75m2 - 4 pokoje,Gdańsk,75,4,2024,5187,Mieszkanie,0,0,0,0,0,Parter,0,0,0,secondary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8002,INPRO S.A. - OPTIMA - mieszkanie 3-pok. 56.33 m2,"Gdynia Dąbrowa, Serdecznikowa",56.33,3,2024,10368,Mieszkanie,1,1,1,1,0,2,0,1,0,primary
8003,INPRO S.A. - OPTIMA - mieszkanie 3-pok. 61.44 m2,"Gdynia Dąbrowa, Serdecznikowa",61.44,3,2024,10368,Mieszkanie,1,1,1,1,0,3,0,1,0,primary
8004,INPRO S.A. - OPTIMA - mieszkanie 2-pok. 47.22 ...,"Gdynia Dąbrowa, Serdecznikowa",47.,2,2025,10584,Mieszkanie,1,1,1,1,0,1,0,1,0,primary
8005,INPRO S.A. - OPTIMA - mieszkanie 2-pok. 46.30 ...,"Gdynia Dąbrowa, Serdecznikowa",46.3,2,2025,10584,Mieszkanie,1,1,1,1,0,1,0,1,0,primary


In [39]:
correct_df['area'].replace('', 0.0, inplace = True)
correct_df['price_for_m2_in_PLN'].replace('', 0.0, inplace = True)
correct_df[['area', 'price_for_m2_in_PLN']] = correct_df[['area', 'price_for_m2_in_PLN']].astype(float)
correct_df[['number_of_rooms', 'year_of_creation']] = correct_df[['number_of_rooms', 'year_of_creation']].astype(int)
correct_df[['garage', 'balcony', 'internet', 'parking', 'elevator', 'terrace', 'basement', 'garden']] = correct_df[['garage', 'balcony', 'internet', 'parking', 'elevator', 'terrace', 'basement', 'garden']].astype(int)
correct_df[['garage', 'balcony', 'internet', 'parking', 'elevator', 'terrace', 'basement', 'garden']] = correct_df[['garage', 'balcony', 'internet', 'parking', 'elevator', 'terrace', 'basement', 'garden']].astype(bool)
correct_df['floor'].replace('nan', np.nan, inplace = True)
correct_df['floor'].replace("Parter", "0", inplace = True)
correct_df['area'].replace(0.0, None ,inplace = True)
correct_df['price_for_m2_in_PLN'].replace(0.0, None ,inplace = True)
print(correct_df.dtypes)

title                  object
district               object
area                   object
number_of_rooms         int32
year_of_creation        int32
price_for_m2_in_PLN    object
type_of_building       object
garage                   bool
balcony                  bool
internet                 bool
parking                  bool
elevator                 bool
floor                  object
terrace                  bool
basement                 bool
garden                   bool
type_of_market         object
dtype: object


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  correct_df['area'].replace('', 0.0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  correct_df['price_for_m2_in_PLN'].replace('', 0.0, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on wh

In [35]:
np.where(pd.isnull(correct_df['price_for_m2_in_PLN']))

(array([2854, 4584], dtype=int64),)

In [36]:
print(df.loc[[662]])

                                                     0
662  2 pokoje   słoneczne mieszkanie   GARAŻ|Gdańsk...


In [41]:
data_from_file = pd.read_csv("ogloszenia.csv", sep='|')
data_from_file.columns.values[0] = 'ID'
display(data_from_file)

Unnamed: 0,ID,title,district,area,number_of_rooms,year_of_creation,price_for_m2_in_PLN,type_of_building,garage,balcony,internet,parking,elevator,floor,terrace,basement,garden,type_of_market
0,0,3pok 50m2 taras 10m2 blisko morza IIQ24 SE,Gdańsk Letnica,59.00,3,2024,12831.0,Mieszkanie,1,1,0,0,1,1,1,1,0,secondary
1,1,Wysoki standard 4 pok Oliwa kameralne,"Gdańsk Oliwa, Opacka",99.00,4,2020,2424.0,Mieszkanie,0,1,0,1,1,3,0,0,0,secondary
2,2,"Piękne 3 pokoje po remoncie, przy plaży","Gdańsk Jelitkowo, ul. Tysiąclecia",41.07,3,1970,19455.0,Mieszkanie,0,0,0,0,0,4,0,0,0,secondary
3,3,Widok na Jar Wilanowski - blok przy samym Parku,"Gdańsk Chełm, Prof. Romualda Cebertowicza",63.10,3,1989,9984.0,Mieszkanie,0,1,1,0,0,4,0,1,0,secondary
4,4,Energooszczędne Mieszkanie 75m2 - 4 pokoje,Gdańsk,75.00,4,2024,5187.0,Mieszkanie,0,0,0,0,0,Parter,0,0,0,secondary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8002,8002,INPRO S.A. - OPTIMA - mieszkanie 3-pok. 56.33 m2,"Gdynia Dąbrowa, Serdecznikowa",56.33,3,2024,10368.0,Mieszkanie,1,1,1,1,0,2,0,1,0,primary
8003,8003,INPRO S.A. - OPTIMA - mieszkanie 3-pok. 61.44 m2,"Gdynia Dąbrowa, Serdecznikowa",61.44,3,2024,10368.0,Mieszkanie,1,1,1,1,0,3,0,1,0,primary
8004,8004,INPRO S.A. - OPTIMA - mieszkanie 2-pok. 47.22 ...,"Gdynia Dąbrowa, Serdecznikowa",47.00,2,2025,10584.0,Mieszkanie,1,1,1,1,0,1,0,1,0,primary
8005,8005,INPRO S.A. - OPTIMA - mieszkanie 2-pok. 46.30 ...,"Gdynia Dąbrowa, Serdecznikowa",46.30,2,2025,10584.0,Mieszkanie,1,1,1,1,0,1,0,1,0,primary


In [45]:
data_from_file['type_of_building'].unique()

array(['Mieszkanie', 'Dom bliźniak', 'Dom wolnostojący', 'Dom kamienica',
       'Dom szeregowy', 'Dom rekreacyjny', 'Dom piętro domu'],
      dtype=object)