Combining and adapting Krzysztof Jamróz's dataset - I removed his files with rent prices beforehand

In [5]:
import geopandas as gpd
import pandas as pd
from tkinter import filedialog
from pathlib import Path

Automatically concatenating all files from the given folder

In [84]:
# Build the folder path with pathlib joins so it resolves on every OS
# (a raw "data\original_files" string is only a nested path on Windows).
dir_path=Path("data")/"original_files"

# glob() yields entries in arbitrary filesystem order; sort by name so the
# monthly files (apartments_pl_YYYY_MM.csv) are concatenated chronologically.
# The later de-duplication with keep="last" relies on this ordering.
csv_files=sorted(dir_path.glob("*.csv"))
print(f"found files to process: {len(csv_files)}")

# Read every CSV into its own frame; a file that fails to parse is reported
# and skipped so the remaining files are still processed.
df_list=[]
for i, filepath in enumerate(csv_files, 1):
    print(f"loading file {i}/{len(csv_files)}: {filepath.name} ...")
    try:
        df=pd.read_csv(filepath, index_col=None, header=0)
        df_list.append(df)
    except Exception as e:
        print(f"error loading {filepath.name}: {e}")

# Stack all successfully loaded frames into one table with a fresh 0..n-1 index.
# NOTE: df_out is only defined when at least one file was loaded.
if df_list:
    print("concatenating files")
    df_out=pd.concat(df_list, ignore_index=True)
    print(f"final dataframe shape: {df_out.shape}")
else:
    print("couldn't load files")

found files to process: 11
loading file 1/11: apartments_pl_2023_08.csv ...
loading file 2/11: apartments_pl_2023_09.csv ...
loading file 3/11: apartments_pl_2023_10.csv ...
loading file 4/11: apartments_pl_2023_11.csv ...
loading file 5/11: apartments_pl_2023_12.csv ...
loading file 6/11: apartments_pl_2024_01.csv ...
loading file 7/11: apartments_pl_2024_02.csv ...
loading file 8/11: apartments_pl_2024_03.csv ...
loading file 9/11: apartments_pl_2024_04.csv ...
loading file 10/11: apartments_pl_2024_05.csv ...
loading file 11/11: apartments_pl_2024_06.csv ...
concatenating files
final dataframe shape: (195568, 28)


In [85]:
# Keep only the offers located in Kraków.
krakow=df_out.loc[df_out["city"]=="krakow"]
# DataFrame.info() prints its summary and returns None, so it is called on its
# own instead of being passed to display() (which would render a stray "None").
krakow.info()
display(krakow.head())

<class 'pandas.core.frame.DataFrame'>
Index: 29026 entries, 1382 to 178786
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    29026 non-null  object 
 1   city                  29026 non-null  object 
 2   type                  24086 non-null  object 
 3   squareMeters          29026 non-null  float64
 4   rooms                 29026 non-null  float64
 5   floor                 22880 non-null  float64
 6   floorCount            28490 non-null  float64
 7   buildYear             24957 non-null  float64
 8   latitude              29026 non-null  float64
 9   longitude             29026 non-null  float64
 10  centreDistance        29026 non-null  float64
 11  poiCount              29026 non-null  float64
 12  schoolDistance        29026 non-null  float64
 13  clinicDistance        28979 non-null  float64
 14  postOfficeDistance    29021 non-null  float64
 15  kindergartenDistance

None

Unnamed: 0,id,city,type,squareMeters,rooms,floor,floorCount,buildYear,latitude,longitude,...,pharmacyDistance,ownership,buildingMaterial,condition,hasParkingSpace,hasBalcony,hasElevator,hasSecurity,hasStorageRoom,price
1382,78c79140abe14c5868d2c1b1e420ecaf,krakow,blockOfFlats,46.0,2.0,,6.0,2017.0,50.010804,19.889806,...,0.246,condominium,,,no,yes,yes,no,yes,695000
1383,9acc41ae517183347f0ee3099463ca1e,krakow,apartmentBuilding,42.0,2.0,2.0,3.0,2017.0,50.013081,19.905746,...,0.586,condominium,brick,,no,yes,yes,yes,no,600000
1384,b50988e796828441e14c2f64f611ec08,krakow,tenement,69.0,3.0,,2.0,,50.050279,19.929073,...,0.219,condominium,brick,,no,yes,no,yes,no,989000
1385,9a0ff6a57d195d235a1e048a20865e1e,krakow,blockOfFlats,73.0,3.0,3.0,4.0,,50.006,20.0086,...,0.223,condominium,,,no,yes,no,no,yes,599000
1386,54015aec90da2f0db72c46e8811fa4ae,krakow,tenement,53.9,2.0,1.0,2.0,,50.0709,19.9362,...,0.111,condominium,brick,,yes,yes,no,no,no,1228900


Krzysztof's dataset combines apartment offers over a timespan of 10 months (11 monthly files, August 2023 – June 2024), so I expect some stagnant offers without any buyers to reappear month after month, which could distort my results

In [86]:
# Flag every offer whose id has already appeared in an earlier row, then
# tally how many rows are repeats (True) versus first occurrences (False).
repeated_id_mask=krakow["id"].duplicated()
repeated_id_mask.value_counts()

id
True     14840
False    14186
Name: count, dtype: int64

As expected, around half of the offers in the dataset are just duplicates from previous months. I'll keep only the most recent ones. Since the files were loaded in alphabetical (and therefore chronological) order, I can just use keep="last"

In [87]:
# For each id keep only its final row: mark every row that is followed by a
# later row with the same id, and select the rows that are not marked.
has_later_copy=krakow.duplicated(subset="id",keep="last")
krakow=krakow[~has_later_copy]

And now, for use in my project I don't really need any data except the location of the flat, its price and its surface area

In [88]:
# Price per square metre, rounded to two decimals.
price_per_meter=(krakow["price"]/krakow["squareMeters"]).round(2)

# Only the coordinates and the derived price-per-metre are kept.
krakow_clean=krakow[["latitude","longitude"]].assign(pricePerMeter=price_per_meter)

# Build point geometries from lon/lat (EPSG:4326) and reproject to EPSG:2180.
flat_points=gpd.points_from_xy(krakow_clean["longitude"],krakow_clean["latitude"])
gdf_krakow_clean=gpd.GeoDataFrame(
    data=krakow_clean,
    geometry=flat_points,
    crs=4326,
).to_crs(epsg=2180)


Now I can just save the data to a .parquet file

In [None]:
# Persist the cleaned, reprojected flats layer as a parquet file in the data folder.
flats_parquet_path=Path("data")/"krakow_flats.parquet"
gdf_krakow_clean.to_parquet(flats_parquet_path)

In [1]:
import osmnx as ox
from pathlib import Path
import geopandas as gpd
import pandas as pd
# Enable OSMnx's use_cache setting so repeated runs can reuse earlier responses
# instead of re-downloading them.
ox.settings.use_cache = True
# Enable OSMnx's log_console setting so its log messages are echoed to the console.
ox.settings.log_console = True


# Place name handed to OSMnx when querying features.
city = "Kraków, Poland"

# Maps each output category name to the OSM tag filter used to download it.
# A list value matches any of the listed tag values.
poi_config = dict(
    # --- everyday amenities, transport and other point-like features ---
    cafe=dict(amenity="cafe"),
    restaurant=dict(amenity=["restaurant", "bar", "pub"]),
    convenience=dict(shop="convenience"),
    supermarket=dict(shop="supermarket"),
    school=dict(amenity="school"),
    kindergarten=dict(amenity="kindergarten"),
    bus_stop=dict(highway="bus_stop"),
    tram_stop=dict(railway="tram_stop"),
    pharmacy=dict(amenity="pharmacy"),
    clinic=dict(amenity="clinic"),
    industrial=dict(landuse="industrial"),
    abandoned=dict(building=["ruins", "abandoned"]),
    liquor_store=dict(shop="alcohol"),
    community_centre=dict(amenity="community_centre"),
    playground=dict(leisure="playground"),
    # --- green and blue areas ---
    park=dict(leisure="park", landuse="recreation_ground"),
    forest=dict(landuse="forest", natural="wood"),
    meadow=dict(landuse=["meadow", "village_green"]),
    grassland=dict(natural="grassland"),
    water=dict(natural="water", waterway="river"),
    nature_reserve=dict(leisure="nature_reserve", boundary="protected_area"),
)


# Categories stored with their full geometry in the "nature" layer; every
# other category is collapsed to one centroid point per feature.
nature_categories={"meadow","park","forest","water","nature_reserve","grassland"}
# protect_class values kept for nature reserves.
# NOTE(review): "19" next to "1b" may be a typo for "1a" — confirm the intended classes.
kept_protect_classes=["19","1b","2","3","4","97"]
# Columns written to both output files.
output_columns=["name","category","geometry"]

poi_list=[]
nature_list=[]
for category_name, tags in poi_config.items():
    print(f"downloading: {category_name}...")
    try:
        gdf=ox.features_from_place(city, tags=tags)

        # Guarantee a non-null "name" column even when no feature carries one.
        if "name" in gdf.columns:
            gdf["name"]=gdf["name"].fillna("Unknown name")
        else:
            gdf["name"]="Unknown name"

        # Move the index into regular columns and reproject to EPSG:2180, so the
        # centroids below are computed in projected coordinates.
        gdf=gdf.reset_index().to_crs(epsg=2180)
        gdf["category"]=category_name

        if category_name in nature_categories:
            # Reserves are narrowed to the selected protection classes, but only
            # when the downloaded data carries a protect_class column at all.
            if category_name=="nature_reserve" and "protect_class" in gdf.columns:
                gdf=gdf.loc[gdf["protect_class"].isin(kept_protect_classes)]
            nature_list.append(gdf[output_columns])

        else:
            if category_name=="liquor_store": # for Kraków it doesn't matter because there's night prohibition, although for other cities it might lower quality of life
                # Only round-the-clock shops are kept; without opening hours
                # nothing can be confirmed as 24/7, so the category ends up empty.
                if "opening_hours" in gdf.columns:
                    gdf=gdf.loc[gdf["opening_hours"]=="24/7"]
                else:
                    gdf=gdf.iloc[0:0]

            # Take an explicit copy before writing the centroids: after the
            # filters above gdf can be a slice of another frame, and assigning
            # into a slice raises SettingWithCopyWarning.
            gdf=gdf[output_columns].copy()
            gdf["geometry"]=gdf["geometry"].centroid
            poi_list.append(gdf)

    except Exception as e:
        # Best-effort download: report the failing category and carry on with the rest.
        print(f"error loading {category_name}: {e}")

# Make sure the output folder exists before writing either file.
output_dir=Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

if poi_list:
    krakow_poi_full=pd.concat(poi_list, ignore_index=True)
    krakow_poi_full.to_parquet(output_dir/"krakow_poi.parquet")
    print(f"pois saved, base size: {krakow_poi_full.shape}")

if nature_list:
    krakow_nature_full=pd.concat(nature_list, ignore_index=True)
    krakow_nature_full.to_parquet(output_dir/"krakow_nature.parquet")
    print(f"nature saved, base size: {krakow_nature_full.shape}")

downloading: cafe...
downloading: restaurant...
downloading: convenience...
downloading: supermarket...
downloading: school...
downloading: kindergarten...
downloading: bus_stop...
downloading: tram_stop...
downloading: pharmacy...
downloading: clinic...
downloading: industrial...
downloading: abandoned...
downloading: liquor_store...
downloading: community_centre...
downloading: playground...
downloading: park...
downloading: forest...
downloading: meadow...
downloading: grassland...
downloading: water...
downloading: nature_reserve...
pois saved, base size: (7821, 3)
nature saved, base size: (10288, 3)


In [10]:
# Read the saved nature layer back from disk and show its first rows as a sanity check.
nature_preview=gpd.read_parquet(Path("data")/"krakow_nature.parquet")
nature_preview.head()

Unnamed: 0,name,category,geometry
0,Park Zielone Serce Podgórza,park,POINT (573867.888 242541.604)
1,Unknown name,park,POINT (570816.621 242516.638)
2,Parku Kieszonkowego,park,POINT (565832.711 243552.186)
3,Unknown name,park,POINT (570913.934 242996.942)
4,Park Fort 2 „Kościuszko” imienia Profesora Jan...,park,POINT (564040.836 243584.831)
