# Data-Oriented Programming Paradigms 2024W: Exercise 2
# Group: 31

### Questions we plan to answer:
1. What features can be used to predict whether a hotel or Airbnb will be successful? (For example, why other hotels are more popular than mine)

Here we can use features of neighbourhood (density of POIs, average price of accommodations, public transport availability) or features of the accommodation (prices, breakfast availability, etc)

2. How can we predict the revenue potential of an Airbnb listing in Vienna based on its location relative to public transport, POIs, and the overall tourism trend in the city? 

3. If I plan to open one more accommodation, based on current Airbnb and hotel performance data, can we predict which neighborhoods in Vienna will see a rise in Airbnb demand in the future?


### Datasets we plan to use:
Transport:
https://www.data.gv.at/katalog/dataset/36a8b9e9-909e-4605-a7ba-686ee3e1b8bf?utm_source=chatgpt.com#resources
https://www.data.gv.at/katalog/dataset/f1f6f15d-2faa-4b62-b78b-80599dd1c66e?utm_source=chatgpt.com#resources

Airbnb:
https://insideairbnb.com/get-the-data/

Overall tourism:
https://ec.europa.eu/eurostat/web/tourism/database

POI:
https://download.geofabrik.de/europe.html


## Load Data

In [11]:
!pip install geopandas requests



In [2]:
pip install pyproj


Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install fiona

Collecting fiona
  Downloading fiona-1.10.1-cp311-cp311-macosx_10_15_x86_64.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m540.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting click-plugins>=1.0 (from fiona)
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Collecting cligj>=0.5 (from fiona)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Downloading fiona-1.10.1-cp311-cp311-macosx_10_15_x86_64.whl (16.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Installing collected packages: cligj, click-plugins, fiona
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.10.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import os
import requests
import scipy
import numpy as np

In [2]:
import requests

url = "https://download.geofabrik.de/europe/austria-latest-free.shp.zip"
response = requests.get(url)
with open("austria-latest-free.shp.zip", "wb") as file:
    file.write(response.content)

import zipfile

with zipfile.ZipFile("austria-latest-free.shp.zip", "r") as zip_ref:
    zip_ref.extractall("austria_shapefiles")

KeyboardInterrupt: 

In [4]:
import geopandas as gpd

pois = gpd.read_file("austria_shapefiles/gis_osm_pois_free_1.shp")

print(pois.head())

     osm_id  code        fclass           name                   geometry
0  15079895  2006     telephone           None  POINT (16.28689 48.19691)
1  15079903  2501   supermarket       Eurospar  POINT (16.28767 48.19697)
2  15080180  2501   supermarket     Billa Plus  POINT (16.29891 48.19776)
3  15080251  2522   sports_shop  Sports Direct  POINT (16.28276 48.19288)
4  17310328  2701  tourist_info           None  POINT (13.49914 47.59028)


  self.crs = crs
  data.array.crs = crs
  level.array.crs = crs


In [5]:
path_airbnb = "Data/Airbnb/"

calendar_path = os.path.join(path_airbnb, "calendar_detailed.csv")
listings_path = os.path.join(path_airbnb, "listings_detailed.csv")
neighbourhoods_path = os.path.join(path_airbnb, "neighbourhoods.csv")
reviews_path = os.path.join(path_airbnb, "reviews_detailed.csv")

calendar_df = pd.read_csv(calendar_path)
listings_df = pd.read_csv(listings_path)
neighbourhoods_df = pd.read_csv(neighbourhoods_path)
reviews_df = pd.read_csv(reviews_path)

path_transport = "Data/Transport/"

stops_path = os.path.join(path_transport, "OEFFHALTESTOGD.csv")
lines_path = os.path.join(path_transport, "OEFFLINIENOGD.csv")

stops_df = pd.read_csv(stops_path)
lines_df = pd.read_csv(lines_path)

path_tourism = "Data/Tourism/"

tour_occ_arm_path = os.path.join(path_tourism, "estat_tour_occ_arm.tsv")
tour_occ_mnor_path = os.path.join(path_tourism, "estat_tour_occ_mnor.tsv")
tour_occ_nim_path = os.path.join(path_tourism, "estat_tour_occ_nim.tsv")

tour_occ_arm_df = pd.read_csv(tour_occ_arm_path, sep='\t')
tour_occ_mnor_df = pd.read_csv(tour_occ_mnor_path, sep='\t')
tour_occ_nim_df = pd.read_csv(tour_occ_nim_path, sep='\t')

## Merging files

In [6]:
combined_df = pd.merge(calendar_df, listings_df, how='inner', left_on='listing_id', right_on='id')

In [7]:
combined_df['date'] = pd.to_datetime(combined_df['date'], errors='coerce')
reviews_df['date'] = pd.to_datetime(reviews_df['date'], errors='coerce')

In [8]:
combined_df = pd.merge(combined_df, reviews_df, how='left', on=['listing_id', 'date'])

Calculate nearest points between accomodation and POI

In [9]:
from sklearn.neighbors import KDTree
import numpy as np
import pandas as pd


In [10]:
# Coordinates combined_df and pois
combined_coords = combined_df[['longitude', 'latitude']].values
poi_coords = np.array([p.coords[0] for p in pois.geometry])

In [11]:
unique_combined_coords = np.unique(combined_coords, axis=0)

In [12]:
# Creating KDTree for POI
tree = KDTree(poi_coords, metric='euclidean')

# Search radius (in degrees; for example ~1 km if latitude/longitude in degrees)
radius = 0.01

# Find all nearest points for each coordinate from combined_df
indices_within_radius = tree.query_radius(unique_combined_coords, r=radius)

In [13]:
coord_to_poi = {}
for coord, indices in zip(unique_combined_coords, indices_within_radius):
    # Store osm_id of nearest POI for each unique coordinate
    coord_to_poi[tuple(coord)] = pois.iloc[indices].osm_id.tolist()

In [14]:
# map closest POIs to each row in combined_df
def map_pois(row):
    coord = (row['longitude'], row['latitude'])
    return coord_to_poi.get(coord, [])

In [15]:
combined_df['nearest_osm_ids'] = combined_df.apply(map_pois, axis=1)

In [16]:
combined_df

Unnamed: 0,listing_id,date,available,price_x,adjusted_price,minimum_nights_x,maximum_nights_x,id_x,listing_url,scrape_id,...,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,id_y,reviewer_id,reviewer_name,comments,nearest_osm_ids
0,275668,2024-09-11,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,3,0,0,1.97,,,,,"[617621809, 7906474310, 1290993459, 1196732025..."
1,275668,2024-09-12,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,3,0,0,1.97,,,,,"[617621809, 7906474310, 1290993459, 1196732025..."
2,275668,2024-09-13,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,3,0,0,1.97,,,,,"[617621809, 7906474310, 1290993459, 1196732025..."
3,275668,2024-09-14,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,3,0,0,1.97,,,,,"[617621809, 7906474310, 1290993459, 1196732025..."
4,275668,2024-09-15,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,3,0,0,1.97,,,,,"[617621809, 7906474310, 1290993459, 1196732025..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254471,1242124355670149881,2025-09-06,f,$999.00,,5,179,1242124355670149881,https://www.airbnb.com/rooms/1242124355670149881,20240911015603,...,134,48,85,1,,,,,,"[11334790048, 7725715981, 7725703605, 87455728..."
5254472,1242124355670149881,2025-09-07,f,$999.00,,5,179,1242124355670149881,https://www.airbnb.com/rooms/1242124355670149881,20240911015603,...,134,48,85,1,,,,,,"[11334790048, 7725715981, 7725703605, 87455728..."
5254473,1242124355670149881,2025-09-08,f,$999.00,,5,179,1242124355670149881,https://www.airbnb.com/rooms/1242124355670149881,20240911015603,...,134,48,85,1,,,,,,"[11334790048, 7725715981, 7725703605, 87455728..."
5254474,1242124355670149881,2025-09-09,f,$999.00,,5,179,1242124355670149881,https://www.airbnb.com/rooms/1242124355670149881,20240911015603,...,134,48,85,1,,,,,,"[11334790048, 7725715981, 7725703605, 87455728..."
