### installing libraries

In [1]:
!pip install pandas geopandas matplotlib numpy requests




### importing libraries

In [4]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import requests
import os

from io import StringIO


import zipfile
import geopandas as gpd

import gzip
import shutil

from shapely.geometry import Point
import geopandas as gpd

from sklearn.neighbors import KDTree
import ast

## defining data sources

### function for accessing csv's through http requests

In [5]:
#returns csv as string
def fetch_csv_from_http(url):
    response = requests.get(url)
    if response.status_code == 200:
        csv_data = StringIO(response.text)
    else:
        return "fetch failed"
    return csv_data

### downloading the points of interest dataset

In [4]:
poi_url = "https://download.geofabrik.de/europe/austria-latest-free.shp.zip"
response = requests.get(poi_url)
with open("austria-latest-free.shp.zip", "wb") as file:
    file.write(response.content)

with zipfile.ZipFile("austria-latest-free.shp.zip", "r") as zip_ref:
    zip_ref.extractall("austria_shapefiles")

KeyboardInterrupt: 

In [6]:
pois = gpd.read_file("austria_shapefiles/gis_osm_pois_free_1.shp")

print(pois.head())

     osm_id  code        fclass           name                   geometry
0  15079895  2006     telephone            NaN  POINT (16.28689 48.19691)
1  15079903  2501   supermarket       Eurospar  POINT (16.28767 48.19697)
2  15080180  2501   supermarket     Billa Plus  POINT (16.29891 48.19776)
3  15080251  2522   sports_shop  Sports Direct  POINT (16.28276 48.19288)
4  17310328  2701  tourist_info            NaN  POINT (13.49914 47.59028)


### wienlinien datasets

In [7]:
haltestellen_url = "https://www.wienerlinien.at/ogd_realtime/doku/ogd/wienerlinien-ogd-haltestellen.csv"
haltestellen_data = fetch_csv_from_http(haltestellen_url)
haltestellen_df = pd.read_csv(haltestellen_data, sep=';')

In [8]:
haltestellen_df.head()

Unnamed: 0,DIVA,PlatformText,Municipality,MunicipalityID,Longitude,Latitude
0,60200001,Schrankenberggasse,Wien,49000001,48.173801,16.389807
1,60200002,Achengasse,Wien,49000001,48.28446,16.448925
2,60200003,Ada-Christen-Gasse,Wien,49000001,48.15283,16.386043
3,60200004,Adam-Betz-Gasse,Wien,49000001,48.215581,16.535469
4,60200005,Adamovichgasse,Wien,49000001,48.142664,16.338621


### airbnb datasets

#### download and extract gz compressed dataset

In [9]:
def download_and_extract_csv_gz(url, compressed_file, extracted_file, folder_name):
    # Create the folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    try:
        # Step 1: Download the compressed file
        response = requests.get(url, stream=True)
        if response.status_code == 200:
            with open(compressed_file, "wb") as f:
                shutil.copyfileobj(response.raw, f)
            print(f"Downloaded: {compressed_file}")
        else:
            print(f"Failed to download the file. Status code: {response.status_code}")
            return None

        # Step 2: Detect file type and extract
        if zipfile.is_zipfile(compressed_file):
            print("Detected a ZIP file. Extracting...")
            with zipfile.ZipFile(compressed_file, 'r') as zip_ref:
                zip_ref.extractall(folder_name)  # Extract all files into the specified folder
                extracted_file = os.path.join(folder_name, zip_ref.namelist()[0])  # Path of the first extracted file
                print(f"Extracted to: {folder_name}")
        else:
            print("Detected a GZIP file. Extracting...")
            extracted_file = os.path.join(folder_name, extracted_file)
            with gzip.open(compressed_file, "rb") as f_in:
                with open(extracted_file, "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
            print(f"Extracted to: {extracted_file}")

        # Step 3: Load the CSV into a pandas DataFrame
        data = pd.read_csv(extracted_file)
        print("CSV loaded successfully.")
        return data

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    finally:
        # Cleanup: Remove the compressed file to save space
        if os.path.exists(compressed_file):
            os.remove(compressed_file)

In [10]:
airbnb_listings_url = "https://data.insideairbnb.com/austria/vienna/vienna/2024-09-11/data/listings.csv.gz"
airbnb_calendar_url = "https://data.insideairbnb.com/austria/vienna/vienna/2024-09-11/data/calendar.csv.gz"
airbnb_neighbourhoods_url = "https://data.insideairbnb.com/austria/vienna/vienna/2024-09-11/visualisations/neighbourhoods.csv"

# airbnb listing dataset

In [11]:
airbnb_listings_df = download_and_extract_csv_gz(airbnb_listings_url, "listings.csv.gz", "listings.csv", "airbnb")
airbnb_listings_df.head()

Downloaded: listings.csv.gz
Detected a GZIP file. Extracting...
Extracted to: airbnb/listings.csv
CSV loaded successfully.


Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,38768,https://www.airbnb.com/rooms/38768,20240911015603,2024-09-11,city scrape,central cityapartement- wifi- nice neighbourhood,39m² apartment with beautiful courtyard of the...,the Karmeliterviertel became very popular in t...,https://a0.muscache.com/pictures/ad4089a3-5355...,166283,...,4.94,4.77,4.69,,t,3,3,0,0,2.38
1,40625,https://www.airbnb.com/rooms/40625,20240911015603,2024-09-11,city scrape,"Near Palace Schönbrunn, Apt. 1",Welcome to my Apt. 1!<br /><br />This is a 2be...,The neighbourhood offers plenty of restaurants...,https://a0.muscache.com/pictures/11509144/d55c...,175131,...,4.94,4.61,4.72,,t,14,13,1,0,1.26
2,51287,https://www.airbnb.com/rooms/51287,20240911015603,2024-09-11,city scrape,little studio- next to citycenter- wifi- nice ...,small studio in new renovated old house and ve...,The neighbourhood has a lot of very nice littl...,https://a0.muscache.com/pictures/25163038/1c4e...,166283,...,4.95,4.87,4.59,,t,3,3,0,0,2.29
3,70637,https://www.airbnb.com/rooms/70637,20240911015603,2024-09-11,city scrape,Flat in the Center with Terrace,,,https://a0.muscache.com/pictures/925691/c8c1bd...,358842,...,4.76,4.81,4.72,,f,3,1,2,0,0.71
4,78416,https://www.airbnb.com/rooms/78416,20240911015603,2024-09-11,city scrape,Nice Apartment in Vienna center,"Newly renovated modern, comfortable apartment ...",Stadthalle – Viennas largest event center is j...,https://a0.muscache.com/pictures/7b8ed252-4016...,421075,...,4.43,4.41,4.47,,t,2,2,0,0,1.09


In [12]:
airbnb_listings_df.loc[:, 'amenities'] = airbnb_listings_df['amenities'].apply(ast.literal_eval)

In [13]:
from collections import Counter

# Calculate the frequency of amenities in the training data
all_amenities = [amenity for amenities_list in airbnb_listings_df['amenities'] for amenity in amenities_list]
amenity_counts = Counter(all_amenities)

# We list the 4 most popular amenities
most_common_amenities = [amenity for amenity, count in amenity_counts.most_common(4)]
print(most_common_amenities)


['Kitchen', 'Wifi', 'Essentials', 'Hair dryer']


In [14]:
# leave only top 4

def filter_amenities(amenities):
    return [amenity for amenity in amenities if amenity in most_common_amenities]

airbnb_listings_df.loc[:, 'amenities'] = airbnb_listings_df['amenities'].apply(filter_amenities)

print(airbnb_listings_df['amenities'].head())

0          [Kitchen, Essentials, Hair dryer]
1    [Wifi, Kitchen, Essentials, Hair dryer]
2    [Wifi, Kitchen, Essentials, Hair dryer]
3    [Wifi, Kitchen, Essentials, Hair dryer]
4    [Wifi, Kitchen, Essentials, Hair dryer]
Name: amenities, dtype: object


In [15]:
airbnb_listings_df.loc[:,'amenities'] = airbnb_listings_df['amenities'].apply(lambda x: ', '.join(x))

# airbnb calender dataset

In [16]:
airbnb_calendar_df = download_and_extract_csv_gz(airbnb_calendar_url, "calendar.csv.gz", "calendar.csv", "airbnb")
airbnb_calendar_df.head()

Downloaded: calendar.csv.gz
Detected a GZIP file. Extracting...
Extracted to: airbnb/calendar.csv
CSV loaded successfully.


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,275668,2024-09-11,f,$59.00,,2,1125
1,275668,2024-09-12,f,$59.00,,2,1125
2,275668,2024-09-13,f,$59.00,,2,1125
3,275668,2024-09-14,f,$59.00,,2,1125
4,275668,2024-09-15,f,$59.00,,2,1125


In [17]:
airbnb_neighbourhoods_data = fetch_csv_from_http(airbnb_neighbourhoods_url)
airbnb_neighbourhoods_df = pd.read_csv(airbnb_neighbourhoods_data)
airbnb_neighbourhoods_df.head()

Unnamed: 0,neighbourhood_group,neighbourhood
0,,Alsergrund
1,,Brigittenau
2,,Dbling
3,,Donaustadt
4,,Favoriten


### merging datasets

In [18]:
merged_airbnb = pd.merge(airbnb_calendar_df, airbnb_listings_df, left_on='listing_id', right_on='id')


In [19]:
merged_airbnb.head()

Unnamed: 0,listing_id,date,available,price_x,adjusted_price,minimum_nights_x,maximum_nights_x,id,listing_url,scrape_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,275668,2024-09-11,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,4.54,4.84,4.73,,t,3,3,0,0,1.97
1,275668,2024-09-12,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,4.54,4.84,4.73,,t,3,3,0,0,1.97
2,275668,2024-09-13,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,4.54,4.84,4.73,,t,3,3,0,0,1.97
3,275668,2024-09-14,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,4.54,4.84,4.73,,t,3,3,0,0,1.97
4,275668,2024-09-15,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,4.54,4.84,4.73,,t,3,3,0,0,1.97


In [20]:
# Convert Listings and Stops to GeoDataFrames
listings_gdf = gpd.GeoDataFrame(merged_airbnb, geometry=gpd.points_from_xy(merged_airbnb.longitude, merged_airbnb.latitude))
stops_gdf = gpd.GeoDataFrame(haltestellen_df, geometry=gpd.points_from_xy(haltestellen_df.Longitude, haltestellen_df.Latitude))

from scipy.spatial import cKDTree
import numpy as np

# Extract coordinates for stops and listings
stops_coords = np.array(list(zip(stops_gdf.geometry.x, stops_gdf.geometry.y)))
listings_coords = np.array(list(zip(listings_gdf.geometry.x, listings_gdf.geometry.y)))

# Build a KDTree for the stops
stops_tree = cKDTree(stops_coords)

# Query the tree for nearest neighbor distances and indices
distances, indices = stops_tree.query(listings_coords)

# Convert distances to kilometers
distances_km = distances / 1000.0

# Add the nearest stop distance (in kilometers) and coordinates to the listings GeoDataFrame
listings_gdf['nearest_stop_distance_km'] = distances_km
listings_gdf['nearest_stop_latitude'] = stops_gdf.iloc[indices].geometry.y.values
listings_gdf['nearest_stop_longitude'] = stops_gdf.iloc[indices].geometry.x.values
listings_gdf['nearest_stop_platform_text'] = stops_gdf.iloc[indices]['PlatformText'].values


# POI dataset

In [21]:
poi_category_map = {
    'bakery': 'cafe_restaurant', 'restaurant': 'cafe_restaurant', 'cafe': 'cafe_restaurant', 
    'fast_food': 'cafe_restaurant', 'bar': 'other', 'supermarket': 'supermarket',
    'vending_machine': 'other', 'beverages': 'other', 'food_court': 'cafe_restaurant',
    'pub': 'other',
    
    'doctors': 'other', 'dentist': 'other', 'clinic': 'other',
    'hospital': 'other', 'pharmacy': 'other', 'veterinary': 'other',
    'beauty_shop': 'other', 'hairdresser': 'other', 'optician': 'other',
    'laundry': 'other',
    
    'toy_shop': 'shops_and_retail', 'furniture_shop': 'other', 'clothes': 'shops_and_retail',
    'jeweller': 'shops_and_retail',
    'bookshop': 'shops_and_retail', 'gift_shop': 'shops_and_retail', 'stationery': 'other',
    'mobile_phone_shop': 'other', 'sports_shop': 'other', 'shoe_shop': 'shops_and_retail',
    'computer_shop': 'other', 'greengrocer': 'other', 'car_dealership': 'other',
    'department_store': 'other', 'outdoor_shop': 'other', 'car_wash': 'other',
    'car_rental': 'other', 'kiosk': 'other', 'bicycle_rental': 'other',
    
    'tourist_info': 'entertainment_leisure', 'artwork': 'entertainment_leisure', 'museum': 'entertainment_leisure',
    'theatre': 'entertainment_leisure', 'cinema': 'other', 'nightclub': 'other',
    'attraction': 'entertainment_leisure', 'sports_centre': 'other', 'park': 'park',
    'playground': 'other', 'community_centre': 'other', 'arts_centre': 'other',
    'library': 'other',
    'picnic_site': 'other', 'garden_centre': 'other', 'vending_parking': 'other',
    
    'waste_basket': 'other', 'camera_surveillance': 'other', 'police': 'other',
    'fire_station': 'other', 'toilet': 'other', 'public_building': 'other',
    'post_box': 'other', 'recycling': 'other', 'recycling_glass': 'other',
    'recycling_paper': 'other', 'recycling_metal': 'other', 'recycling_clothes': 'other',
    'recycling': 'other', 'post_office': 'other', 'town_hall': 'other',
    'shelter': 'other', 'ambulance': 'other', 'court': 'other',
    'prison': 'other',
    
    'school': 'education', 'kindergarten': 'education', 'university': 'education',
    'college': 'education', 'embassy': 'education',
    
    'atm': 'transport_and_infrastructure', 'telephone': 'other', 'bank': 'transport_and_infrastructure',
    'car_sharing': 'other', 'car_wash': 'other', 'vending_cigarette': 'other',
    'vending_any': 'other',
    
    'memorial': 'entertainment_leisure', 'bench': 'other',
    'post_box': 'other', 'wayside_cross': 'other', 'vending_cigarette': 'other',
    'vending_parking': 'other', 'stationery': 'other', 'car_wash': 'other',
}

In [22]:
pois['category'] = pois['fclass'].map(poi_category_map).fillna('other')

print(pois[['osm_id', 'fclass', 'category']])



             osm_id         fclass               category
0          15079895      telephone                  other
1          15079903    supermarket            supermarket
2          15080180    supermarket            supermarket
3          15080251    sports_shop                  other
4          17310328   tourist_info  entertainment_leisure
...             ...            ...                    ...
400434  12492703402  hunting_stand                  other
400435  12492722406   tourist_info  entertainment_leisure
400436  12492748666  hunting_stand                  other
400437  12492809566   tourist_info  entertainment_leisure
400438  12492864632   tourist_info  entertainment_leisure

[400439 rows x 3 columns]


In [23]:
# Coordinates combined_df and pois
combined_coords = listings_gdf[['longitude', 'latitude']].values
poi_coords = np.array([p.coords[0] for p in pois.geometry])

In [24]:
unique_combined_coords = np.unique(combined_coords, axis=0)

In [25]:
# Creating KDTree for POI
tree = KDTree(poi_coords, metric='euclidean')

# Search radius (in degrees; for example ~1 km if latitude/longitude in degrees)
radius = 0.01

# Find all nearest points for each coordinate from combined_df
indices_within_radius = tree.query_radius(unique_combined_coords, r=radius)
coord_to_poi = {}
for coord, indices in zip(unique_combined_coords, indices_within_radius):
    # Store fclass of nearest POI for each unique coordinate
    coord_to_poi[tuple(coord)] = pois.iloc[indices].category.tolist()

In [26]:
# map closest POIs to each row in combined_df
def map_pois(row):
    coord = (row['longitude'], row['latitude'])
    return coord_to_poi.get(coord, [])

In [27]:
listings_gdf['nearest_pois'] = listings_gdf.apply(map_pois, axis=1)

In [28]:
# Remove duplicates within the list for each row in the 'nearest_pois' column
listings_gdf['nearest_pois'] = listings_gdf['nearest_pois'].apply(lambda x: list(set(x)))


In [29]:

listings_gdf['nearest_pois'] = listings_gdf['nearest_pois'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [30]:
listings_gdf.head()

Unnamed: 0,listing_id,date,available,price_x,adjusted_price,minimum_nights_x,maximum_nights_x,id,listing_url,scrape_id,...,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,geometry,nearest_stop_distance_km,nearest_stop_latitude,nearest_stop_longitude,nearest_stop_platform_text,nearest_pois
0,275668,2024-09-11,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,0,0,1.97,POINT (16.36755 48.19328),0.044885,16.549312,48.200063,GroÃ-Enzersdorf Busbahnhof,"other, cafe_restaurant, transport_and_infrastr..."
1,275668,2024-09-12,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,0,0,1.97,POINT (16.36755 48.19328),0.044885,16.549312,48.200063,GroÃ-Enzersdorf Busbahnhof,"other, cafe_restaurant, transport_and_infrastr..."
2,275668,2024-09-13,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,0,0,1.97,POINT (16.36755 48.19328),0.044885,16.549312,48.200063,GroÃ-Enzersdorf Busbahnhof,"other, cafe_restaurant, transport_and_infrastr..."
3,275668,2024-09-14,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,0,0,1.97,POINT (16.36755 48.19328),0.044885,16.549312,48.200063,GroÃ-Enzersdorf Busbahnhof,"other, cafe_restaurant, transport_and_infrastr..."
4,275668,2024-09-15,f,$59.00,,2,1125,275668,https://www.airbnb.com/rooms/275668,20240911015603,...,3,0,0,1.97,POINT (16.36755 48.19328),0.044885,16.549312,48.200063,GroÃ-Enzersdorf Busbahnhof,"other, cafe_restaurant, transport_and_infrastr..."


## preprocessing

In [31]:
df = listings_gdf

# General information about the dataset
print(df.info())

# Statistics on numeric columns
print(df.describe())

# Check for gaps
print(df.isna().sum())

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5254476 entries, 0 to 5254475
Data columns (total 88 columns):
 #   Column                                        Dtype   
---  ------                                        -----   
 0   listing_id                                    int64   
 1   date                                          object  
 2   available                                     object  
 3   price_x                                       object  
 4   adjusted_price                                float64 
 5   minimum_nights_x                              int64   
 6   maximum_nights_x                              int64   
 7   id                                            int64   
 8   listing_url                                   object  
 9   scrape_id                                     int64   
 10  last_scraped                                  object  
 11  source                                        object  
 12  name                              

### cleaning dataset

In [32]:
# Removing unnecessary columns
columns_to_drop = [
    'scrape_id', 'last_scraped', 'id', 'listing_url', 'picture_url', 
    'host_url', 'host_thumbnail_url', 'host_picture_url', 'host_about', 
    'calendar_updated', 'calendar_last_scraped', 'host_verifications', 
    'neighbourhood_group_cleansed', 'bathrooms_text', 'instant_bookable', 
    'source', 'available', 'adjusted_price', 'name', 'description', 'neighborhood_overview',
    'host_id',	'host_name', 'host_location', 'host_neighbourhood', 'neighbourhood',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability',
    'license', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'nearest_stop_latitude', 'nearest_stop_longitude', 'minimum_minimum_nights', 
    'maximum_minimum_nights',
    'minimum_maximum_nights', 'maximum_maximum_nights', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'first_review', 'last_review', 'host_listings_count', 'host_total_listings_count', 'geometry',
    'availability_30', 'availability_60', 'availability_90', 'availability_365',
    'latitude', 'longitude', 'room_type', 'beds', 'price_y', 'minimum_nights_y', 'maximum_nights_y', 'nearest_stop_platform_text'
]

df = listings_gdf.drop(columns=columns_to_drop)

In [33]:
df.head()

Unnamed: 0,listing_id,date,price_x,minimum_nights_x,maximum_nights_x,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,reviews_per_month,nearest_stop_distance_km,nearest_pois
0,275668,2024-09-11,$59.00,2,1125,2011-11-25,within an hour,100%,100%,f,...,4.8,4.69,4.83,4.54,4.84,4.73,3,1.97,0.044885,"other, cafe_restaurant, transport_and_infrastr..."
1,275668,2024-09-12,$59.00,2,1125,2011-11-25,within an hour,100%,100%,f,...,4.8,4.69,4.83,4.54,4.84,4.73,3,1.97,0.044885,"other, cafe_restaurant, transport_and_infrastr..."
2,275668,2024-09-13,$59.00,2,1125,2011-11-25,within an hour,100%,100%,f,...,4.8,4.69,4.83,4.54,4.84,4.73,3,1.97,0.044885,"other, cafe_restaurant, transport_and_infrastr..."
3,275668,2024-09-14,$59.00,2,1125,2011-11-25,within an hour,100%,100%,f,...,4.8,4.69,4.83,4.54,4.84,4.73,3,1.97,0.044885,"other, cafe_restaurant, transport_and_infrastr..."
4,275668,2024-09-15,$59.00,2,1125,2011-11-25,within an hour,100%,100%,f,...,4.8,4.69,4.83,4.54,4.84,4.73,3,1.97,0.044885,"other, cafe_restaurant, transport_and_infrastr..."


## changing data type

In [34]:
df['price_x'] = df['price_x'].replace({'\$': '', ',': ''}, regex=True).astype(float)

df['date'] = pd.to_datetime(df['date'])
df['host_since'] = pd.to_datetime(df['host_since'])

df['host_response_rate'] = df['host_response_rate'].replace({'%': ''}, regex=True).astype(float)
df['host_acceptance_rate'] = df['host_acceptance_rate'].replace({'%': ''}, regex=True).astype(float)

df['neighbourhood_cleansed'] = df['neighbourhood_cleansed'].astype('category')
df['property_type'] = df['property_type'].astype('category')

## split the datset into train, val, test

In [35]:
# Splitting data by features
from sklearn.model_selection import train_test_split

# Unique listing_id
listing_ids = df['listing_id'].unique()

# Splitting features into training, validation, and testing
train_ids, test_ids = train_test_split(listing_ids, test_size=0.3, random_state=42)
val_ids, test_ids = train_test_split(test_ids, test_size=0.5, random_state=42)

# Creating masks for samples
train_data = df[df['listing_id'].isin(train_ids)]
val_data = df[df['listing_id'].isin(val_ids)]
test_data = df[df['listing_id'].isin(test_ids)]

## preprocess numeric columns

In [36]:
numeric_columns = df.select_dtypes(include=['number']).columns
print(numeric_columns)

Index(['listing_id', 'price_x', 'minimum_nights_x', 'maximum_nights_x',
       'host_response_rate', 'host_acceptance_rate', 'accommodates',
       'bathrooms', 'bedrooms', 'number_of_reviews', 'review_scores_rating',
       'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_checkin', 'review_scores_communication',
       'review_scores_location', 'review_scores_value',
       'calculated_host_listings_count', 'reviews_per_month',
       'nearest_stop_distance_km'],
      dtype='object')


In [37]:
# Check for missing values ​​for numeric columns
missing_values = train_data[numeric_columns].isnull().sum()
print(missing_values)

listing_id                             0
price_x                                0
minimum_nights_x                       0
maximum_nights_x                       0
host_response_rate                936582
host_acceptance_rate              797152
accommodates                           0
bathrooms                         963576
bedrooms                          365730
number_of_reviews                      0
review_scores_rating              612430
review_scores_accuracy            612430
review_scores_cleanliness         612430
review_scores_checkin             612430
review_scores_communication       612430
review_scores_location            612430
review_scores_value               612795
calculated_host_listings_count         0
reviews_per_month                 612430
nearest_stop_distance_km               0
dtype: int64


In [38]:
# replace values in host_response_rate (if nan or < 100 -> 0, if 100 -> 1)

import numpy as np

def categorize_vectorized(values):
    categories = np.zeros(len(values), dtype=int)
    categories[values < 100] = 0
    categories[values == 100] = 1
    return categories

train_data.loc[:, 'host_response_rate'] = categorize_vectorized(train_data['host_response_rate'].fillna(-1).values)
val_data.loc[:, 'host_response_rate'] = categorize_vectorized(val_data['host_response_rate'].fillna(-1).values)
test_data.loc[:, 'host_response_rate'] = categorize_vectorized(test_data['host_response_rate'].fillna(-1).values)

In [39]:
# replace values in host_acceptance_rate (if nan or < 100 -> 0, if 100 -> 1)

train_data.loc[:, 'host_acceptance_rate'] = categorize_vectorized(train_data['host_acceptance_rate'].fillna(-1).values)
val_data.loc[:, 'host_acceptance_rate'] = categorize_vectorized(val_data['host_acceptance_rate'].fillna(-1).values)
test_data.loc[:, 'host_acceptance_rate'] = categorize_vectorized(test_data['host_acceptance_rate'].fillna(-1).values)

In [40]:
# If there is no bathroom data, replace missing values ​​with 0

train_data.loc[:,'bathrooms'] = train_data['bathrooms'].fillna(0)
val_data.loc[:,'bathrooms'] = val_data['bathrooms'].fillna(0)
test_data.loc[:,'bathrooms'] = test_data['bathrooms'].fillna(0)

In [41]:
# If there is no bedroom data, replace missing values ​​with 1

train_data.loc[:,'bedrooms'] = train_data['bedrooms'].fillna(1)
val_data.loc[:,'bedrooms'] = val_data['bedrooms'].fillna(1)
test_data.loc[:,'bedrooms'] = test_data['bedrooms'].fillna(1)

In [42]:
# If there is no review_score data, replace missing values ​​with 0

review_columns = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 
                  'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 
                  'review_scores_value', 'reviews_per_month']

for df in [train_data, val_data, test_data]:
    df.loc[:, review_columns] = df[review_columns].fillna(0)

## preprocess categorical columns

In [43]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
print(categorical_columns)

Index(['host_response_time', 'host_is_superhost', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood_cleansed', 'property_type',
       'amenities', 'nearest_pois'],
      dtype='object')


In [44]:
# Check for missing values ​​for categorical columns
missing_values_cat = train_data[categorical_columns].isnull().sum()
print(missing_values_cat)

host_response_time        936582
host_is_superhost         138700
host_has_profile_pic         365
host_identity_verified       365
neighbourhood_cleansed         0
property_type                  0
amenities                      0
nearest_pois                   0
dtype: int64


In [45]:
# Remove rows where 'id' is 7079941
train_data = train_data[train_data['listing_id'] != 7079941]
val_data = val_data[val_data['listing_id'] != 7079941]
test_data = test_data[test_data['listing_id'] != 7079941]

In [46]:
missing_values_cat = train_data[categorical_columns].isnull().sum()
print(missing_values_cat)

host_response_time        936217
host_is_superhost         138700
host_has_profile_pic           0
host_identity_verified         0
neighbourhood_cleansed         0
property_type                  0
amenities                      0
nearest_pois                   0
dtype: int64


In [47]:
# Applying One-Hot Encoding with .loc
train_data = pd.get_dummies(train_data, columns=['host_response_time'], drop_first=True)
val_data = pd.get_dummies(val_data, columns=['host_response_time'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['host_response_time'], drop_first=True)

In [48]:
# Replace NaN with 'f' using .loc
train_data.loc[:, 'host_is_superhost'] = train_data['host_is_superhost'].fillna('f')
val_data.loc[:, 'host_is_superhost'] = val_data['host_is_superhost'].fillna('f')
test_data.loc[:, 'host_is_superhost'] = test_data['host_is_superhost'].fillna('f')

# Apply One-Hot Encoding using get_dummies
train_data = pd.get_dummies(train_data, columns=['host_is_superhost'], drop_first=True)
val_data = pd.get_dummies(val_data, columns=['host_is_superhost'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['host_is_superhost'], drop_first=True)


In [49]:
# Apply One-Hot Encoding using get_dummies for both columns
train_data = pd.get_dummies(train_data, columns=['host_has_profile_pic'], drop_first=True)
val_data = pd.get_dummies(val_data, columns=['host_has_profile_pic'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['host_has_profile_pic'], drop_first=True)

train_data = pd.get_dummies(train_data, columns=['host_identity_verified'], drop_first=True)
val_data = pd.get_dummies(val_data, columns=['host_identity_verified'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['host_identity_verified'], drop_first=True)

In [50]:
train_data['neighbourhood_cleansed'] = train_data['neighbourhood_cleansed'].str.replace('Dbling', 'Dobling')
val_data['neighbourhood_cleansed'] = val_data['neighbourhood_cleansed'].str.replace('Dbling', 'Dobling')
test_data['neighbourhood_cleansed'] = test_data['neighbourhood_cleansed'].str.replace('Dbling', 'Dobling')

train_data['neighbourhood_cleansed'] = train_data['neighbourhood_cleansed'].str.replace('Whring', 'Wahring')
val_data['neighbourhood_cleansed'] = val_data['neighbourhood_cleansed'].str.replace('Whring', 'Wahring')
test_data['neighbourhood_cleansed'] = test_data['neighbourhood_cleansed'].str.replace('Whring', 'Wahring')

train_data['neighbourhood_cleansed'] = train_data['neighbourhood_cleansed'].str.replace('Rudolfsheim-Fnfhaus', 'Rudolfsheim-Funfhaus')
val_data['neighbourhood_cleansed'] = val_data['neighbourhood_cleansed'].str.replace('Rudolfsheim-Fnfhaus', 'Rudolfsheim-Funfhaus')
test_data['neighbourhood_cleansed'] = test_data['neighbourhood_cleansed'].str.replace('Rudolfsheim-Fnfhaus', 'Rudolfsheim-Funfhaus')


In [51]:
train_data = pd.get_dummies(train_data, columns=['neighbourhood_cleansed'], drop_first=True)
val_data = pd.get_dummies(val_data, columns=['neighbourhood_cleansed'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['neighbourhood_cleansed'], drop_first=True)

In [52]:
# Function to merge categories in property types
def reduce_property_type(property_type):
    if 'Entire' in property_type:
        return 'Entire'
    elif 'Private room' in property_type:
        return 'Private room'
    elif 'Shared room' in property_type:
        return 'Shared room'
    elif 'Room in' in property_type:
        return 'Room in hotel or similar'
    else:
        return 'Other'

# Apply the function with .loc
train_data.loc[:, 'property_type'] = train_data['property_type'].apply(reduce_property_type)
val_data.loc[:, 'property_type'] = val_data['property_type'].apply(reduce_property_type)
test_data.loc[:, 'property_type'] = test_data['property_type'].apply(reduce_property_type)

# Apply One-Hot Encoding with .loc
train_data = pd.get_dummies(train_data, columns=['property_type'], drop_first=True)
val_data = pd.get_dummies(val_data, columns=['property_type'], drop_first=True)
test_data = pd.get_dummies(test_data, columns=['property_type'], drop_first=True)

In [53]:
#  One-Hot Encoding
train_encoded = train_data['nearest_pois'].str.get_dummies(sep=', ')
val_encoded = val_data['nearest_pois'].str.get_dummies(sep=', ')
test_encoded = test_data['nearest_pois'].str.get_dummies(sep=', ')

# Append the encoded data back to the original DataFrame
train_data = pd.concat([train_data, train_encoded], axis=1)
val_data = pd.concat([val_data, val_encoded], axis=1)
test_data = pd.concat([test_data, test_encoded], axis=1)

In [54]:
# drop column 'nearest_pois' from DataFrame
train_data = train_data.drop(columns=['nearest_pois'])
val_data = val_data.drop(columns=['nearest_pois'])
test_data = test_data.drop(columns=['nearest_pois'])

In [55]:
#  One-Hot Encoding
train_encoded = train_data['amenities'].str.get_dummies(sep=', ')
val_encoded = val_data['amenities'].str.get_dummies(sep=', ')
test_encoded = test_data['amenities'].str.get_dummies(sep=', ')

# Append the encoded data back to the original DataFrame
train_data = pd.concat([train_data, train_encoded], axis=1)
val_data = pd.concat([val_data, val_encoded], axis=1)
test_data = pd.concat([test_data, test_encoded], axis=1)

In [56]:
train_data.columns

Index(['listing_id', 'date', 'price_x', 'minimum_nights_x', 'maximum_nights_x',
       'host_since', 'host_response_rate', 'host_acceptance_rate',
       'accommodates', 'bathrooms', 'bedrooms', 'amenities',
       'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'calculated_host_listings_count',
       'reviews_per_month', 'nearest_stop_distance_km',
       'host_response_time_within a day',
       'host_response_time_within a few hours',
       'host_response_time_within an hour', 'host_is_superhost_t',
       'host_has_profile_pic_t', 'host_identity_verified_t',
       'neighbourhood_cleansed_Brigittenau', 'neighbourhood_cleansed_Dobling',
       'neighbourhood_cleansed_Donaustadt', 'neighbourhood_cleansed_Favoriten',
       'neighbourhood_cleansed_Floridsdorf', 'neighbourhood_cleansed_Hernals',
       'neigh

In [57]:
# Save the datasets to CSV files
train_data.to_csv('train_data_processed.csv', index=False)
val_data.to_csv('val_data_processed.csv', index=False)
test_data.to_csv('test_data_processed.csv', index=False)

In [58]:
train_data.head()

Unnamed: 0,listing_id,date,price_x,minimum_nights_x,maximum_nights_x,host_since,host_response_rate,host_acceptance_rate,accommodates,bathrooms,...,entertainment_leisure,other,park,shops_and_retail,supermarket,transport_and_infrastructure,Essentials,Hair dryer,Kitchen,Wifi
365,729796,2024-09-11,42.0,3,30,2012-10-07,0.0,0.0,5,0.0,...,1,1,0,1,1,1,0,1,1,1
366,729796,2024-09-12,42.0,3,30,2012-10-07,0.0,0.0,5,0.0,...,1,1,0,1,1,1,0,1,1,1
367,729796,2024-09-13,42.0,3,30,2012-10-07,0.0,0.0,5,0.0,...,1,1,0,1,1,1,0,1,1,1
368,729796,2024-09-14,42.0,3,30,2012-10-07,0.0,0.0,5,0.0,...,1,1,0,1,1,1,0,1,1,1
369,729796,2024-09-15,42.0,3,30,2012-10-07,0.0,0.0,5,0.0,...,1,1,0,1,1,1,0,1,1,1


In [59]:
# Check for missing values in the entire dataset
missing_values = train_data.isnull().sum()

# Print columns with missing values and their count
print(missing_values[missing_values > 0])

Series([], dtype: int64)


In [60]:
# Renaming columns
train_data.columns = train_data.columns.str.replace('host_is_superhost_t', 'is_superhost')
train_data.columns = train_data.columns.str.replace('host_has_profile_pic_t', 'host_has_profile_pic')
train_data.columns = train_data.columns.str.replace('host_identity_verified_t', 'host_identity_verified')
train_data.columns = train_data.columns.str.replace('neighbourhood_cleansed_', 'neighbourhood_')
train_data.columns = train_data.columns.str.replace('property_type_', 'property_type_')

val_data.columns = val_data.columns.str.replace('host_is_superhost_t', 'is_superhost')
val_data.columns = val_data.columns.str.replace('host_has_profile_pic_t', 'host_has_profile_pic')
val_data.columns = val_data.columns.str.replace('host_identity_verified_t', 'host_identity_verified')
val_data.columns = val_data.columns.str.replace('neighbourhood_cleansed_', 'neighbourhood_')
val_data.columns = val_data.columns.str.replace('property_type_', 'property_type_')

test_data.columns = test_data.columns.str.replace('host_is_superhost_t', 'is_superhost')
test_data.columns = test_data.columns.str.replace('host_has_profile_pic_t', 'host_has_profile_pic')
test_data.columns = test_data.columns.str.replace('host_identity_verified_t', 'host_identity_verified')
test_data.columns = test_data.columns.str.replace('neighbourhood_cleansed_', 'neighbourhood_')
test_data.columns = test_data.columns.str.replace('property_type_', 'property_type_')

train_data.columns = train_data.columns.str.replace('price_x', 'price')
train_data.columns = train_data.columns.str.replace('minimum_nights_x', 'min_nights')
train_data.columns = train_data.columns.str.replace('maximum_nights_x', 'max_nights')
train_data.columns = train_data.columns.str.replace('host_response_rate', 'host_response_rate')
train_data.columns = train_data.columns.str.replace('host_acceptance_rate', 'host_acceptance_rate')
train_data.columns = train_data.columns.str.replace('calculated_host_listings_count', 'host_has_listings')

val_data.columns = val_data.columns.str.replace('price_x', 'price')
val_data.columns = val_data.columns.str.replace('minimum_nights_x', 'min_nights')
val_data.columns = val_data.columns.str.replace('maximum_nights_x', 'max_nights')
val_data.columns = val_data.columns.str.replace('host_response_rate', 'host_response_rate')
val_data.columns = val_data.columns.str.replace('host_acceptance_rate', 'host_acceptance_rate')
val_data.columns = val_data.columns.str.replace('calculated_host_listings_count', 'host_has_listings')

test_data.columns = test_data.columns.str.replace('price_x', 'price')
test_data.columns = test_data.columns.str.replace('minimum_nights_x', 'min_nights')
test_data.columns = test_data.columns.str.replace('maximum_nights_x', 'max_nights')
test_data.columns = test_data.columns.str.replace('host_response_rate', 'host_response_rate')
test_data.columns = test_data.columns.str.replace('host_acceptance_rate', 'host_acceptance_rate')
test_data.columns = test_data.columns.str.replace('calculated_host_listings_count', 'host_has_listings')


train_data.columns = train_data.columns.str.replace(' ', '_')
val_data.columns = val_data.columns.str.replace(' ', '_')
test_data.columns = test_data.columns.str.replace(' ', '_')

print(train_data.columns)

Index(['listing_id', 'date', 'price', 'min_nights', 'max_nights', 'host_since',
       'host_response_rate', 'host_acceptance_rate', 'accommodates',
       'bathrooms', 'bedrooms', 'amenities', 'number_of_reviews',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'host_has_listings', 'reviews_per_month',
       'nearest_stop_distance_km', 'host_response_time_within_a_day',
       'host_response_time_within_a_few_hours',
       'host_response_time_within_an_hour', 'is_superhost',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_Brigittenau', 'neighbourhood_Dobling',
       'neighbourhood_Donaustadt', 'neighbourhood_Favoriten',
       'neighbourhood_Floridsdorf', 'neighbourhood_Hernals',
       'neighbourhood_Hietzing', 'neighbourhood_Innere_Stadt',
       'neighbourhood_Josefstadt', 'neighbourhood

In [None]:
# Save the datasets to CSV files
train_data.to_csv('train_data_processed.csv', index=False)
val_data.to_csv('val_data_processed.csv', index=False)
test_data.to_csv('test_data_processed.csv', index=False)

In [None]:
# Load the datasets from CSV files
train_data = pd.read_csv('train_data_processed.csv')
val_data = pd.read_csv('val_data_processed.csv')
test_data = pd.read_csv('test_data_processed.csv')

# Check the loaded datasets
print(train_data.head())
print(val_data.head())
print(test_data.head())