#### Setup

In [1]:
import pandas as pd
import funda_data_processing_functions as fdp
from tqdm import tqdm
import matplotlib.pyplot as plt
import googlemaps
import folium
from datetime import datetime

from config import google_api_key
# Google Maps client; the API key is imported from config.py rather than written in the notebook.
gmaps = googlemaps.Client(key = google_api_key)

# Load the raw listings.
# NOTE(review): the file name suggests duplicates were already removed upstream — confirm.
raw_data = pd.read_csv('no_duplicates.csv')

#### General Preprocessing

In [None]:
# Preprocess the raw listings with the project's helper module.
# NOTE(review): is_past=True presumably selects the handling for already-sold listings — confirm in fdp.
df = fdp.preprocess_data(raw_data, is_past=True)
# Last expression of the cell: shows (rows, columns) of the preprocessed frame.
df.shape

#### Geocoding using Google Maps API (Address ---> Coordinates)

In [None]:
# geocode_all=True geocodes every address; with geocode_all=False only the rows
# whose coordinates are still NaN are geocoded.
# NOTE(review): presumably every geocoded address is a Google Maps API request — confirm
# before re-running with geocode_all=True on the full data.
#df = df[:50] # for testing on less data
df = fdp.geocode_addresses(df, geocode_all=True)

In [15]:
# Keep only the listings that have both a latitude and a longitude.
coordinate_columns = ['latitude', 'longitude']
df = df.dropna(subset=coordinate_columns)

##### Add biking time to city center
*This could easily be adapted to walking or driving times by changing the `mode` used in the function.*

In [None]:
# Add the biking time to the city center for each listing.
# NOTE(review): calculate_for_missing_only=True presumably skips rows that already have a value — confirm in fdp.
df = fdp.add_biking_time(df, calculate_for_missing_only=True)

#### Listings as Points on a Map

In [None]:
# All listings as points on a map, with extra info shown on click

# Create a base map centered on the approximate center of the data
f = folium.Figure(width=900, height=400)

m = folium.Map(location=[df['latitude'].mean(), df['longitude'].mean()],
               zoom_start=12, tiles='CartoDB Voyager').add_to(f)

# Add one marker per listing; its popup shows biking time and sale price
for idx, row in df.iterrows():
    # round() raises ValueError on NaN, so a single listing without a biking
    # time would otherwise abort the whole map. Show a placeholder instead.
    if pd.notna(row['biking_time']):
        biking_time_text = f"{round(row['biking_time'])} minutes"
    else:
        biking_time_text = "n/a"

    # Prepare the popup text (raw HTML, hence parse_html=False below)
    popup_html = f"""
    <strong>Biking Time:</strong> {biking_time_text}<br>
    <strong>Price Sold:</strong> {row['price_sold']}€
    """
    popup = folium.Popup(popup_html, parse_html=False)

    # radius is in meters; the thick stroke (weight) is what makes the dot visible
    folium.Circle([row['latitude'], row['longitude']],
                  radius=1,
                  weight=4,
                  color='darkblue',
                  popup=popup).add_to(m)

# Display the map
m

**Further, more specific processing**

Dropping unneeded columns, then converting to categoricals

In [None]:
# Inspect the available columns before deciding which ones to drop.
df.columns

In [5]:
# Columns that are left out of the reduced frame.
columns_to_delete = [
    'price',
    'listed_since',
    'zip_code',
    'size',
    'year',
    'kind_of_house',
    'num_of_rooms',
    'num_of_bathrooms',
    'layout',
    'ownership',
    'exteriors',
    'date_list',
    'term',
    'last_ask_price',
    'last_ask_price_m2',
    'log_id',
]

# drop() returns a new frame, so `df` itself keeps all of its columns.
df_reduced = df.drop(columns=columns_to_delete)

In [None]:
# Check which columns are left in the reduced frame.
df_reduced.columns

**Categoricals with correct Reference Levels**

In [7]:
# Category order per column. The first entry is listed first on purpose.
# NOTE(review): presumably the first entry is the reference level used by
# downstream models — confirm against the modelling code.
category_levels = {
    'building_type': ['Bestaande bouw', 'Nieuwbouw'],
    'energy_label': ['C', '>A+', 'A', 'B', 'D', 'E', 'F', 'G'],
    'house_type': ['appartement', 'huis'],
}

for column, levels in category_levels.items():
    # set_categories() silently turns every value that is not in `levels`
    # into NaN, so report such values before they disappear.
    observed = df_reduced[column].dropna()
    unexpected = observed[~observed.isin(levels)]
    if not unexpected.empty:
        print(f"{column}: {len(unexpected)} value(s) outside the expected "
              f"categories will become NaN: {unexpected.value_counts().to_dict()}")

    df_reduced[column] = (
        df_reduced[column]
        .astype('category')
        .cat.set_categories(levels)
    )

In [None]:
# Number of listings per energy label.
df_reduced['energy_label'].value_counts()

#### Export to CSV

In [9]:
# Write the reduced frame to CSV; index=False leaves the DataFrame index out of the file.
df_reduced.to_csv('funda_all_cleaned_may_23.csv', index=False)