# Evaluation Preprocessing
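Generate a dataset in which every reservation is tagged with the IDs of the small, large, and Voronoi grid cells that contain its start location, then split it into training, testing, and validation sets. These datasets will be used to evaluate the performance of the model.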

In [13]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import box
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [14]:
# Configuration: every path and tunable parameter used by the cells below.

# Pickled reservations table loaded with pd.read_pickle as the notebook's input.
input_file_path = './pickles/gridded_reservations_voronoi.pickle'
# Path to the pickled Voronoi cells; not read by any cell shown in this notebook.
voronoi_grid_path = './pickles/voronoi_cells.pkl'


# Number of K-Means clusters ("communities") requested in create_communities.
num_of_clusters = 20
# Seed shared by K-Means and by the shuffle that precedes the train/test/val split.
random_state = 42

# Edge lengths of the two regular grids; passed to create_grid as size_in_meters.
small_grid_size = 200
large_grid_size = 1000

# Fractions of the shuffled data for each split. The split cell cuts at
# training_size and training_size + testing_size; validation gets the remainder.
training_size = 0.7
testing_size = 0.2
validation_size = 0.1

# Output pickles, one per split.
train_output_file_path = './pickles/reservations_training.pickle'
test_output_file_path = './pickles/reservations_testing.pickle'
val_output_file_path = './pickles/reservations_validation.pickle'

In [15]:
# Load the gridded reservation data.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced by this project's own preprocessing.
reservations = pd.read_pickle(input_file_path)

# Weather fields that the cells below never read; only temperature (*_temp)
# and precipitation (*_prcp) are kept. Each label is listed exactly once.
unused_weather_columns = [
    'start_dwpt', 'end_dwpt',
    'start_rhum', 'end_rhum',
    'start_snow', 'end_snow',
    'start_wdir', 'end_wdir',
    'start_wspd', 'end_wspd',
    'start_pres', 'end_pres',
    'start_tsun', 'end_tsun',
    'start_coco', 'end_coco',
    'start_wpgt', 'end_wpgt',
]
# Non-inplace drop: the name is rebound instead of mutating the loaded object.
reservations = reservations.drop(columns=unused_weather_columns)
reservations.head()

Unnamed: 0,startTime,endTime,startLat,startLon,endLat,endLon,start_temp,start_prcp,end_temp,end_prcp,startGridId,endGridId
0,2017-10-14 14:15:35,2017-10-14 14:38:17,48.782722,9.181855,48.790642,9.171309,20.8,0.0,20.8,0.0,58.0,107.0
1,2017-10-14 14:22:14,2017-10-14 14:51:56,48.771416,9.15748,48.771828,9.156966,20.8,0.0,20.8,0.0,543.0,543.0
2,2017-10-14 14:24:42,2017-10-14 14:38:52,48.774971,9.129474,48.778435,9.127503,20.8,0.0,20.8,0.0,781.0,
3,2017-10-14 14:25:34,2017-10-14 14:42:13,48.785652,9.159463,48.793102,9.159768,20.8,0.0,20.8,0.0,658.0,
4,2017-10-14 14:25:54,2017-10-14 14:44:53,48.784927,9.153793,48.784882,9.206255,20.8,0.0,20.8,0.0,487.0,466.0


In [16]:
# Collapse the start/end weather readings into one value per reservation and
# keep the start cell as the Voronoi grid id.
reservations = (
    reservations
    .assign(
        voronoi_grid_id=lambda frame: frame['startGridId'],
        # Row-wise mean of start and end reading; 0 when both are missing.
        temperature=lambda frame: frame[['start_temp', 'end_temp']].mean(axis=1).fillna(0),
        precipitation=lambda frame: frame[['start_prcp', 'end_prcp']].mean(axis=1).fillna(0),
    )
    .drop(columns=['startGridId', 'endGridId', 'start_temp', 'end_temp', 'start_prcp', 'end_prcp'])
    # Remove every row that still has a missing value in any remaining column
    # (this includes reservations without a start grid id).
    .dropna()
    # Safe to cast now that no NaN is left in the id column.
    .astype({'voronoi_grid_id': int})
)
reservations.head()

Unnamed: 0,startTime,endTime,startLat,startLon,endLat,endLon,voronoi_grid_id,temperature,precipitation
0,2017-10-14 14:15:35,2017-10-14 14:38:17,48.782722,9.181855,48.790642,9.171309,58,20.8,0.0
1,2017-10-14 14:22:14,2017-10-14 14:51:56,48.771416,9.15748,48.771828,9.156966,543,20.8,0.0
2,2017-10-14 14:24:42,2017-10-14 14:38:52,48.774971,9.129474,48.778435,9.127503,781,20.8,0.0
3,2017-10-14 14:25:34,2017-10-14 14:42:13,48.785652,9.159463,48.793102,9.159768,658,20.8,0.0
4,2017-10-14 14:25:54,2017-10-14 14:44:53,48.784927,9.153793,48.784882,9.206255,487,20.8,0.0


In [17]:
def create_grid(size_in_meters, reservation_data):
    """Build a regular grid of boxes covering all start and end locations.

    Parameters
    ----------
    size_in_meters : float
        Nominal edge length of one grid cell. Must be positive.
    reservation_data : pandas.DataFrame
        Must provide the columns ``startLon``, ``startLat``, ``endLon`` and
        ``endLat`` (coordinates in degrees, interpreted as EPSG:4326).

    Returns
    -------
    geopandas.GeoDataFrame
        One row per cell with a single ``geometry`` column (EPSG:4326).
        Cells are ordered column by column: x is the outer loop, y the inner
        one, so the positional index is ``x_index * len(y_range) + y_index``.

    Raises
    ------
    ValueError
        If ``size_in_meters`` is not positive.
    """
    if size_in_meters <= 0:
        raise ValueError(f"size_in_meters must be positive, got {size_in_meters!r}")

    # Convert size in meters to degrees.
    # NOTE(review): 111 km per degree is the approximate value at the equator
    # and is applied to both axes. Away from the equator a degree of longitude
    # is shorter, so the cells are narrower in meters than size_in_meters.
    # Kept as-is because changing it would renumber every grid cell.
    size_in_degrees = size_in_meters / 111000

    # Overall bounding box of start and end locations. Taking min/max of the
    # coordinate columns avoids copying the whole frame into two GeoDataFrames
    # and building a point geometry per row just to read the bounds.
    longitudes = reservation_data[['startLon', 'endLon']]
    latitudes = reservation_data[['startLat', 'endLat']]
    minx, maxx = longitudes.min().min(), longitudes.max().max()
    miny, maxy = latitudes.min().min(), latitudes.max().max()

    # Lower-left corners of the cells along each axis.
    x_range = np.arange(minx, maxx, size_in_degrees)
    y_range = np.arange(miny, maxy, size_in_degrees)

    # When all points share one coordinate the extent is zero and np.arange
    # returns nothing; keep a single cell so the points are still covered.
    if x_range.size == 0:
        x_range = np.array([minx])
    if y_range.size == 0:
        y_range = np.array([miny])

    # Create the grid (x outer, y inner: the order defines the cell ids).
    grid = [
        box(x, y, x + size_in_degrees, y + size_in_degrees)
        for x in x_range
        for y in y_range
    ]

    # Create a GeoDataFrame from the grid
    grid_gdf = gpd.GeoDataFrame(grid, columns=['geometry'], crs='EPSG:4326')

    return grid_gdf

In [18]:
def map_to_grid(grid, reservation_data):
    """Attach to each reservation the id of the grid cell containing its start point.

    Parameters
    ----------
    grid : geopandas.GeoDataFrame
        Grid cells; the frame's index is used as the cell id.
    reservation_data : pandas.DataFrame
        Must provide the columns ``startLon`` and ``startLat``.

    Returns
    -------
    pandas.DataFrame
        The input rows (same order of columns, same index) plus a ``geometry``
        column holding the start point and a ``grid_id`` column. ``grid_id``
        is NaN for start points that fall in no cell (left join).
    """
    # Temporary positional marker so that rows can be de-duplicated after the
    # join even if the caller's index is not unique.
    row_marker = '__map_to_grid_row__'
    tagged = reservation_data.assign(**{row_marker: range(len(reservation_data))})

    # Convert the reservations data to a GeoDataFrame of start points
    reservations_gdf = gpd.GeoDataFrame(
        tagged,
        geometry=gpd.points_from_xy(tagged.startLon, tagged.startLat),
        crs='EPSG:4326',
    )

    # Map the start locations to the grid
    reservations_gdf = reservations_gdf.sjoin(grid, how='left')

    # sjoin matches with the 'intersects' predicate by default, so a point
    # lying exactly on an edge shared by neighbouring cells matches all of
    # them and would appear several times. Keep only the first match returned
    # for each input row so the number of reservations is preserved.
    reservations_gdf = (
        reservations_gdf
        .drop_duplicates(subset=row_marker, keep='first')
        .drop(columns=row_marker)
    )
    reservations_gdf = reservations_gdf.rename(columns={'index_right': 'grid_id'})

    return pd.DataFrame(reservations_gdf)

In [19]:
# Build both regular grids over the same set of reservations.
grid_small = create_grid(small_grid_size, reservations)
grid_large = create_grid(large_grid_size, reservations)

# Small grid: copy the generic 'grid_id' result into its own column, then
# discard the generic column so the next mapping can reuse the name.
reservations = (
    map_to_grid(grid_small, reservations)
    .assign(small_grid_id=lambda frame: frame['grid_id'])
    .drop(columns=['grid_id'])
)

# Large grid: same procedure; the helper 'geometry' column is no longer
# needed afterwards and is dropped as well.
reservations = (
    map_to_grid(grid_large, reservations)
    .assign(large_grid_id=lambda frame: frame['grid_id'])
    .drop(columns=['grid_id', 'geometry'])
)
reservations.head()

Unnamed: 0,startTime,endTime,startLat,startLon,endLat,endLon,voronoi_grid_id,temperature,precipitation,small_grid_id,large_grid_id
0,2017-10-14 14:15:35,2017-10-14 14:38:17,48.782722,9.181855,48.790642,9.171309,58,20.8,0.0,4549,178
1,2017-10-14 14:22:14,2017-10-14 14:51:56,48.771416,9.15748,48.771828,9.156966,543,20.8,0.0,3353,126
2,2017-10-14 14:24:42,2017-10-14 14:38:52,48.774971,9.129474,48.778435,9.127503,781,20.8,0.0,2080,76
3,2017-10-14 14:25:34,2017-10-14 14:42:13,48.785652,9.159463,48.793102,9.159768,658,20.8,0.0,3446,145
4,2017-10-14 14:25:54,2017-10-14 14:44:53,48.784927,9.153793,48.784882,9.206255,487,20.8,0.0,3190,128


# Community Clustering

In [20]:
def create_communities(reservation_data, grid_column_name):
    """Cluster grid cells into communities based on when reservations start in them.

    For every distinct value of ``grid_column_name`` the start times are
    summarised (mean and standard deviation of hour, day of month, month and
    weekday, plus the number of reservations), the summaries are standardised
    and then clustered with K-Means.

    Reads the module-level settings ``num_of_clusters`` and ``random_state``.

    Parameters
    ----------
    reservation_data : pandas.DataFrame
        Must provide ``grid_column_name`` and a ``startTime`` column that
        supports the ``.dt`` accessor (datetime-like).
    grid_column_name : str
        Name of the column holding the grid cell id to cluster on.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``grid_id`` (the distinct cell ids) and
        ``community_<grid_column_name>`` (the cluster label of each cell).
    """
    cc_reservation_data = reservation_data[[grid_column_name, 'startTime']].copy()
    cc_reservation_data['hour'] = cc_reservation_data['startTime'].dt.hour
    cc_reservation_data['day'] = cc_reservation_data['startTime'].dt.day
    cc_reservation_data['month'] = cc_reservation_data['startTime'].dt.month
    cc_reservation_data['year'] = cc_reservation_data['startTime'].dt.year
    cc_reservation_data['weekday'] = cc_reservation_data['startTime'].dt.weekday

    # One row of temporal statistics per grid cell. 'year' is only counted,
    # which yields the number of reservations starting in the cell.
    cc_data_agg = cc_reservation_data.groupby(grid_column_name).agg({
        'hour': ['mean', 'std'],
        'day': ['mean', 'std'],
        'month': ['mean', 'std'],
        'year': 'count',
        'weekday': ['mean', 'std']
    }).reset_index()
    cc_data_agg.columns = ['grid_id', 'hour_mean', 'hour_std', 'day_mean', 'day_std',
                        'month_mean', 'month_std', 'year_count', 'weekday_mean', 'weekday_std']
    # The standard deviation is NaN for cells with a single reservation.
    cc_data_agg.fillna(0, inplace=True)

    features = cc_data_agg.drop('grid_id', axis=1)

    # Normalise the features so that no single statistic dominates the distances
    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(features)

    # K-Means clustering. K-Means cannot fit more clusters than samples, so the
    # requested count is capped at the number of grid cells for small inputs.
    n_clusters = min(num_of_clusters, len(cc_data_agg))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init='auto')
    cc_data_agg['community_'+grid_column_name] = kmeans.fit_predict(x_scaled)

    return cc_data_agg[['grid_id', 'community_'+grid_column_name]]

In [21]:
# Attach a community label per reservation, first for the small grid and then
# for the Voronoi grid. Each lookup table carries its ids in 'grid_id', so the
# second merge yields the suffixed columns 'grid_id_x' and 'grid_id_y'.
for grid_column in ('small_grid_id', 'voronoi_grid_id'):
    community_lookup = create_communities(reservations, grid_column)
    reservations = reservations.merge(
        community_lookup,
        left_on=grid_column,
        right_on='grid_id',
        how='left',
    )

# The join keys from the lookup tables duplicate the existing id columns.
reservations = reservations.drop(columns=['grid_id_x', 'grid_id_y'])
reservations.head()

Unnamed: 0,startTime,endTime,startLat,startLon,endLat,endLon,voronoi_grid_id,temperature,precipitation,small_grid_id,large_grid_id,community_small_grid_id,community_voronoi_grid_id
0,2017-10-14 14:15:35,2017-10-14 14:38:17,48.782722,9.181855,48.790642,9.171309,58,20.8,0.0,4549,178,12,16
1,2017-10-14 14:22:14,2017-10-14 14:51:56,48.771416,9.15748,48.771828,9.156966,543,20.8,0.0,3353,126,12,4
2,2017-10-14 14:24:42,2017-10-14 14:38:52,48.774971,9.129474,48.778435,9.127503,781,20.8,0.0,2080,76,13,7
3,2017-10-14 14:25:34,2017-10-14 14:42:13,48.785652,9.159463,48.793102,9.159768,658,20.8,0.0,3446,145,13,6
4,2017-10-14 14:25:54,2017-10-14 14:44:53,48.784927,9.153793,48.784882,9.206255,487,20.8,0.0,3190,128,11,10


In [22]:
# Shuffle once with a fixed seed, then cut the shuffled frame by position:
# [0, train_end) -> training, [train_end, test_end) -> testing, rest -> validation.
# Positional slicing replaces np.split, which on a DataFrame relies on the
# deprecated DataFrame.swapaxes and emits a FutureWarning.
shuffled_reservations = reservations.sample(frac=1, random_state=random_state)
train_end = int(training_size * len(reservations))
test_end = int((training_size + testing_size) * len(reservations))

reservations_train = shuffled_reservations.iloc[:train_end]
reservations_test = shuffled_reservations.iloc[train_end:test_end]
reservations_val = shuffled_reservations.iloc[test_end:]

# The three parts must cover every reservation exactly once.
assert len(reservations_train) + len(reservations_test) + len(reservations_val) == len(reservations)

  return bound(*args, **kwds)


In [23]:
# Persist each split to its configured pickle path.
for split_frame, output_path in (
    (reservations_train, train_output_file_path),
    (reservations_test, test_output_file_path),
    (reservations_val, val_output_file_path),
):
    split_frame.to_pickle(output_path)