In [None]:
%pip install srai[torch]

In [None]:
import geopandas as gpd
import pandas as pd
import zipfile

from tqdm import tqdm
import pydeck as pdk
import h3

In [None]:
# Unpack the trips archive next to itself, so the CSV lands in the directory
# the loading cell reads from ('../../data/trips_hexes.csv').
# NOTE(review): assumes the CSV sits at the archive root - confirm.
with zipfile.ZipFile('../../data/trips_hexes.zip', "r") as zf:
    for member in tqdm(zf.infolist(), desc=""):
        try:
            zf.extract(member, '../../data')
        except zipfile.BadZipFile as exc:
            # Best effort: keep extracting the remaining members, but report the
            # failure instead of silently hiding a corrupted entry.
            tqdm.write(f"Skipping {member.filename}: {exc}")

In [None]:
# Load the trips table (the later cells use its start_hex, end_hex and trips columns).
taxi_trips_h3 = pd.read_csv('../../data/trips_hexes.csv')
# Column-wise minimum and maximum, printed as a quick sanity check.
print(taxi_trips_h3.min(), taxi_trips_h3.max())
taxi_trips_h3

In [None]:
# H3 resolution of the first start cell.
h3.get_resolution(taxi_trips_h3['start_hex'].iloc[0])

In [None]:
# Histograms of the trips table columns.
taxi_trips_h3.hist()

In [None]:
# Resolve both trip endpoints from H3 cell ids to (lat, lng) tuples.
start_cells = taxi_trips_h3['start_hex']
end_cells = taxi_trips_h3['end_hex']
taxi_trips_h3['start_point'] = start_cells.map(h3.cell_to_latlng)
taxi_trips_h3['end_point'] = end_cells.map(h3.cell_to_latlng)
taxi_trips_h3

In [None]:
# Unzip the (lat, lng) tuples into separate latitude / longitude columns.
taxi_trips_h3['start_lat'], taxi_trips_h3['start_lon'] = zip(*taxi_trips_h3['start_point'])
taxi_trips_h3['end_lat'], taxi_trips_h3['end_lon'] = zip(*taxi_trips_h3['end_point'])
taxi_trips_h3

In [None]:
# Min-max scale the trip counts to [0, 1]; the arc layer uses this for width and opacity.
trip_counts = taxi_trips_h3["trips"]
taxi_trips_h3["trips_normalized"] = trip_counts.sub(trip_counts.min()).div(
    trip_counts.max() - trip_counts.min()
)

In [None]:
# Arcs from trip start (green) to trip end (blue); width and alpha scale with
# the normalised trip count.
# Tip: pass taxi_trips_h3.sample(frac=0.1) as `data` for a lighter render.
arc_layer_options = dict(
    data=taxi_trips_h3,
    get_width="0.5 + trips_normalized * 9",
    get_source_position=["start_lon", "start_lat"],
    get_target_position=["end_lon", "end_lat"],
    get_tilt=15,
    get_source_color="[0, 255, 0, 40 + trips_normalized * 215]",
    get_target_color="[0, 150, 255, 40 + trips_normalized * 215]",
    pickable=True,
    auto_highlight=True,
)
arc_layer = pdk.Layer("ArcLayer", **arc_layer_options)

# Initial camera position for the map.
view_state = pdk.ViewState(
    latitude=41.1493,
    longitude=-8.6111,
    bearing=45,
    pitch=65,
    zoom=10.5,
)

TOOLTIP_TEXT = {"html": "{trips} trips <br /> Start of the trip in green; end of the trip in blue"}
pdk.Deck(arc_layer, initial_view_state=view_state, tooltip=TOOLTIP_TEXT)


In [None]:
# Every H3 cell that appears as a trip start or a trip end.
unique_hexes = set(taxi_trips_h3['start_hex'].unique()).union(taxi_trips_h3['end_hex'].unique())
len(unique_hexes)

In [None]:
# h3 returns (lat, lng); swap to (lng, lat), i.e. x/y order, for geometry construction.
coordinates = [(lng, lat) for lat, lng in map(h3.cell_to_latlng, unique_hexes)]
coordinates[:10]

In [None]:
# One WGS84 point per unique hexagon, built from the (lng, lat) pairs.
point_xs, point_ys = zip(*coordinates)
unique_points = gpd.GeoDataFrame(
    geometry=gpd.GeoSeries.from_xy(point_xs, point_ys),
    crs='EPSG:4326',
)
unique_points.explore()

In [None]:
from srai.regionalizers import AdministrativeBoundaryRegionalizer

In [None]:
# Administrative boundaries at admin_level=7, kept unclipped (clip_regions=False).
portugal_regionalizer = AdministrativeBoundaryRegionalizer(admin_level=7, clip_regions=False)

# Regions produced by the regionalizer for the trip hexagon points.
municipalities = portugal_regionalizer.transform(unique_points)
municipalities

In [None]:
# Interactive map of the administrative regions.
municipalities.explore()

In [None]:
# H3 resolution of the trips data, read from the first start cell; reused for every regionalization below.
trip_h3_resolution = h3.get_resolution(taxi_trips_h3['start_hex'].iloc[0])
trip_h3_resolution

In [None]:
# Ring distance shared by the H3 region buffer and the GeoVex neighbourhood radius.
neighbours_distance = 5

In [None]:
from srai.regionalizers import H3Regionalizer

In [None]:
# Cover the administrative regions with H3 cells at the trips' resolution.
portugal_h3_regions = H3Regionalizer(resolution=trip_h3_resolution).transform(municipalities)
portugal_h3_regions

In [None]:
from srai.loaders.osm_loaders import OSMPbfLoader
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.h3 import ring_buffer_h3_regions_gdf

In [None]:
# Extend the H3 regions by `neighbours_distance` rings.
# NOTE(review): presumably so border cells get a complete neighbourhood for the embedder - confirm.
buffered_regions = ring_buffer_h3_regions_gdf(regions_gdf=portugal_h3_regions, distance=neighbours_distance)
# Single geometry covering all buffered cells; used as the OSM download area.
buffered_area = buffered_regions.unary_union
gpd.GeoSeries([buffered_area], crs='EPSG:4326').explore()

In [None]:
# The loader instance is reused later for the Warsaw area.
loader = OSMPbfLoader()

# OSM features inside the buffered area, filtered with the GEOFABRIK_LAYERS tag set.
portugal_features = loader.load(buffered_area, GEOFABRIK_LAYERS)
portugal_features

In [None]:
# Static plot of every non-point feature geometry.
portugal_features[portugal_features.geom_type != 'Point'].plot()

In [None]:
from srai.joiners import IntersectionJoiner

In [None]:
# Join the buffered H3 regions with the OSM features they intersect.
portugal_joint_features = IntersectionJoiner().transform(buffered_regions, portugal_features)
portugal_joint_features

In [None]:
from srai.embedders import GeoVexEmbedder

In [None]:
# GeoVex embedder producing 50-dimensional region embeddings; its neighbourhood
# radius matches the ring buffer distance applied to the regions.
geovex_embedder = GeoVexEmbedder(
    target_features=GEOFABRIK_LAYERS,
    embedding_size=50,
    batch_size=128,
    neighbourhood_radius=neighbours_distance,
)

In [None]:
from srai.neighbourhoods import H3Neighbourhood

In [None]:
import torch

In [None]:
# Neighbourhood built over the buffered regions and handed to the embedder.
portugal_h3_neighbourhood = H3Neighbourhood(buffered_regions)
# Train the embedder on the buffered area and embed every buffered region.
portugal_embeddings = geovex_embedder.fit_transform(
    buffered_regions,
    portugal_features,
    portugal_joint_features,
    portugal_h3_neighbourhood,
    trainer_kwargs={
        "max_epochs": 5,
        "accelerator": (
            "cpu" if torch.backends.mps.is_available() else "auto"
        ),  # GeoVexEmbedder does not support MPS
    },
)
# Keep only the embeddings of the original (unbuffered) regions.
portugal_embeddings = portugal_embeddings.loc[portugal_h3_regions.index]
portugal_embeddings

In [None]:
from sklearn.decomposition import PCA

# Reduce the embeddings to three principal components so they can be shown as colours.
pca = PCA(n_components=3)

pca_embeddings = pca.fit_transform(portugal_embeddings)
# make the embeddings into a dataframe
pca_embeddings = pd.DataFrame(pca_embeddings, index=portugal_embeddings.index)

# convert to RGB: min-max scale each component to the 0-255 range
pca_embeddings = (
    (pca_embeddings - pca_embeddings.min())
    / (pca_embeddings.max() - pca_embeddings.min())
    * 255
).astype(int)

# make the rgb array into a string
pca_embeddings["rgb"] = pca_embeddings.apply(
    lambda row: f"rgb({row[0]}, {row[1]}, {row[2]})", axis=1
)

# Disabled alternative: restrict the map to regions around Porto.
# porto_regions = portugal_h3_regions[
#     portugal_h3_regions.intersects(
#         municipalities.loc[["Porto", "Vila Nova de Gaia", "Matosinhos"]].unary_union
#     )
# ]

# Positional row number -> rgb string; explore() colours by the positional "index"
# column and the cmap callable looks the colour up here.
color_dict = dict(
    enumerate(portugal_h3_regions.index.map(pca_embeddings["rgb"].to_dict()).to_list())
)
portugal_h3_regions.reset_index().reset_index().explore(
    column="index",
    tooltip="region_id",
    tiles="CartoDB positron",
    legend=False,
    cmap=lambda x: color_dict[x],
    style_kwds=dict(color="#444", opacity=0.0, fillOpacity=0.5),
)

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(14, 9))
ax = fig.add_subplot(111, projection='3d')

# One vectorised scatter call instead of one call (plus three row lookups) per
# hexagon: far fewer artists to create and render. Each point is coloured with
# its own PCA-derived RGB value (components are scaled to 0-255), matching the
# colours of the map above instead of arbitrary property-cycle colours.
point_colors = pca_embeddings[[0, 1, 2]].to_numpy() / 255
ax.scatter(
    pca_embeddings[0],
    pca_embeddings[1],
    pca_embeddings[2],
    c=point_colors,
    s=60,
)

ax.set_xlabel("PC1", fontsize=12)
ax.set_ylabel("PC2", fontsize=12)
ax.set_zlabel("PC3", fontsize=12)

ax.view_init(30, 125)
plt.title("3D PCA plot")
plt.show()

In [None]:
# Index (region ids) of the embedded, unbuffered Portuguese regions.
portugal_h3_index = portugal_embeddings.index

In [None]:
# Total number of trips starting in each hexagon, indexed by region id.
sum_of_trips_per_hex = (
    taxi_trips_h3.groupby('start_hex')['trips']
    .sum()
    .rename_axis('region_id')
)
sum_of_trips_per_hex

In [None]:
# Largest per-hexagon trip total.
sum_of_trips_per_hex.max()

In [None]:
# 99th percentile of the per-hexagon trip totals (used as the clipping bound in the next step).
sum_of_trips_per_hex.quantile(0.99)

In [None]:
# Cap outliers at the 99th percentile.
# NOTE(review): the series is overwritten, so re-running this cell clips again
# using the quantile of the already clipped values.
sum_of_trips_per_hex = sum_of_trips_per_hex.clip(0, sum_of_trips_per_hex.quantile(0.99))

In [None]:
# Distribution of the per-hexagon trip totals.
sum_of_trips_per_hex.hist()

In [None]:
from srai.plotting import plot_numeric_data
# Map of the trip totals per hexagon ("trips" is the name of the series, hence of the column).
plot_numeric_data(portugal_h3_regions, "trips", sum_of_trips_per_hex.to_frame())

In [None]:
from xgboost import XGBRegressor, XGBClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error

In [None]:
# Regression inputs: embeddings of the hexagons that have trip totals.
X_trips = portugal_embeddings.loc[sum_of_trips_per_hex.index].values
# Regression targets: the trip totals, in the same order as the inputs.
y_trips = list(sum_of_trips_per_hex.values)

In [None]:
# 80/20 train/test split with a fixed seed for reproducibility.
X_trips_train, X_trips_test, y_trips_train, y_trips_test = train_test_split(
    X_trips, y_trips, test_size=0.2, random_state=42
)

In [None]:
# Classification inputs: embeddings of every region; label 1 when the hexagon
# is the start of at least one recorded trip, 0 otherwise.
X_trips_binary = portugal_embeddings.loc[portugal_h3_index].to_numpy()
y_trips_binary = (
    portugal_h3_index.isin(sum_of_trips_per_hex.index).astype(int).tolist()
)

In [None]:
# 80/20 train/test split of the classification data with a fixed seed.
X_binary_train, X_binary_test, y_binary_train, y_binary_test = train_test_split(
    X_trips_binary, y_trips_binary, test_size=0.2, random_state=42
)

In [None]:
# Classifier predicting from a region embedding whether any trip starts there.
binary_classifier = XGBClassifier(n_estimators=1000)
binary_classifier.fit(X_binary_train, y_binary_train)
y_binary_pred = binary_classifier.predict(X_binary_test)

# Classification report on the held-out split.
print(classification_report(y_binary_test, y_binary_pred))

In [None]:
# Regressor predicting the trip total of a hexagon from its embedding.
trips_regressor = XGBRegressor(n_estimators=1000)
trips_regressor.fit(X_trips_train, y_trips_train)
y_trips_pred = trips_regressor.predict(X_trips_test)

# Held-out metrics, printed in this order: R^2, MAE, MAPE.
print(
    r2_score(y_trips_test, y_trips_pred),
    mean_absolute_error(y_trips_test, y_trips_pred),
    mean_absolute_percentage_error(y_trips_test, y_trips_pred),
)

In [None]:
# Predicted trip totals for every hexagon with recorded trips (train and test
# rows alike), re-indexed by region id so they can be drawn on the map.
predicted_trips_per_hex = pd.DataFrame(
    trips_regressor.predict(X_trips),
    index=pd.Index(sum_of_trips_per_hex.index, name='region_id'),
)
plot_numeric_data(portugal_h3_regions, 0, predicted_trips_per_hex)

In [None]:
# Ground truth flag: 1 where the hexagon is the start of a recorded trip, else 0.
has_recorded_trips = portugal_h3_regions.index.isin(sum_of_trips_per_hex.index)
portugal_h3_regions['trip_binary'] = has_recorded_trips.astype(int)
portugal_h3_regions.plot('trip_binary')

In [None]:
# Classifier output for every region, plotted for comparison with the 'trip_binary' column.
portugal_h3_regions['trip_binary_predicted'] = binary_classifier.predict(portugal_embeddings.loc[portugal_h3_regions.index].values)
portugal_h3_regions.plot('trip_binary_predicted')

In [None]:
from typing import Any, List

from torch import nn
from pytorch_lightning import LightningModule

def weighted_mse_loss(input, target, weight):
    """
    Weighted sum of squared errors.

    Args:
        input (torch.Tensor): Predicted values.
        target (torch.Tensor): Expected values.
        weight (torch.Tensor): Multiplier applied to each squared error; must
            broadcast against `input - target`.

    Returns:
        torch.Tensor: Scalar tensor with the sum (not the mean) of the weighted squared errors.
    """
    squared_error = (input - target).pow(2)
    return (weight * squared_error).sum()

class TripPredictorModel(LightningModule):
    """
    Feed-forward network mapping a trip start embedding to a destination embedding.

    The network is three linear layers of equal width with sigmoid activations
    in between. It is trained with `weighted_mse_loss` on (start, end, weight)
    batches.
    """

    def __init__(self, embedding_size: int = 50) -> None:
        """
        Build the network.

        Args:
            embedding_size (int, optional): Width of the input, hidden and output
                layers. Must equal the size of the embeddings fed to the model.
                Defaults to 50.
        """
        super().__init__()
        self.nn_model = nn.Sequential(
            nn.Linear(embedding_size, embedding_size),
            nn.Sigmoid(),
            nn.Linear(embedding_size, embedding_size),
            nn.Sigmoid(),
            nn.Linear(embedding_size, embedding_size),
        )

    def forward(self, x: "torch.Tensor") -> "torch.Tensor":
        """
        Forward pass.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output of the network for `x`.
        """
        embedding: "torch.Tensor" = self.nn_model(x)
        return embedding

    def configure_optimizers(self) -> "torch.optim.Optimizer":
        """Configure optimizer (Adam with a learning rate of 1e-3)."""

        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

    def training_step(self, batch: List["torch.Tensor"], batch_idx: Any) -> "torch.Tensor":
        """
        Training step.

        Args:
            batch (List[torch.Tensor]): Triple of inputs, targets and per-sample weights.
            batch_idx (Any): Batch index.

        Returns:
            torch.Tensor: Weighted squared-error loss of the batch.
        """
        x, y, weight = batch
        y_pred = self.nn_model(x)
        loss = weighted_mse_loss(y_pred, y, weight)
        self.log("train_loss", loss, prog_bar=True)
        return loss

In [None]:
from torch.utils.data import Dataset


class TripsDataset(Dataset):
    """
    Dataset of (start embedding, end embedding, trip count) samples.

    Row `i` of `start_embeddings`, row `i` of `end_embeddings` and element `i`
    of `trips` describe the same start/end pair.
    """

    def __init__(self, start_embeddings, end_embeddings, trips):
        """Convert the inputs to tensors; `trips` is stored as a column of shape (len(trips), 1)."""
        trips_column = trips.reshape(len(trips), 1)
        self.start_embeddings = torch.Tensor(start_embeddings)
        self.end_embeddings = torch.Tensor(end_embeddings)
        self.trips = torch.Tensor(trips_column)

    def __getitem__(self, index):
        """Return the (start embedding, end embedding, weight) triple at `index`."""
        return (
            self.start_embeddings[index],
            self.end_embeddings[index],
            self.trips[index],
        )

    def __len__(self):
        """Number of samples, i.e. rows of the start embeddings."""
        return self.start_embeddings.shape[0]

In [None]:
# Per-trip-pair columns: start cell, end cell and number of trips between them.
start_hexes = taxi_trips_h3['start_hex']
end_hexes = taxi_trips_h3['end_hex']
no_trips = taxi_trips_h3['trips']

# Look up the embedding of each start / end cell; the trip count becomes the sample weight.
trips_dataset = TripsDataset(
    start_embeddings=portugal_embeddings.loc[start_hexes].to_numpy(),
    end_embeddings=portugal_embeddings.loc[end_hexes].to_numpy(),
    trips=no_trips.to_numpy(),
)

In [None]:
from torch.utils.data import DataLoader
import pytorch_lightning as pl

# Keyword arguments forwarded to pl.Trainer.
trainer_kwargs = {
    # "max_epochs": 50, # uncomment for a longer training
    "max_epochs": 25,
    # "accelerator": "cpu",
}

# Shuffled mini-batches of 128 trip pairs, loaded in the main process (num_workers=0).
dataloader = DataLoader(trips_dataset, batch_size=128, shuffle=True, num_workers=0)
trip_predictor_model = TripPredictorModel()



In [None]:
# Train the destination-embedding predictor on the trip pairs.
trainer = pl.Trainer(**trainer_kwargs)
trainer.fit(trip_predictor_model, dataloader)

In [None]:
from annoy import AnnoyIndex

In [None]:
def _add_arc_columns(trips_df):
    """Add the coordinate and normalised-trip columns consumed by the ArcLayer cells."""
    trips_df["start_point"] = trips_df["start_hex"].apply(h3.cell_to_latlng)
    trips_df["end_point"] = trips_df["end_hex"].apply(h3.cell_to_latlng)
    trips_df["start_lat"], trips_df["start_lon"] = zip(*trips_df["start_point"])
    trips_df["end_lat"], trips_df["end_lon"] = zip(*trips_df["end_point"])

    trips_min = trips_df["trips"].min()
    trips_span = trips_df["trips"].max() - trips_min
    if trips_span > 0:
        trips_df["trips_normalized"] = (trips_df["trips"] - trips_min) / trips_span
    else:
        # All values are equal: avoid a 0/0 division that would fill the column with NaN.
        trips_df["trips_normalized"] = 0.0
    return trips_df


def generate_trips(
    h3_embeddings_gdf,
    binary_classifier,
    trips_regressor,
    trip_destination_predictor,
    n=100,
):
    """
    Generate synthetic trips between H3 cells from their embeddings.

    For every cell the classifier marks as a trip origin, the regressor's trip
    total is distributed over the `n` cells whose embeddings are nearest to the
    predicted destination embedding.

    Args:
        h3_embeddings_gdf: Frame indexed by H3 cell id, with one embedding per row.
        binary_classifier: Fitted model; `predict` returns 0 for cells without trips.
        trips_regressor: Fitted model; `predict` returns the number of trips per cell
            (negative predictions are clipped to 0).
        trip_destination_predictor: Callable mapping a tensor of embeddings to a
            tensor of predicted destination embeddings.
        n (int, optional): Number of nearest destination candidates per origin.
            Defaults to 100.

    Returns:
        pd.DataFrame: One row per (start_hex, end_hex) pair with the columns
        start_hex, end_hex, trips, start_point, end_point, start_lat, start_lon,
        end_lat, end_lon and trips_normalized. Empty (with the same columns)
        when no trips are predicted.
    """
    h3_index = h3_embeddings_gdf.index
    h3_embeddings = h3_embeddings_gdf.values
    predicted_trips_binary = binary_classifier.predict(h3_embeddings)
    predicted_trips_number = trips_regressor.predict(h3_embeddings).clip(min=0)
    # .cpu() keeps the conversion to numpy valid even if the model output lives on an accelerator.
    predicted_destinations_embeddings = (
        trip_destination_predictor(torch.Tensor(h3_embeddings)).detach().cpu().numpy()
    )

    # Nearest-neighbour index over the candidate destination embeddings.
    annoy_index = AnnoyIndex(h3_embeddings.shape[1], "angular")
    for idx in range(len(h3_index)):
        annoy_index.add_item(idx, h3_embeddings[idx])
    annoy_index.build(100, n_jobs=-1)

    trip_pairs = []
    for idx in tqdm(range(len(h3_index)), total=len(h3_index)):
        trips_number = predicted_trips_number[idx]
        if predicted_trips_binary[idx] == 0 or trips_number <= 0:
            continue

        nearest_neighbours_ids, distances = annoy_index.get_nns_by_vector(
            predicted_destinations_embeddings[idx],
            n=n,
            include_distances=True,
        )
        nearest_neighbours_h3s = h3_index[nearest_neighbours_ids]

        # Annoy's angular distance ranges from 0 to 2, so `1 - distance` can be
        # negative; clamp it so no destination receives a negative share.
        weights = [max(0.0, 1 - distance) for distance in distances]
        total_weight = sum(weights)
        if total_weight <= 0:
            # Every candidate is farther than 1: split the trips evenly instead
            # of dividing by zero.
            weights = [1.0] * len(distances)
            total_weight = float(len(distances))

        start_index = h3_index[idx]
        for nearest_neighbour, weight in zip(nearest_neighbours_h3s, weights):
            trip_pairs.append(
                dict(
                    start_hex=start_index,
                    end_hex=nearest_neighbour,
                    trips=trips_number * weight / total_weight,
                )
            )

    if not trip_pairs:
        # Nothing predicted: return an empty frame with the expected schema
        # instead of failing on the missing columns.
        return pd.DataFrame(
            columns=[
                "start_hex",
                "end_hex",
                "trips",
                "start_point",
                "end_point",
                "start_lat",
                "start_lon",
                "end_lat",
                "end_lon",
                "trips_normalized",
            ]
        )

    return _add_arc_columns(pd.DataFrame(trip_pairs))
    

In [None]:
# Synthetic trips for the Portuguese regions, keeping the 10 nearest destination candidates per origin.
predicted_trips = generate_trips(portugal_embeddings, binary_classifier, trips_regressor, trip_predictor_model, n=10)
predicted_trips

In [None]:
# Same arc styling as for the observed trips, applied to the predicted ones.
arc_layer_options = dict(
    data=predicted_trips,
    get_width="0.5 + trips_normalized * 9",
    get_source_position=["start_lon", "start_lat"],
    get_target_position=["end_lon", "end_lat"],
    get_tilt=15,
    get_source_color="[0, 255, 0, 40 + trips_normalized * 215]",
    get_target_color="[0, 150, 255, 40 + trips_normalized * 215]",
    pickable=True,
    auto_highlight=True,
)
arc_layer = pdk.Layer("ArcLayer", **arc_layer_options)

# Initial camera position for the map.
view_state = pdk.ViewState(
    latitude=41.1493,
    longitude=-8.6111,
    bearing=45,
    pitch=65,
    zoom=10.5,
)

TOOLTIP_TEXT = {"html": "Predicted trips {trips} <br /> Start of the trip in green; end of the trip in blue"}
pdk.Deck(arc_layer, initial_view_state=view_state, tooltip=TOOLTIP_TEXT)

In [None]:
from srai.regionalizers import geocode_to_region_gdf

In [None]:
# Apply the models trained on the Portuguese data to Warsaw, repeating the same
# preparation steps: regionalize, buffer, load OSM features, join, embed.
warsaw_region = geocode_to_region_gdf("Warsaw, PL")
warsaw_h3_regions = H3Regionalizer(resolution=trip_h3_resolution).transform(warsaw_region)

# Buffer by the same ring distance that was used when fitting the embedder.
buffered_warsaw_regions = ring_buffer_h3_regions_gdf(regions_gdf=warsaw_h3_regions, distance=neighbours_distance)
buffered_warsaw_area = buffered_warsaw_regions.unary_union

warsaw_features = loader.load(buffered_warsaw_area, GEOFABRIK_LAYERS)

warsaw_joint_features = IntersectionJoiner().transform(buffered_warsaw_regions, warsaw_features)

# transform() only: the already fitted embedder is reused, not retrained.
warsaw_embeddings = geovex_embedder.transform(
    buffered_warsaw_regions,
    warsaw_features,
    warsaw_joint_features,
)
# Keep only the embeddings of the unbuffered Warsaw regions.
warsaw_embeddings = warsaw_embeddings.loc[warsaw_h3_regions.index]

predicted_warsaw_trips = generate_trips(warsaw_embeddings, binary_classifier, trips_regressor, trip_predictor_model, n=10)
predicted_warsaw_trips

In [None]:
# Arc map of the trips predicted for Warsaw.
arc_layer_options = dict(
    data=predicted_warsaw_trips,
    get_width="0.5 + trips_normalized * 9",
    get_source_position=["start_lon", "start_lat"],
    get_target_position=["end_lon", "end_lat"],
    get_tilt=15,
    get_source_color="[0, 255, 0, 40 + trips_normalized * 215]",
    get_target_color="[0, 150, 255, 40 + trips_normalized * 215]",
    pickable=True,
    auto_highlight=True,
)
arc_layer = pdk.Layer("ArcLayer", **arc_layer_options)

# Initial camera position for the map.
view_state = pdk.ViewState(
    latitude=52.2317,
    longitude=21.0064,
    bearing=45,
    pitch=65,
    zoom=10.5,
)

TOOLTIP_TEXT = {"html": "Predicted trips {trips} <br /> Start of the trip in green; end of the trip in blue"}
pdk.Deck(arc_layer, initial_view_state=view_state, tooltip=TOOLTIP_TEXT)