In [None]:
import geopandas as gpd
import pandas as pd

# Part 5
## Bicycle sharing system stations - analysis and transfer learning

### Task 1

Load the dataset with *Veturilo* bike stations (`data/veturilo_stations.json`) and convert the *lat/lon* columns into a geometry column. Save the result to the `stations_gdf` variable.

In [None]:
data_path = '../../data/veturilo_stations.json'

stations_gdf = ...

### BEGIN SOLUTION
# Read the raw station records, then turn the `lon`/`lat` columns into point
# geometries and attach them as the geometry column (CRS: EPSG:4326).
stations_raw = pd.read_json(data_path)
station_points = gpd.points_from_xy(x=stations_raw["lon"], y=stations_raw["lat"])
stations_gdf = gpd.GeoDataFrame(stations_raw, geometry=station_points, crs="EPSG:4326")
### END SOLUTION

stations_gdf.head()

Download the area of Warsaw in preparation for downloading the features, and visualize the station locations on the map.

In [None]:
from srai.regionalizers import geocode_to_region_gdf

# Geocode the city boundary once; later cells use it for regionalization
# and for the OSM feature download.
warsaw_region = geocode_to_region_gdf("Warsaw, PL")

# Draw the boundary as a translucent, non-interactive base layer,
# then overlay the stations in red on the same map.
boundary_style = {"fillOpacity": 0.3}
m = warsaw_region.explore(style_kwds=boundary_style, tooltip=False, highlight=False)
stations_gdf.explore(m=m, color="red")

### Task 2
Split the area of Warsaw into regions, for which we will be predicting station locations.

In this example we use the H3 hierarchical index and split the area into hexagons at resolution 9 (approx 500m in diameter)

In [None]:
from srai.plotting import plot_regions
from srai.regionalizers import H3Regionalizer

regions_gdf = ...

### BEGIN SOLUTION
# Split the Warsaw boundary into H3 regions at resolution 9.
h3_regionalizer = H3Regionalizer(resolution=9)
regions_gdf = h3_regionalizer.transform(warsaw_region)
### END SOLUTION

plot_regions(regions_gdf)

### Task 3

Download the OSM tags which will be used to predict bicycle station locations. For this case, `OSMPbfLoader` works best.

We recommend the predefined `GEOFABRIK_LAYERS` filter, since it covers a wide range of different tags. But be honest and remove the `{"shopping": "amenity=bicycle_rental"}` tag ;)

In [None]:
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS
from srai.loaders import OSMPbfLoader

features_gdf = ...

### BEGIN SOLUTION
# Load every feature matching the GEOFABRIK_LAYERS filter inside the Warsaw boundary.
osm_loader = OSMPbfLoader()
features_gdf = osm_loader.load(warsaw_region, GEOFABRIK_LAYERS)

# Drop the bicycle rental features so the prediction target is not part of the inputs.
is_bike_rental = features_gdf["shopping"] == "amenity=bicycle_rental"
features_gdf = features_gdf[~is_bike_rental]
### END SOLUTION

features_gdf.head()

Our features have not been associated with regions yet. We can use an *intersects* predicate and associate them with regions.

In [None]:
from srai.joiners import IntersectionJoiner

# Associate the regions with the OSM features they intersect.
feature_joiner = IntersectionJoiner()
joined_features = feature_joiner.transform(regions_gdf, features_gdf)
joined_features

### Task 4

We have already associated OSM features with regions. To train our model we have to join station locations with regions as well. Write code that finds the regions intersecting with station locations. Use this information to select positive and negative samples for classifier training (regions with and without stations). Remember that we will have to train a model on this data, so make sure to do any necessary undersampling to balance our training data.

In [None]:
positive_samples = ...
negative_samples = ...

### BEGIN SOLUTION
# Number of negative regions drawn per positive region, and the sampling seed.
NEGATIVE_TO_POSITIVE_RATIO = 3
SAMPLING_SEED = 42

# First, join bike stations locations with regions, using `IntersectionJoiner`
bikes_joint = IntersectionJoiner().transform(regions_gdf, stations_gdf)

# A region containing several stations appears once per station in the joint
# index, so deduplicate the region ids (sorted, to keep a stable row order).
# Selecting the rows straight from `regions_gdf` keeps the geometry column
# needed for the map below.
station_region_ids = bikes_joint.index.get_level_values("region_id").unique().sort_values()
positive_samples = regions_gdf.loc[station_region_ids].copy()
positive_samples["is_positive"] = True

# Every remaining region is a negative candidate
negative_samples = regions_gdf.drop(index=station_region_ids).copy()
negative_samples["is_positive"] = False

# Undersample the negatives to reduce class imbalance. The requested count is
# capped at the number of available negatives, because `sample` raises a
# ValueError when asked for more rows than exist (sampling without replacement).
n_negative = min(NEGATIVE_TO_POSITIVE_RATIO * len(positive_samples), len(negative_samples))
negative_samples = negative_samples.sample(n=n_negative, random_state=SAMPLING_SEED)
### END SOLUTION


train_data = pd.concat([positive_samples, negative_samples])
train_data.explore("is_positive", cmap="cividis", zoom_start=13, tiles="CartoDB positron")

Let's create embeddings for each region in our city (embeddings for regions outside of the training data will be used for visualizations). These will serve as our *Xs* for training, and the *Ys* will be binary values indicating whether a station is in the region or not.

In [None]:
from srai.embedders import ContextualCountEmbedder
from srai.neighbourhoods import H3Neighbourhood

# Configure the embedder: H3 neighbourhood, neighbourhood distance 5,
# concatenated vectors, output features fixed to the GEOFABRIK_LAYERS filter.
embedder = ContextualCountEmbedder(
    expected_output_features=GEOFABRIK_LAYERS,
    concatenate_vectors=True,
    neighbourhood_distance=5,
    neighbourhood=H3Neighbourhood(),
)

# Embed every region of the city, not only the training regions.
embeddings = embedder.transform(
    regions_gdf=regions_gdf,
    features_gdf=features_gdf,
    joint_gdf=joined_features,
)

# Inputs: embedding rows selected by the training regions' index.
# Labels: `is_positive` as 0/1 integers.
train_embeddings = embeddings.loc[train_data.index]
X = train_embeddings.to_numpy()
Y = train_data["is_positive"].astype(int).to_numpy()

### Task 5

Select your favourite model and train a classifier for station locations

In [None]:
from sklearn.metrics import classification_report

### BEGIN SOLUTION
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# One seed shared by the split and the classifier so results repeat across runs.
MODEL_SEED = 42

# Hold out 20% of the samples, keeping the class proportions in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=MODEL_SEED
)

# `probability=True` enables `predict_proba`. scikit-learn shuffles the data
# when fitting those probability estimates, so `random_state` has to be set
# for the predicted probabilities to be reproducible.
classifier = SVC(probability=True, random_state=MODEL_SEED)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
Y_pred_proba = classifier.predict_proba(X_test)
### END SOLUTION


print(classification_report(Y_test, Y_pred))

### Task 6

Run predictions for all regions and prepare visualization on the map

In [None]:
from srai.plotting import plot_numeric_data

### BEGIN SOLUTION
# Score every Warsaw region with the trained classifier.
all_region_features = embeddings.to_numpy()
station_probas = classifier.predict_proba(all_region_features)

# Column 1 holds the probability of class 1 (a station is present).
# NOTE(review): the assignment is positional, so it assumes `embeddings` rows
# are in the same order as `regions_gdf` - confirm.
regions_gdf["station_proba"] = station_probas[:, 1]

# Probability map with the existing stations drawn on top in black.
m = plot_numeric_data(regions_gdf, "station_proba", colormap="Spectral_r", opacity=0.5)
stations_gdf.explore(m=m, color='black')
### END SOLUTION


### Final task - transfer learning

Now we have a model trained on data from Warsaw. Select some other city and run predictions on it. Let's see where to put bicycle sharing system (BSS) stations there.

In [None]:
### BEGIN SOLUTION

# Boundary of the target city
wroclaw_region = geocode_to_region_gdf('Wrocław, PL')

# Split it into H3 regions at the same resolution as the training regions
wroclaw_regionalizer = H3Regionalizer(resolution=9)
wroclaw_regions_gdf = wroclaw_regionalizer.transform(wroclaw_region)

# Load the same OSM layers as for training, then separate the bicycle rental
# features: they are kept only for the final map and excluded from the model inputs.
wroclaw_osm_gdf = OSMPbfLoader().load(wroclaw_region, GEOFABRIK_LAYERS)
wroclaw_is_rental = wroclaw_osm_gdf["shopping"] == "amenity=bicycle_rental"
wroclaw_stations = wroclaw_osm_gdf[wroclaw_is_rental]
wroclaw_features_gdf = wroclaw_osm_gdf[~wroclaw_is_rental]

# Embed the regions with the embedder configured for training
wroclaw_joiner = IntersectionJoiner()
wroclaw_joined_features = wroclaw_joiner.transform(wroclaw_regions_gdf, wroclaw_features_gdf)
wroclaw_embeddings = embedder.transform(
    joint_gdf=wroclaw_joined_features,
    features_gdf=wroclaw_features_gdf,
    regions_gdf=wroclaw_regions_gdf,
)

# Score every region with the Warsaw-trained classifier; column 1 is the
# probability of class 1 (a station is present).
# NOTE(review): the assignment is positional, so it assumes `wroclaw_embeddings`
# rows are in the same order as `wroclaw_regions_gdf` - confirm.
station_probas_wro = classifier.predict_proba(wroclaw_embeddings.to_numpy())
wroclaw_regions_gdf["station_proba"] = station_probas_wro[:, 1]

# Probability map with the bicycle rental features from OSM drawn on top in black
m = plot_numeric_data(wroclaw_regions_gdf, "station_proba", colormap="Spectral_r", opacity=0.5)
wroclaw_stations.explore(m=m, color='black')

### END SOLUTION
