Author: Luca Pappalardo
<br>Geospatial Analytics, Master's degree in Data Science and Business Informatics, University of Pisa

# Geospatial Analytics - Lesson 7: Flow generation

## Train a Gravity model in San Francisco to predict flows in Denver

1. Download checkin data and create a `TrajDataFrame`
1. Create square tessellations for the two cities
1. Compute the relevance of each tile
1. Create `FlowDataFrame`s aggregating `TrajDataFrame`s 
5. Compute the total number of trips from each tile
6. Fit a singly-constrained Gravity model using trips in San Francisco
7. Use the fitted model to predict flows in Denver


8. **Qualitative evaluation**: visualise the performance of the model against a baseline random model
9. **Quantitative evaluation**: compute performance metrics

## The Gravity model of human mobility

$T_{ij} \propto \frac{pop_i^{\alpha_1} \cdot pop_j^{\alpha_2}}{r_{ij}^\beta}$

In [None]:
import warnings
# Silence all warnings for the rest of the session.
# NOTE: this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
import skmob
from skmob.utils import utils, constants
from skmob.tessellation import tilers
from skmob.utils.plot import plot_gdf

import numpy as np
import pandas as pd
import geopandas as gpd
import shapely
import folium
from folium.plugins import HeatMap
import matplotlib as mpl
import matplotlib.pyplot as plt

### Create a `TrajDataFrame` from <a href="https://snap.stanford.edu/data/loc-brightkite.html">Brightkite</a> checkins

In [None]:
# Download the first 100,000 Brightkite check-ins using pandas.
# The SNAP file is tab-separated and (per the dataset description) has no
# header row, so `header=None` is used: with `header=0` plus `names=...`,
# pandas treats the first line as a header to be replaced and silently
# drops the first check-in.
url = "https://snap.stanford.edu/data/loc-brightkite_totalCheckins.txt.gz"
#url = 'data/loc-brightkite_totalCheckins.txt.gz'
df = pd.read_csv(url, sep='\t', header=None, nrows=100000,
                 names=['user', 'check-in_time', 'latitude', 'longitude', 'location id'])

In [None]:
# Wrap the raw check-ins in a TrajDataFrame, telling skmob which source
# column plays each role.
tdf = skmob.TrajDataFrame(
    df,
    user_id='user',
    datetime='check-in_time',
    latitude='latitude',
    longitude='longitude',
)
tdf.crs = 'epsg:4326'  # EPSG:4326 = WGS84 coordinates

# Quick sanity check: size, type and first rows.
print('number of rows: %s' % len(tdf))
print(type(tdf))
tdf.head()

In [None]:
# Plot a sample of trajectories (capped by max_users=10 and max_points=1000), without start/end markers.
tdf.plot_trajectory(max_users=10, max_points=1000, zoom=4, start_end_markers=False)

In [None]:
# Heat map of the coordinates of the first 50,000 rows of tdf, on an OpenStreetMap base layer.
m = folium.Map(tiles='openstreetmap', zoom_start=12, control_scale=True)
HeatMap(tdf[:50000][['lat', 'lng']].values).add_to(m)
m

### Create square tessellations

#### Training city: San Francisco

In [None]:
from skmob.utils.plot import plot_gdf

In [None]:
# Build a squared tessellation of San Francisco (meters=2500).
# The base shape is given by name, so this call retrieves information from the web.
tess_train = tilers.tiler.get("squared",  
                              base_shape="San Francisco, California", 
                              meters=2500)
len(tess_train)

In [None]:
# Plot the San Francisco tessellation.
plot_gdf(tess_train, zoom=10)

#### Test city: Denver

In [None]:
# Same squared tessellation (meters=2500) for the test city, Denver.
tess_test = tilers.tiler.get("squared", meters=2500, 
                             base_shape="Denver, Colorado")
len(tess_test)

In [None]:
# Plot the Denver tessellation.
plot_gdf(tess_test, zoom=10)

### Compute the relevance of each tile


1. assign each point to the corresponding tile in San Francisco

In [None]:
# Assign each point to a San Francisco tile (adds the 'tile_ID' column used below).
# remove_na=True presumably drops points that fall outside the tessellation — confirm in the skmob docs.
tdf_tid = tdf.mapping(tess_train, remove_na=True)
tdf_tid.head(3)

2. compute the relevance of each tile in San Francisco

In [None]:
# Relevance of a tile = number of points mapped to it, as a share of all
# points mapped to the San Francisco tessellation.
relevances = tdf_tid.groupby(by='tile_ID').count()[['lat']].rename(
    columns={'lat': 'relevance'})
relevances /= relevances.sum() # normalize

# Merge only once: re-running this cell would otherwise create suffixed
# 'relevance_x' / 'relevance_y' columns and break every later cell that
# reads the 'relevance' column.
if 'relevance' not in tess_train.columns:
    # tiles without any point have no row in `relevances`; they get relevance 0
    tess_train = tess_train.merge(relevances, right_index=True, left_on='tile_ID', how='left').fillna(0.)
tess_train.head(3)

Do the same for Denver

In [None]:
# Same steps for Denver. `tdf_tid` and `relevances` are overwritten with the
# Denver values from here on.
tdf_tid = tdf.mapping(tess_test, remove_na=True)
relevances = tdf_tid.groupby(by='tile_ID').count()[['lat']].rename(columns={'lat': 'relevance'})
# normalise
relevances /= relevances.sum()

# Merge only once: re-running this cell would otherwise create suffixed
# 'relevance_x' / 'relevance_y' columns and break every later cell that
# reads the 'relevance' column.
if 'relevance' not in tess_test.columns:
    # tiles without any point have no row in `relevances`; they get relevance 0
    tess_test = tess_test.merge(relevances, right_index=True, left_on='tile_ID', how='left').fillna(0.)
tess_test.head(3)

In [None]:
def define_colormap(tessellation, minval=1e-6):
    """Build a log-scaled 'jet' colour mapping for the tiles' relevance.

    The scale starts at the smallest relevance found in `tessellation`,
    but never below `minval`, and ends at the largest relevance.

    Returns the resulting `ScalarMappable`.
    """
    relevance = tessellation['relevance']
    lower_bound = max(relevance.min(), minval)
    log_norm = mpl.colors.LogNorm(vmin=lower_bound, vmax=relevance.max())
    return mpl.cm.ScalarMappable(cmap='jet', norm=log_norm)

def get_color(x):
    """Return the hex colour for the 'relevance' value of `x`.

    Reads the module-level `s_m`, which must be assigned (e.g. from
    `define_colormap`) before this function is called.
    """
    # 1e-12 is added to the relevance before the colour lookup
    rgba = s_m.to_rgba(x['relevance'] + 1e-12)
    return mpl.colors.to_hex(rgba)

In [None]:
# get_color reads the global s_m, so it has to be set for San Francisco before plotting.
s_m = define_colormap(tess_train)
# colour both the outline and the fill of each tile by its relevance
plot_gdf(tess_train, zoom=10, popup_features=['relevance'], \
         style_func_args={'color': get_color, 'fillColor' : get_color})

In [None]:
# the same for Denver: s_m is rebuilt from the Denver relevances before plotting,
# because get_color reads the global s_m
s_m = define_colormap(tess_test)
plot_gdf(tess_test, zoom=10, popup_features=['relevance'], \
         style_func_args={'color': get_color, 'fillColor' : get_color})

### Create `FlowDataFrame`s 

In [None]:
# for San Francisco: aggregate the TrajDataFrame into a FlowDataFrame over the
# training tessellation (self_loops=False is passed)
fdf_train = tdf.to_flowdataframe(tess_train, self_loops=False)
# total flow and largest single flow value
print(fdf_train['flow'].sum(), fdf_train['flow'].max())
fdf_train.head(4)

In [None]:
# for Denver: aggregate the TrajDataFrame into a FlowDataFrame over the
# test tessellation (self_loops=False is passed)
fdf_test = tdf.to_flowdataframe(tess_test, self_loops=False)
# total flow and largest single flow value
print(fdf_test['flow'].sum(), fdf_test['flow'].max())
fdf_test.head()

In [None]:
# plot flows in San Francisco
# (min_flow=5 presumably hides the weaker flows — confirm in the skmob docs)
fdf_train.plot_flows(min_flow=5, zoom=10, tiles='cartodbpositron', flow_weight=2, opacity=0.25)

In [None]:
# plot flows in Denver, with the same display settings as for San Francisco
fdf_test.plot_flows(min_flow=5, zoom=10, tiles='cartodbpositron', flow_weight=2, opacity=0.25)

### Compute number of trips from each tile

In [None]:
# total outflows excluding self loops in San Francisco
# (`axis=0` is the groupby default; the keyword is deprecated in recent pandas,
# so it is no longer passed explicitly)
tot_outflows = fdf_train[fdf_train['origin'] != fdf_train['destination']] \
    .groupby(by='origin')[['flow']].sum().fillna(0).rename(columns={'flow': 'tot_outflow'})

# merge only once, so that re-running the cell does not duplicate the column;
# tiles that are never an origin get a total outflow of 0
if 'tot_outflow' not in tess_train.columns:
    tess_train = tess_train.merge(tot_outflows, right_index=True, left_on='tile_ID', how='left').fillna(0.).sort_values(by='tot_outflow', ascending=False)
tess_train.head()

In [None]:
# total outflows excluding self loops in Denver
# (`axis=0` is the groupby default; the keyword is deprecated in recent pandas,
# so it is no longer passed explicitly)
tot_outflows = fdf_test[fdf_test['origin'] != fdf_test['destination']] \
    .groupby(by='origin')[['flow']].sum().fillna(0).rename(columns={'flow': 'tot_outflow'})

# merge only once, so that re-running the cell does not duplicate the column;
# tiles that are never an origin get a total outflow of 0
if 'tot_outflow' not in tess_test.columns:
    tess_test = tess_test.merge(tot_outflows, right_index=True, left_on='tile_ID', how='left').fillna(0.).sort_values(by='tot_outflow', ascending=False)
tess_test.head()

### Fit a singly-constrained Gravity Model using trips in San Francisco

In [None]:
# import the Gravity class
from skmob.models.gravity import Gravity

The `Gravity` class has two public methods:
- `fit` estimates the model's parameters from data;
- `generate` generates the flows.

#### Fit the gravity model's parameters


In [None]:
# Create a singly constrained Gravity model and print it before fitting.
gravity_singly_fitted = Gravity(gravity_type='singly constrained')
print(gravity_singly_fitted)

In [None]:
# Fit the model on the San Francisco flows, using the 'relevance' column,
# then print the model again to compare with the unfitted one.
gravity_singly_fitted.fit(fdf_train, relevance_column='relevance')
print(gravity_singly_fitted)

### Use the fitted model to predict the flows in Denver

In [None]:
# Seed NumPy's global RNG before generating.
np.random.seed(0)

# Generate Denver flows with the model fitted on San Francisco.
sc_fdf_fitted = gravity_singly_fitted.generate(
    tess_test,
    tile_id_column='tile_ID',
    tot_outflows_column='tot_outflow',
    relevance_column='relevance',
    out_format='flows',
)
sc_fdf_fitted.head(3)

In [None]:
# Plot the generated Denver flows with the same display settings used for the real flows.
sc_fdf_fitted.plot_flows(min_flow=5, zoom=10, tiles='cartodbpositron', flow_weight=2, opacity=0.25)

In [None]:
# Overlay: plot the generated flows first, then the real Denver flows in black
# on the same map (passed through map_f).
denv_map = sc_fdf_fitted.plot_flows(min_flow=5, zoom=10, tiles='cartodbpositron', flow_weight=2, opacity=0.25)
fdf_test.plot_flows(map_f=denv_map, min_flow=5, zoom=10, tiles='cartodbpositron', flow_weight=2, opacity=0.25, flow_color='black')

### Qualitative evaluation
visualise the model's performance against a baseline
#### 1. Create a baseline model (without dependence on relevance and distance)

In [None]:
# Baseline: a Gravity model with the deterrence argument and the destination
# exponent both set to 0, i.e. without dependence on relevance and distance.
baseline = Gravity(gravity_type='singly constrained', 
                   deterrence_func_args=[0.], destination_exp=0.)
print(baseline)

In [None]:
# Seed NumPy's global RNG before generating.
np.random.seed(0)

# Generate Denver flows with the baseline model.
baseline_fdf = baseline.generate(
    tess_test,
    tile_id_column='tile_ID',
    tot_outflows_column='tot_outflow',
    relevance_column='relevance',
    out_format='flows',
)
baseline_fdf[:4]

In [None]:
# Plot the baseline flows with the same display settings used for the other flow maps.
baseline_fdf.plot_flows(min_flow=5, zoom=10, 
                        tiles='cartodbpositron', flow_weight=2, opacity=0.25)

In [None]:
# Overlay: plot the baseline flows first, then the real Denver flows in black
# on the same map (passed through map_f).
denv_base_map = baseline_fdf.plot_flows(min_flow=5, zoom=10, tiles='cartodbpositron', flow_weight=2, opacity=0.25)
fdf_test.plot_flows(map_f=denv_base_map, min_flow=5, zoom=10, tiles='cartodbpositron', flow_weight=2, opacity=0.25, flow_color='black')

#### Compare real flows against generated flows

In [None]:
# Pair real and generated flows on (origin, destination):
# column 0 is flow_x (from fdf_test, the real flow), column 1 is flow_y (the model's flow).
# NOTE(review): no `how` is passed, so assuming the pandas default (inner join) only
# origin-destination pairs present in BOTH frames are compared — confirm this is intended.
xy = fdf_test.merge(sc_fdf_fitted, on=['origin', 'destination'])[['flow_x', 'flow_y']].values
xy_baseline = fdf_test.merge(baseline_fdf, on=['origin', 'destination'])[['flow_x', 'flow_y']].values

In [None]:
# Real flow (column 0) against model flow (column 1) for both models.
plt.plot(xy[:, 0], xy[:, 1], '.', label='Gravity')
plt.plot(xy_baseline[:, 0], xy_baseline[:, 1], '*', alpha=0.5, label='Baseline')

# Dashed y = x reference line, from 1 up to the largest value in xy.
upper_exponent = np.log10(np.max(xy))
diagonal = np.logspace(0, upper_exponent)
plt.plot(diagonal, diagonal, '--k')

plt.xlabel('Real flow')
plt.ylabel('Model flow')
plt.legend(loc='upper left')
plt.loglog()
plt.show()

### Quantitative evaluation metrics


In [None]:
from skmob.measures.evaluation import r_squared, mse, spearman_correlation, pearson_correlation, common_part_of_commuters, common_part_of_commuters_distance

In [None]:
# Evaluation functions and their display names; the two lists are parallel
# (same order, same length) and must be kept in sync.
metrics = [r_squared, mse, spearman_correlation, pearson_correlation, common_part_of_commuters, common_part_of_commuters_distance]
names = ['r_squared', 'mse', 'spearman_correlation', 'pearson_correlation', 'common_part_of_commuters', 'common_part_of_commuters_distance']

In [None]:
print('Metric:  Gravity - Baseline')
print('---------------------------')
# Every metric is called as metric(real flows, model flows), once for the
# Gravity model and once for the baseline; scores are rounded to 3 decimals.
for metric_name, metric in zip(names, metrics):
    gravity_score = np.round(metric(xy[:, 0], xy[:, 1]), 3)
    baseline_score = np.round(metric(xy_baseline[:, 0], xy_baseline[:, 1]), 3)
    print("%s:   %s - %s" % (metric_name, gravity_score, baseline_score))

## Practice

Download from figshare this [flows dataset](https://figshare.com/collections/Inter-urban_interactions_of_mobility_via_cellular_position_tracking_in_the_southeast_Songliao_Basin_Northeast_China/4226183), create a tessellation and a `FlowDataFrame`; plot them together using skmob. Then: 
- split the `FlowDataFrame` into a training set and a test set; 
- train a `Gravity` model on the training set;
- test the model's goodness on the test set (qualitative and quantitative evaluation).

In [None]:
import requests, zipfile, json, io
from shapely.geometry import Point

In [None]:
import pandas as pd
import geopandas as gpd
import skmob
import matplotlib.pyplot as plt

In [None]:
# Download URLs of the two files used in this exercise:
# 'positions' (read below for the Location / Longitude / Latitude columns) and
# 'flows' (read below for the Origin / Destination / Weight columns).
dataset_links = {
'positions' : 'https://figshare.com/ndownloader/files/14005292',
'flows' : 'https://figshare.com/ndownloader/files/14884442',
}

In [None]:
# Download the positions file and save it as positions.csv.
# `stream=True` was dropped: the whole body is read through `r.text` anyway.
r = requests.get(dataset_links['positions'])
r.raise_for_status()  # fail here instead of saving an HTTP error page as CSV

# The file is opened in a `with` block so that it is flushed and closed before
# the next cell reads it; previously the handle passed to print() was never closed.
with open('positions.csv', 'w') as f:
    # carriage returns are replaced by newlines before writing
    print(r.text.replace('\r', '\n'), file=f)

In [None]:
positions_df = pd.read_csv('positions.csv')

# Build point geometries from the Longitude/Latitude columns, drop those two
# columns, and rename 'Location' to 'tile_ID'.
gdf = (
    gpd.GeoDataFrame(
        positions_df,
        geometry=gpd.points_from_xy(positions_df['Longitude'], positions_df['Latitude']),
    )
    .drop(columns=['Longitude', 'Latitude'])
    .rename(columns={'Location': 'tile_ID'})
)
gdf['tile_ID'] = gdf['tile_ID'].astype('str')  # tile ids are handled as strings
gdf.crs = 'epsg:4326'
gdf.head()

In [None]:
# Download the flows file and save it as flows.csv.
# `stream=True` was dropped: the whole body is read through `r.text` anyway.
r = requests.get(dataset_links['flows'])
r.raise_for_status()  # fail here instead of saving an HTTP error page as CSV

# The file is opened in a `with` block so that it is flushed and closed before
# the next cell reads it; previously the handle passed to print() was never closed.
with open('flows.csv', 'w') as f:
    print(r.text, file=f)

In [None]:
flows_df = pd.read_csv('flows.csv')
# astype() returns a new Series and leaves the frame untouched, so the result
# has to be assigned back (the conversion was previously discarded).
# Ids are strings so that they match the string 'tile_ID' of the tessellation.
flows_df['Origin'] = flows_df['Origin'].astype('str')
flows_df['Destination'] = flows_df['Destination'].astype('str')
flows_df.head()

In [None]:
# total outflows excluding self loops, per origin location of the flows dataset
# (`axis=0` is the groupby default; the keyword is deprecated in recent pandas,
# so it is no longer passed explicitly)
tot_outflows_df = flows_df[flows_df['Origin'] != flows_df['Destination']] \
    .groupby(by='Origin')[['Weight']].sum().fillna(0).rename(columns={'Weight': 'tot_outflow'})
tot_outflows_df.index = tot_outflows_df.index.astype('str')

# merge only once, so that re-running the cell does not create suffixed
# 'tot_outflow_x' / 'tot_outflow_y' columns; locations that are never an
# origin get a total outflow of 0
if 'tot_outflow' not in gdf.columns:
    gdf = gdf.merge(tot_outflows_df, right_index=True, left_on='tile_ID', how='left').fillna(0.).sort_values(by='tot_outflow', ascending=False)

# here the relevance of a location is its share of the total outflow
gdf['relevance'] = gdf['tot_outflow'] / gdf['tot_outflow'].sum()
gdf.head()

In [None]:
# Build the FlowDataFrame of real flows from the flows table, using gdf as the
# tessellation (locations are matched through its 'tile_ID' column).
fdf = skmob.FlowDataFrame(flows_df, 
                          origin='Origin', destination='Destination', flow='Weight', 
                          tile_id='tile_ID', tessellation=gdf)
fdf.head()

In [None]:
from skmob.models import gravity

In [None]:
# Create a singly constrained Gravity model.
# NOTE(review): no fit() call on this model is visible in the notebook before
# generate() is used, although the exercise asks to train on a training set —
# confirm whether using the default parameters is intended.
gravity_singly = gravity.Gravity(gravity_type='singly constrained')
print(gravity_singly)

In [None]:
import numpy as np

In [None]:
# Seed NumPy's global RNG before generating.
np.random.seed(0)

# Generate flows between the dataset's locations with the Gravity model.
gen_fdf = gravity_singly.generate(
    gdf,
    tile_id_column='tile_ID',
    tot_outflows_column='tot_outflow',
    relevance_column='relevance',
    out_format='flows',
)
gen_fdf.head()

In [None]:
# Plot the generated flows (min_flow=500 is passed as the display threshold).
gen_fdf.plot_flows(min_flow=500)

In [None]:
# Baseline with the deterrence argument and the destination exponent set to 0.
# This rebinds the name `baseline` used in the Denver section above.
baseline = gravity.Gravity(gravity_type='singly constrained', 
                   deterrence_func_args=[0.], destination_exp=0.)
print(baseline)

In [None]:
# Seed NumPy's global RNG before generating.
np.random.seed(0)

# Generate flows between the dataset's locations with the baseline model.
baseline_fdf = baseline.generate(
    gdf,
    tile_id_column='tile_ID',
    tot_outflows_column='tot_outflow',
    relevance_column='relevance',
    out_format='flows',
)
baseline_fdf.head()

In [None]:
# Number of rows in the baseline FlowDataFrame.
len(baseline_fdf)

In [None]:
# Pair real and generated flows on (origin, destination):
# column 0 is flow_x (from fdf, the real flow), column 1 is flow_y (the model's flow).
# NOTE(review): no `how` is passed, so assuming the pandas default (inner join) only
# origin-destination pairs present in BOTH frames are compared — confirm this is intended.
xy = fdf.merge(gen_fdf, on=['origin', 'destination'])[['flow_x', 'flow_y']].values
xy_baseline = fdf.merge(baseline_fdf, on=['origin', 'destination'])[['flow_x', 'flow_y']].values

In [None]:
# Real flow (column 0) against model flow (column 1) for both models.
plt.plot(xy[:, 0], xy[:, 1], '.', label='Gravity')
plt.plot(xy_baseline[:, 0], xy_baseline[:, 1], '*', alpha=0.5, label='Baseline')

# Dashed y = x reference line, from 1 up to the largest value in xy.
upper_exponent = np.log10(np.max(xy))
diagonal = np.logspace(0, upper_exponent)
plt.plot(diagonal, diagonal, '--k')

plt.xlabel('Real flow')
plt.ylabel('Model flow')
plt.legend(loc='upper left')
plt.loglog()
plt.show()

In [None]:
from skmob.measures.evaluation import r_squared, mse, spearman_correlation, pearson_correlation, common_part_of_commuters, common_part_of_commuters_distance

In [None]:
# Evaluation functions and their display names; the two lists are parallel
# (same order, same length) and must be kept in sync.
metrics = [r_squared, mse, spearman_correlation, pearson_correlation, common_part_of_commuters, common_part_of_commuters_distance]
names = ['r_squared', 'mse', 'spearman_correlation', 'pearson_correlation', 'common_part_of_commuters', 'common_part_of_commuters_distance']

In [None]:
print('Metric:  Gravity - Baseline')
print('---------------------------')
# Every metric is called as metric(real flows, model flows), once for the
# Gravity model and once for the baseline; scores are rounded to 3 decimals.
for metric_name, metric in zip(names, metrics):
    gravity_score = np.round(metric(xy[:, 0], xy[:, 1]), 3)
    baseline_score = np.round(metric(xy_baseline[:, 0], xy_baseline[:, 1]), 3)
    print("%s:   %s - %s" % (metric_name, gravity_score, baseline_score))

In [None]:
# Number of rows in the real FlowDataFrame.
len(fdf)