# Application: Creating a Telco Mobility Index

1. Add total travel distance
2. Add radius of gyration
3. Add activity entropy

In [1]:
import os
# Move the working directory up one level so the relative "data/..." paths used below resolve.
# NOTE(review): not idempotent — re-running this cell climbs one more level each time.
os.chdir("../")

In [2]:
import random
import shapely
import pendulum
import numpy as np
import pandas as pd
from sds4gdsp.processor import convert_cel_to_point, calc_haversine_distance
from sds4gdsp.plotter import get_route_fig, load_images, plot_images
from IPython.display import HTML, display
from functools import reduce

Load, take a peek, and get a gist of the given datasets

a. Fake Subscribers <br>
b. Fake Cellsites <br>
c. Fake Transactions

In [3]:
# Read the subscriber table with explicit dtypes so the indicator/gender
# columns are not left to pandas' type inference.
filepath_subscribers = "data/fake_subscribers.csv"
dtype = {
    "gender": "category",
    "age": int,
    "name": str,
    "chi_indicator": bool,
    "ewallet_user_indicator": "category",
}
fake_subscribers = pd.read_csv(
    filepath_subscribers,
    dtype=dtype,
)
# Peek at five random rows.
fake_subscribers.sample(5)

Unnamed: 0,sub_uid,gender,age,name,chi_indicator,ewallet_user_indicator
67,glo-sub-068,female,45,Mrs. Victoria Ruiz,False,N
79,glo-sub-080,female,48,Molly Vasquez,False,N
24,glo-sub-025,male,49,Robert Campbell,True,N
11,glo-sub-012,male,52,Brandon Coleman,False,N
94,glo-sub-095,female,42,Cheryl Smith,True,N


In [4]:
# (rows, columns) of the subscriber table
fake_subscribers.shape

(100, 6)

In [5]:
# Count subscribers per (gender, chi, e-wallet) segment and express each
# count as a percentage of the whole subscriber base.
cat_cols = ["gender", "chi_indicator", "ewallet_user_indicator"]
fs_breakdown = (
    fake_subscribers
    .groupby(cat_cols)
    .size()
    .reset_index(name="cnt")
)
fs_breakdown.assign(
    pcnt=lambda frame: (frame["cnt"] / len(fake_subscribers) * 100).round(2)
)

Unnamed: 0,gender,chi_indicator,ewallet_user_indicator,cnt,pcnt
0,female,False,N,14,14.0
1,female,False,Y,12,12.0
2,female,True,N,10,10.0
3,female,True,Y,14,14.0
4,male,False,N,12,12.0
5,male,False,Y,17,17.0
6,male,True,N,14,14.0
7,male,True,Y,7,7.0


In [6]:
# Load the cellsite table; the sample output shows `coords` holding WKT point strings ("POINT (lng lat)").
filepath_cellsites = "data/fake_cellsites.csv"
fake_cellsites = pd.read_csv(filepath_cellsites)
# Peek at five random rows.
fake_cellsites.sample(5)

Unnamed: 0,cel_uid,coords
22,glo-cel-023,POINT (121.0474636 14.4854434)
57,glo-cel-058,POINT (121.05919 14.5303819)
69,glo-cel-070,POINT (121.0648755 14.5124658)
77,glo-cel-078,POINT (121.0695979 14.5168694)
74,glo-cel-075,POINT (121.0683106 14.5054757)


In [7]:
# (rows, columns) of the cellsite table
fake_cellsites.shape

(111, 2)

In [8]:
# Embed the image at ../docs/fake_cellsites.png, displayed at 600x600.
# NOTE(review): the relative path is presumably resolved by the notebook front end, not the kernel's cwd — confirm.
HTML('<img src="../docs/fake_cellsites.png" width="600" height="600"/>')

In [9]:
# Load the transaction table; dtypes are left to pandas' inference.
# Later cells compare `transaction_dt` against date strings, so it is presumably read as plain text.
filepath_transactions = "data/fake_transactions.csv"
fake_transactions = pd.read_csv(filepath_transactions)

In [10]:
# (rows, columns) of the transaction table
fake_transactions.shape

(15090, 5)

In [11]:
# Date range covered by the transactions. The min/max are taken on the raw
# strings, which is chronological only for ISO "YYYY-MM-DD" values
# (the format shown in the sample output).
min_date = pendulum.parse(fake_transactions.transaction_dt.min(), exact=True)
max_date = pendulum.parse(fake_transactions.transaction_dt.max(), exact=True)
# pendulum 3 renamed `pendulum.period` to `pendulum.interval`; prefer the
# original name when it exists so behaviour on pendulum 2 is unchanged.
_make_period = getattr(pendulum, "period", None) or pendulum.interval
period = _make_period(min_date, max_date)

In [12]:
# Spot-check: pick one random day in the covered range and one random
# subscriber, then list that subscriber's transactions on that day.
sample_dt = str(random.sample(list(period), 1)[0])
sample_sub = fake_subscribers.sub_uid.sample(1).item()

filter_dt = fake_transactions.transaction_dt.eq(sample_dt)
filter_sub = fake_transactions.sub_uid.eq(sample_sub)
fake_transactions.loc[filter_sub & filter_dt]

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr
2986,glo-txn-02987,glo-sub-020,glo-cel-028,2023-06-22,3
2987,glo-txn-02988,glo-sub-020,glo-cel-026,2023-06-22,11
2988,glo-txn-02989,glo-sub-020,glo-cel-028,2023-06-22,14
2989,glo-txn-02990,glo-sub-020,glo-cel-026,2023-06-22,17
2990,glo-txn-02991,glo-sub-020,glo-cel-033,2023-06-22,21
2991,glo-txn-02992,glo-sub-020,glo-cel-026,2023-06-22,23


Create a helper function that fetches a subscriber's trajectory for a given day or month

In [13]:
def get_sub_traj(
    sub: str,
    date: str,
    window: str,
    transactions: pd.DataFrame,
    cellsites: pd.DataFrame
) -> pd.DataFrame:
    """Return one subscriber's transactions for a day or a month, joined to cellsites.

    Parameters
    ----------
    sub : str
        Subscriber id, matched against ``transactions.sub_uid``.
    date : str
        For ``window="day"``: the exact ``transaction_dt`` value to keep.
        For ``window="month"``: the first day of the month as ``YYYY-MM-DD``
        (e.g. ``"2023-06-01"``); every transaction in that month is kept.
    window : str
        Either ``"day"`` or ``"month"``.
    transactions : pd.DataFrame
        Needs ``sub_uid``, ``cel_uid``, ``transaction_dt`` and
        ``transaction_hr`` columns.
    cellsites : pd.DataFrame
        Merged onto the filtered transactions on ``cel_uid``.

    Returns
    -------
    pd.DataFrame
        The matching transactions with the cellsite columns attached, sorted
        by ``transaction_dt`` then ``transaction_hr`` (both ascending).

    Raises
    ------
    ValueError
        If ``window`` is neither ``"day"`` nor ``"month"``.
    """
    if window == "month":
        # Vectorised "first day of the month" for every row, instead of
        # parsing each date string one by one.
        month_start = pd.to_datetime(transactions.transaction_dt).dt.strftime("%Y-%m-01")
        date_filter = month_start == date
    elif window == "day":
        date_filter = transactions.transaction_dt == date
    else:
        # Previously an unknown window fell through and failed later with an
        # UnboundLocalError on `date_filter`.
        raise ValueError(
            "window must be 'day' or 'month', got {!r}".format(window)
        )
    sub_filter = transactions.sub_uid == sub
    transactions_red = transactions.loc[sub_filter & date_filter]
    # The merge does not keep chronological order, hence the explicit sort.
    transactions_red = transactions_red.merge(cellsites, on="cel_uid")
    return transactions_red.sort_values(
        by=["transaction_dt", "transaction_hr"], ascending=[True, True]
    )

In [14]:
# Example: every June 2023 transaction of subscriber glo-sub-001, ordered by day and hour.
get_sub_traj("glo-sub-001", "2023-06-01", "month", fake_transactions, fake_cellsites)

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-00001,glo-sub-001,glo-cel-053,2023-06-01,3,POINT (121.0581692 14.5056112)
1,glo-txn-00002,glo-sub-001,glo-cel-044,2023-06-01,15,POINT (121.0550896 14.5056935)
5,glo-txn-00003,glo-sub-001,glo-cel-035,2023-06-01,20,POINT (121.0520115 14.5087523)
2,glo-txn-00004,glo-sub-001,glo-cel-044,2023-06-01,21,POINT (121.0550896 14.5056935)
6,glo-txn-00005,glo-sub-001,glo-cel-035,2023-06-01,23,POINT (121.0520115 14.5087523)
...,...,...,...,...,...,...
115,glo-txn-00141,glo-sub-001,glo-cel-056,2023-06-30,6,POINT (121.0591257 14.5116111)
43,glo-txn-00142,glo-sub-001,glo-cel-051,2023-06-30,7,POINT (121.0572633 14.5141269)
29,glo-txn-00143,glo-sub-001,glo-cel-049,2023-06-30,10,POINT (121.0568161 14.5177119)
103,glo-txn-00144,glo-sub-001,glo-cel-057,2023-06-30,19,POINT (121.059168 14.5199865)


We'll use this scoring base moving forward

In [15]:
# Work on a copy so the metric columns added below do not modify fake_subscribers.
scoring_base = fake_subscribers.copy()

## 1: Total Travel Distance

In [16]:
def calc_total_travel_distance(coords):
    """Sum the haversine distances between consecutive points of a trajectory.

    Parameters
    ----------
    coords : list
        Ordered points, in whatever form ``calc_haversine_distance`` accepts
        (this notebook passes WKT point strings). Must support slicing.

    Returns
    -------
    float
        Total of the leg distances, in the unit ``calc_haversine_distance``
        returns. Trajectories with fewer than two points have no legs and
        yield ``0.0`` (previously this raised ``TypeError``).
    """
    # Pair each point with its successor: (p0, p1), (p1, p2), ...
    od_pairs = zip(coords, coords[1:])
    return sum(
        (calc_haversine_distance(orig, dest) for orig, dest in od_pairs),
        0.0,
    )

Check total travel distance for a sample sub trajectory

In [17]:
# Pick one random subscriber and fetch their full June 2023 trajectory.
sub = fake_subscribers.sample(1).sub_uid.item()
date = "2023-06-01"  # first day of the month, as get_sub_traj compares against month starts
window = "month"

traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)

In [18]:
# First few transactions of the sampled trajectory (already sorted by day and hour).
traj.head()

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-11885,glo-sub-080,glo-cel-036,2023-06-01,3,POINT (121.052231 14.519476)
10,glo-txn-11886,glo-sub-080,glo-cel-027,2023-06-01,5,POINT (121.0482758 14.5207167)
12,glo-txn-11887,glo-sub-080,glo-cel-021,2023-06-01,10,POINT (121.0461939 14.5245073)
13,glo-txn-11888,glo-sub-080,glo-cel-030,2023-06-01,11,POINT (121.0493484 14.5234169)
1,glo-txn-11889,glo-sub-080,glo-cel-036,2023-06-01,17,POINT (121.052231 14.519476)


In [19]:
# Build consecutive (origin, destination) pairs from the ordered cellsite coordinates.
coords = traj.coords.tolist()
od_pairs = list(zip(coords, coords[1:]))
# Print the first three pairs with the list brackets stripped for readability.
print(str(od_pairs[:3]).replace("[", "").replace("]", ""))

('POINT (121.052231 14.519476)', 'POINT (121.0482758 14.5207167)'), ('POINT (121.0482758 14.5207167)', 'POINT (121.0461939 14.5245073)'), ('POINT (121.0461939 14.5245073)', 'POINT (121.0493484 14.5234169)')


In [20]:
# Convert to kilometers.
# NOTE(review): assumes calc_haversine_distance returns meters — confirm.
calc_total_travel_distance(coords) / 1_000

43.476894368709665

Scale to the whole scoring base

In [21]:
date = "2023-06-01"
window = "month"

# One monthly trajectory per subscriber -> one total travel distance each.
# A comprehension keeps the loop variables local, so the `sub`, `traj` and
# `coords` inspected in the sample cells are not silently overwritten.
total_travel_distances = [
    calc_total_travel_distance(
        get_sub_traj(
            sub_uid, date, window, fake_transactions, fake_cellsites
        ).coords.tolist()
    )
    for sub_uid in scoring_base.sub_uid.tolist()
]

scoring_base["total_travel_distance"] = total_travel_distances

In [22]:
# Peek at five random subscribers with the new total_travel_distance column.
scoring_base.sample(5)

Unnamed: 0,sub_uid,gender,age,name,chi_indicator,ewallet_user_indicator,total_travel_distance
99,glo-sub-100,female,40,Taylor Bond,True,N,64041.203816
19,glo-sub-020,female,29,Gina Love,False,N,46809.759808
32,glo-sub-033,male,35,Antonio Pham,True,Y,45179.976307
95,glo-sub-096,male,39,Gary Casey,False,N,48555.121662
28,glo-sub-029,female,29,Maureen Estrada,True,Y,51488.415426


## 2: Radius of Gyration

In mobility analysis, the radius of gyration indicates the characteristic distance travelled by an agent (in our case, the telco mobile subscriber). It is the root-mean-square distance between each visited location and the center of mass of all visited locations, computed with the formula below:

$RoG$ = $\sqrt{\frac{1}{n} \sum \limits_{i=1}^{n} {dist(CoM,coord_i)^2}}$

$CoM$ = $\frac{1}{n} \sum \limits_{i=1}^{n} (lat_i, lng_i)$

Where: <br>
RoG is Radius of Gyration <br>
CoM is Center of Mass



In [23]:
def calc_radius_of_gyration(coords):
    """Root-mean-square haversine distance of the points from their center of mass.

    Parameters
    ----------
    coords : list of str
        WKT point strings, read as x = longitude and y = latitude.

    Returns
    -------
    float
        The radius of gyration, in the unit ``calc_haversine_distance``
        returns. An empty trajectory has no center of mass; ``0.0`` is
        returned for it (previously this raised ``TypeError``).
    """
    if len(coords) == 0:
        return 0.0

    # compute for the center of mass (parse each WKT string only once)
    points = [shapely.wkt.loads(coord) for coord in coords]
    mean_lat = np.mean([point.y for point in points])
    mean_lng = np.mean([point.x for point in points])
    com = shapely.geometry.Point(mean_lng, mean_lat).wkt

    # compute for the distances from CoM to individual points
    squared_distances = [
        calc_haversine_distance(coord, com) ** 2 for coord in coords
    ]
    return np.sqrt(sum(squared_distances) / len(coords))

Check radius of gyration for a sample sub trajectory

In [24]:
# Pick another random subscriber and fetch their full June 2023 trajectory.
sub = fake_subscribers.sample(1).sub_uid.item()
date = "2023-06-01"  # first day of the month, as get_sub_traj compares against month starts
window = "month"

traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)

In [25]:
# First few transactions of the sampled trajectory (already sorted by day and hour).
traj.head()

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-06295,glo-sub-042,glo-cel-033,2023-06-01,0,POINT (121.050346 14.4970216)
23,glo-txn-06296,glo-sub-042,glo-cel-026,2023-06-01,3,POINT (121.0480144 14.4988026)
58,glo-txn-06297,glo-sub-042,glo-cel-031,2023-06-01,5,POINT (121.0496008 14.5017326)
1,glo-txn-06298,glo-sub-042,glo-cel-033,2023-06-01,7,POINT (121.050346 14.4970216)
24,glo-txn-06299,glo-sub-042,glo-cel-026,2023-06-01,18,POINT (121.0480144 14.4988026)


In [26]:
# Ordered list of the trajectory's cellsite coordinates (WKT point strings).
coords = traj.coords.tolist()

In [27]:
# Radius of gyration of the sampled trajectory, in the unit calc_haversine_distance returns
# (presumably meters — confirm).
calc_radius_of_gyration(coords)

892.4026276047259

Scale to the whole scoring base

In [28]:
date = "2023-06-01"
window = "month"

# One monthly trajectory per subscriber -> one radius of gyration each.
# A comprehension keeps the loop variables local, so the `sub`, `traj` and
# `coords` inspected in the sample cells are not silently overwritten.
radius_of_gyrations = [
    calc_radius_of_gyration(
        get_sub_traj(
            sub_uid, date, window, fake_transactions, fake_cellsites
        ).coords.tolist()
    )
    for sub_uid in scoring_base.sub_uid.tolist()
]

scoring_base["radius_of_gyration"] = radius_of_gyrations

In [29]:
# Peek at five random subscribers with both mobility metrics attached.
scoring_base.sample(5)

Unnamed: 0,sub_uid,gender,age,name,chi_indicator,ewallet_user_indicator,total_travel_distance,radius_of_gyration
61,glo-sub-062,female,25,Stephanie Brady,False,Y,59114.871368,1062.212106
10,glo-sub-011,male,42,Samuel Donaldson,True,Y,80514.160201,1894.730225
66,glo-sub-067,male,49,Steven Cantrell,True,N,52810.103921,1450.136227
39,glo-sub-040,male,54,David Watson,False,Y,53668.763944,951.521961
7,glo-sub-008,male,52,Stephen Irwin,True,N,46088.455278,601.75519


## 3: Activity Entropy

## PENDING ITEMS

1. Redo the fake data simulation, make stay proba variable instead of fixed
2. Add visuals on total travel distance, radius of gyration, activity entropy
3. Finish lecture part 1, grammar of spatial data science

## ARCHIVE

In [14]:
def get_route_fig(r):
    """Draw a route with its first point in red and its last point in green.

    `r` must expose `.xy` as (x values, y values); the archive cell below
    passes a shapely LineString. The figure is closed before being returned,
    so it is not rendered until the caller displays or saves it.
    """
    fig, ax = plt.subplots(1, 1)
    gpd.GeoSeries(r).plot(ax=ax, linewidth=5, zorder=1)

    # Origin and destination markers, drawn above the route line.
    xs, ys = r.xy
    endpoints = (
        (shapely.geometry.Point([xs[0], ys[0]]), "red"),
        (shapely.geometry.Point([xs[-1], ys[-1]]), "green"),
    )
    for endpoint, color in endpoints:
        gpd.GeoSeries(endpoint).plot(
            ax=ax, color=color, markersize=250, zorder=2, alpha=0.8
        )

    plt.axis("off")
    ax.ticklabel_format(useOffset=False)
    plt.close()
    return fig

import os
import shapely
import geopandas as gpd
import matplotlib.pyplot as plt
from PIL import Image
import shutil

In [15]:
# day in a life of a sub
d = fake_transactions.copy()
sample_sub = d.sample(1).sub_id.item()
d = d.loc[d.sub_id==sample_sub]
days = d.transaction_dt.unique().tolist()
route_figs = []
for sample_day in days:
    sites = fake_transactions\
        .loc[fake_transactions.sub_id==sample_sub]\
        .loc[fake_transactions.transaction_dt==sample_day]\
        .cel_id.tolist()
    points = list(map(lambda z: convert_cel_to_point(z, fake_cellsites), sites))
    r = shapely.geometry.LineString(points)
    route_figs.append(get_route_fig(r))
os.mkdir("../sample/")
for idx, fig in enumerate(route_figs):
    fname = "../sample/{}_tmp.jpg".format(idx+1)
    fig.savefig(fname)
imgs = load_images("../sample")
plot_images(imgs)
shutil.rmtree("../sample")

AttributeError: 'DataFrame' object has no attribute 'uid'