In [1]:
import os

# Project root used as the working directory for every relative path below.
# It can be overridden with the SDS4GDSP_ROOT environment variable so the notebook
# is not tied to one machine; the original location remains the default.
PROJECT_ROOT = os.environ.get("SDS4GDSP_ROOT", "C:/Users/10012425/Desktop/sds4gdsp/")
os.chdir(PROJECT_ROOT)

In [2]:
import random
import shapely
import pendulum
import numpy as np
import pandas as pd
pd.options.display.max_rows=200
from sds4gdsp.processor import convert_cel_to_point, calc_haversine_distance
from sds4gdsp.plotter import get_route_fig, load_images, plot_images
from IPython.display import HTML, display
from functools import reduce

In [3]:
filepath_subscribers = "data/fake_subscribers.csv"
dtype = dict(
    gender="category",
    age=int,
    name=str,
    chi_indicator=bool,
    ewallet_user_indicator="category"
)
fake_subscribers = pd.read_csv(filepath_subscribers, dtype=dtype)
fake_subscribers.sample(5)

Unnamed: 0,sub_uid,gender,age,name,chi_indicator,ewallet_user_indicator
45,glo-sub-046,male,24,Luis Jackson,True,Y
59,glo-sub-060,male,42,Philip Lewis,True,N
68,glo-sub-069,male,51,Kevin Gibson,True,N
80,glo-sub-081,female,48,Jessica Wood,True,Y
99,glo-sub-100,female,40,Kiara Salazar,True,N


In [4]:
fake_subscribers.shape

(100, 6)

In [5]:
cat_cols = ["gender", "chi_indicator", "ewallet_user_indicator"]
fs_breakdown = fake_subscribers.groupby(cat_cols).size().reset_index(name="cnt")
fs_breakdown.assign(pcnt=fs_breakdown.cnt.div(len(fake_subscribers)).mul(100).round(2))

Unnamed: 0,gender,chi_indicator,ewallet_user_indicator,cnt,pcnt
0,female,False,N,14,14.0
1,female,False,Y,12,12.0
2,female,True,N,10,10.0
3,female,True,Y,14,14.0
4,male,False,N,12,12.0
5,male,False,Y,17,17.0
6,male,True,N,14,14.0
7,male,True,Y,7,7.0


In [6]:
filepath_cellsites = "data/fake_cellsites.csv"
fake_cellsites = pd.read_csv(filepath_cellsites)
fake_cellsites.sample(5)

Unnamed: 0,cel_uid,coords
74,glo-cel-075,POINT (121.0683106 14.5054757)
108,glo-cel-109,POINT (121.0940274 14.5297068)
89,glo-cel-090,POINT (121.0770828 14.5317044)
84,glo-cel-085,POINT (121.0743555 14.5324571)
69,glo-cel-070,POINT (121.0648755 14.5124658)


In [7]:
fake_cellsites.shape

(111, 2)

In [8]:
HTML('<img src="../docs/fake_cellsites.png" width="600" height="600"/>')

In [9]:
filepath_transactions = "data/fake_transactions.csv"
fake_transactions = pd.read_csv(filepath_transactions)

In [10]:
fake_transactions.shape

(15090, 5)

In [11]:
min_date = pendulum.parse(fake_transactions.transaction_dt.min(), exact=True)
max_date = pendulum.parse(fake_transactions.transaction_dt.max(), exact=True)
period = pendulum.period(min_date, max_date)

In [12]:
sample_dt = str(random.sample(list(period), 1)[0])
filter_dt = fake_transactions.transaction_dt == sample_dt
sample_sub = fake_subscribers.sub_uid.sample(1).item()
filter_sub = fake_transactions.sub_uid == sample_sub
fake_transactions.loc[filter_sub&filter_dt]

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr
4868,glo-txn-04869,glo-sub-032,glo-cel-036,2023-06-23,4
4869,glo-txn-04870,glo-sub-032,glo-cel-040,2023-06-23,5
4870,glo-txn-04871,glo-sub-032,glo-cel-036,2023-06-23,13
4871,glo-txn-04872,glo-sub-032,glo-cel-040,2023-06-23,14
4872,glo-txn-04873,glo-sub-032,glo-cel-036,2023-06-23,16
4873,glo-txn-04874,glo-sub-032,glo-cel-027,2023-06-23,22


In [13]:
def get_sub_traj(
    sub: str,
    date: str,
    window: str,
    transactions: pd.DataFrame,
    cellsites: pd.DataFrame
) -> pd.DataFrame:
    """Return one subscriber's transactions for a day or a month, joined to cell sites.

    Parameters
    ----------
    sub : str
        Value matched against ``transactions.sub_uid``.
    date : str
        Date string to match. With ``window="day"`` it is compared directly to
        ``transaction_dt``; with ``window="month"`` it is compared to the first
        day of each transaction's month (e.g. ``"2023-06-01"``).
    window : str
        Either ``"day"`` or ``"month"``.
    transactions : pd.DataFrame
        Must provide ``sub_uid``, ``cel_uid``, ``transaction_dt`` and
        ``transaction_hr`` columns.
    cellsites : pd.DataFrame
        Must provide a ``cel_uid`` column; its other columns are merged in.

    Returns
    -------
    pd.DataFrame
        The matching transactions merged with ``cellsites`` on ``cel_uid`` and
        sorted by ``transaction_dt`` then ``transaction_hr`` (both ascending).
        The index is the one produced by the merge; it is not reset.

    Raises
    ------
    ValueError
        If ``window`` is neither ``"day"`` nor ``"month"``.
    """
    if window == "month":
        # Collapse every transaction date to the first day of its month before comparing.
        month_starts = transactions.transaction_dt.apply(
            lambda d: pendulum.parse(d, exact=True).start_of("month").to_date_string()
        )
        date_filter = month_starts == date
    elif window == "day":
        date_filter = transactions.transaction_dt == date
    else:
        # Previously an unknown window left `date_filter` unbound and surfaced as an
        # UnboundLocalError further down; fail early with a clear message instead.
        raise ValueError(
            "window must be 'day' or 'month', got {!r}".format(window)
        )

    sub_filter = transactions.sub_uid == sub
    transactions_red = transactions.loc[sub_filter & date_filter]
    transactions_red = transactions_red.merge(cellsites, on="cel_uid")
    # The merge does not keep chronological order, so sort explicitly.
    return transactions_red.sort_values(
        by=["transaction_dt", "transaction_hr"], ascending=[True, True]
    )

In [14]:
get_sub_traj("glo-sub-001", "2023-06-01", "month", fake_transactions, fake_cellsites)

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-00001,glo-sub-001,glo-cel-082,2023-06-01,3,POINT (121.0732848 14.5244576)
5,glo-txn-00002,glo-sub-001,glo-cel-086,2023-06-01,15,POINT (121.0750697 14.521689)
11,glo-txn-00003,glo-sub-001,glo-cel-084,2023-06-01,20,POINT (121.0735688 14.5181671)
6,glo-txn-00004,glo-sub-001,glo-cel-086,2023-06-01,21,POINT (121.0750697 14.521689)
12,glo-txn-00005,glo-sub-001,glo-cel-084,2023-06-01,23,POINT (121.0735688 14.5181671)
13,glo-txn-00006,glo-sub-001,glo-cel-084,2023-06-02,3,POINT (121.0735688 14.5181671)
17,glo-txn-00007,glo-sub-001,glo-cel-089,2023-06-02,4,POINT (121.0765315 14.5160907)
20,glo-txn-00008,glo-sub-001,glo-cel-083,2023-06-02,8,POINT (121.0733576 14.5140961)
23,glo-txn-00009,glo-sub-001,glo-cel-077,2023-06-02,11,POINT (121.0693679 14.5119416)
21,glo-txn-00010,glo-sub-001,glo-cel-083,2023-06-02,12,POINT (121.0733576 14.5140961)


In [15]:
scoring_base = fake_subscribers.copy()

In [16]:
# for uniformity
date = "2023-06-01"
window = "month"

In [17]:
sub = fake_subscribers.sample(1).sub_uid.item()
traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)
traj.head()

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-10400,glo-sub-070,glo-cel-028,2023-06-01,0,POINT (121.0485358 14.494388)
6,glo-txn-10401,glo-sub-070,glo-cel-026,2023-06-01,5,POINT (121.0480144 14.4988026)
15,glo-txn-10402,glo-sub-070,glo-cel-031,2023-06-01,7,POINT (121.0496008 14.5017326)
7,glo-txn-10403,glo-sub-070,glo-cel-026,2023-06-01,9,POINT (121.0480144 14.4988026)
1,glo-txn-10404,glo-sub-070,glo-cel-028,2023-06-01,12,POINT (121.0485358 14.494388)


In [18]:
coords = traj.coords.tolist()
od_pairs = list(zip(coords, coords[1:])) # think lag-1 in SQL

In [19]:
od_pairs[:4] # check with the traj table above

[('POINT (121.0485358 14.494388)', 'POINT (121.0480144 14.4988026)'),
 ('POINT (121.0480144 14.4988026)', 'POINT (121.0496008 14.5017326)'),
 ('POINT (121.0496008 14.5017326)', 'POINT (121.0480144 14.4988026)'),
 ('POINT (121.0480144 14.4988026)', 'POINT (121.0485358 14.494388)')]

In [20]:
# Add up the distance of every consecutive origin -> destination leg.
# sum() with a 0.0 start also copes with a trajectory that has fewer than two
# points (no legs), where reduce() without an initial value raises TypeError.
total_travel_distance = sum(
    (calc_haversine_distance(orig_pt, dest_pt) for orig_pt, dest_pt in od_pairs),
    0.0,
)

In [21]:
# convert to kilometer
total_travel_distance / 1_000

56.54259860729366

In [22]:
def calc_total_travel_distance(traj):
    """Sum the distances between consecutive points of a trajectory.

    Parameters
    ----------
    traj : pd.DataFrame
        Trajectory with a ``coords`` column; rows are taken in their current order,
        and each value is passed as-is to ``calc_haversine_distance``.

    Returns
    -------
    float
        Sum of ``calc_haversine_distance(origin, destination)`` over every pair of
        consecutive rows, in whatever unit that helper returns (the notebook
        divides by 1_000 to get kilometres, so presumably metres; confirm).
        Returns 0.0 when the trajectory has fewer than two rows.
    """
    coords = traj.coords.tolist()
    # Pair each point with its successor (lag-1); an empty or single-row trajectory
    # yields no pairs, which sum() handles whereas reduce() without an initial
    # value would raise TypeError.
    od_pairs = zip(coords, coords[1:])
    return sum(
        (calc_haversine_distance(orig_pt, dest_pt) for orig_pt, dest_pt in od_pairs),
        0.0,
    )

In [23]:
# Score every subscriber in scoring_base: fetch the trajectory for the shared
# `date`/`window` and record its total travel distance, in sub_uid row order.
# Note: this loop rebinds the notebook-level `sub`, `traj` and `total_travel_distance`.
total_travel_distances = []
for sub in scoring_base.sub_uid.tolist():
    traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)
    total_travel_distance = calc_total_travel_distance(traj)
    total_travel_distances.append(total_travel_distance)

# The list is positionally aligned with scoring_base rows, so it can be assigned as a column.
scoring_base["total_travel_distance"] = total_travel_distances

In [24]:
scoring_base.sample(5)

Unnamed: 0,sub_uid,gender,age,name,chi_indicator,ewallet_user_indicator,total_travel_distance
53,glo-sub-054,male,25,Kelly Medina,True,Y,49803.383541
19,glo-sub-020,female,29,Angela Johnson,False,N,50951.423568
83,glo-sub-084,male,20,Charles Morgan,False,N,50126.644557
95,glo-sub-096,male,39,Brandon Charles,False,N,44938.12047
47,glo-sub-048,female,51,Gabrielle Vaughn,False,Y,46647.052089


In [25]:
sub = fake_subscribers.sample(1).sub_uid.item()
traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)
traj.head()

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-00628,glo-sub-005,glo-cel-028,2023-06-01,0,POINT (121.0485358 14.494388)
6,glo-txn-00629,glo-sub-005,glo-cel-026,2023-06-01,2,POINT (121.0480144 14.4988026)
12,glo-txn-00630,glo-sub-005,glo-cel-033,2023-06-01,3,POINT (121.050346 14.4970216)
1,glo-txn-00631,glo-sub-005,glo-cel-028,2023-06-01,5,POINT (121.0485358 14.494388)
18,glo-txn-00632,glo-sub-005,glo-cel-015,2023-06-01,6,POINT (121.0424143 14.4939573)


In [26]:
coords = traj.coords.tolist()

In [27]:
mean_lat = np.mean([shapely.wkt.loads(coord).y for coord in coords])
mean_lng = np.mean([shapely.wkt.loads(coord).x for coord in coords])
com = shapely.geometry.Point(mean_lng, mean_lat).wkt

In [28]:
print(mean_lat, mean_lng, com)

14.512318452298851 121.05148582988507 POINT (121.05148582988507 14.512318452298851)


In [29]:
# compute for the distances from center of mass to each of the individual points
pt_pairs = list(zip(coords, [com]*len(coords))) # points-to-centerofmass
radius_of_gyration = np.sqrt(
    reduce(
        lambda a, b: a + b,
        list(map(lambda p: calc_haversine_distance(*p)**2, pt_pairs))
    ) / len(coords)
)

In [30]:
radius_of_gyration

1123.9534687501643

In [31]:
def calc_radius_of_gyration(traj):
    """Compute the radius of gyration of a trajectory.

    The centre of mass is the arithmetic mean of the points' longitudes and
    latitudes; the result is the root mean square of the distances from each
    point to that centre, as measured by ``calc_haversine_distance``.

    Parameters
    ----------
    traj : pd.DataFrame
        Trajectory with a ``coords`` column of WKT point strings.

    Returns
    -------
    float
        Root-mean-square distance to the centre of mass, in the unit returned by
        ``calc_haversine_distance``. Returns NaN for an empty trajectory.
    """
    coords = traj.coords.tolist()
    if not coords:
        # No points: there is no centre of mass to measure from. Previously this
        # path produced a NaN centroid and then crashed inside reduce().
        return np.nan

    # Parse each WKT string once and reuse the geometry for both axes.
    points = [shapely.wkt.loads(coord) for coord in coords]

    # compute for the center of mass
    mean_lat = np.mean([pt.y for pt in points])
    mean_lng = np.mean([pt.x for pt in points])
    com = shapely.geometry.Point(mean_lng, mean_lat).wkt

    # compute for the distances from CoM to individual points
    squared_distances = [
        calc_haversine_distance(coord, com) ** 2 for coord in coords
    ]
    return np.sqrt(sum(squared_distances) / len(coords))

In [32]:
# Score every subscriber in scoring_base: fetch the trajectory for the shared
# `date`/`window` and record its radius of gyration, in sub_uid row order.
# Note: this loop rebinds the notebook-level `sub`, `traj` and `radius_of_gyration`.
radius_of_gyrations = []
for sub in scoring_base.sub_uid.tolist():
    traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)
    radius_of_gyration = calc_radius_of_gyration(traj)
    radius_of_gyrations.append(radius_of_gyration)

# The list is positionally aligned with scoring_base rows, so it can be assigned as a column.
scoring_base["radius_of_gyration"] = radius_of_gyrations

In [33]:
scoring_base.sample(5)

Unnamed: 0,sub_uid,gender,age,name,chi_indicator,ewallet_user_indicator,total_travel_distance,radius_of_gyration
91,glo-sub-092,female,21,Valerie Short,True,Y,66049.575613,1047.713163
55,glo-sub-056,male,25,Terry Gould,True,N,47942.556544,888.407175
2,glo-sub-003,male,59,Steven King,True,N,58861.062719,1723.534816
29,glo-sub-030,male,36,Gerald Harrison,False,N,57071.368021,1361.443078
36,glo-sub-037,male,60,Jeffrey Church,False,N,65522.669219,1302.643219


In [34]:
sub = fake_subscribers.sample(1).sub_uid.item()
traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)
traj.head()

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-06898,glo-sub-046,glo-cel-025,2023-06-01,5,POINT (121.0476753 14.4772396)
3,glo-txn-06899,glo-sub-046,glo-cel-029,2023-06-01,6,POINT (121.0485949 14.4716356)
8,glo-txn-06900,glo-sub-046,glo-cel-024,2023-06-01,7,POINT (121.0476021 14.4654285)
11,glo-txn-06901,glo-sub-046,glo-cel-042,2023-06-01,10,POINT (121.0548302 14.4679851)
13,glo-txn-06902,glo-sub-046,glo-cel-047,2023-06-01,13,POINT (121.0561115 14.4711265)


In [35]:
loc_hrs = traj[["cel_uid", "transaction_hr"]].values.tolist()
od_loc_hrs = list(zip(loc_hrs, loc_hrs[1:]))

In [36]:
cels = list(set(traj.cel_uid.tolist()))

In [37]:
# Accumulate, per origin cell site, the hours elapsed until the next transaction.
loc_hr_counter = dict.fromkeys(cels, 0) # THIS IS THE HASHMAP
for (cel, orig_hr), (_dest_cel, dest_hr) in od_loc_hrs:
    # Modulo 24 covers all three cases with hour-of-day data only:
    #   later hour   -> plain difference
    #   earlier hour -> the next transaction falls on another day (24 - |diff|)
    #   same hour    -> 0 (previously `time_elapsed` was left stale or undefined here)
    # NOTE(review): gaps of 24 hours or more cannot be told apart without
    # transaction_dt; confirm this approximation is acceptable.
    time_elapsed = (dest_hr - orig_hr) % 24
    loc_hr_counter[cel] += time_elapsed # I UPDATED THE HASHMAP

In [38]:
loc_hr_counter

{'glo-cel-040': 46,
 'glo-cel-050': 7,
 'glo-cel-035': 4,
 'glo-cel-054': 85,
 'glo-cel-057': 13,
 'glo-cel-029': 20,
 'glo-cel-061': 4,
 'glo-cel-056': 15,
 'glo-cel-047': 1,
 'glo-cel-052': 9,
 'glo-cel-062': 1,
 'glo-cel-043': 11,
 'glo-cel-051': 19,
 'glo-cel-038': 11,
 'glo-cel-036': 41,
 'glo-cel-042': 5,
 'glo-cel-044': 7,
 'glo-cel-041': 1,
 'glo-cel-024': 5,
 'glo-cel-063': 26,
 'glo-cel-046': 23,
 'glo-cel-039': 29,
 'glo-cel-049': 37,
 'glo-cel-055': 106,
 'glo-cel-048': 7,
 'glo-cel-025': 7,
 'glo-cel-045': 65,
 'glo-cel-067': 94,
 'glo-cel-064': 15}

In [39]:
traj # sanity check

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-06898,glo-sub-046,glo-cel-025,2023-06-01,5,POINT (121.0476753 14.4772396)
3,glo-txn-06899,glo-sub-046,glo-cel-029,2023-06-01,6,POINT (121.0485949 14.4716356)
8,glo-txn-06900,glo-sub-046,glo-cel-024,2023-06-01,7,POINT (121.0476021 14.4654285)
11,glo-txn-06901,glo-sub-046,glo-cel-042,2023-06-01,10,POINT (121.0548302 14.4679851)
13,glo-txn-06902,glo-sub-046,glo-cel-047,2023-06-01,13,POINT (121.0561115 14.4711265)
12,glo-txn-06903,glo-sub-046,glo-cel-042,2023-06-01,14,POINT (121.0548302 14.4679851)
9,glo-txn-06904,glo-sub-046,glo-cel-024,2023-06-01,16,POINT (121.0476021 14.4654285)
4,glo-txn-06905,glo-sub-046,glo-cel-029,2023-06-01,17,POINT (121.0485949 14.4716356)
1,glo-txn-06906,glo-sub-046,glo-cel-025,2023-06-01,18,POINT (121.0476753 14.4772396)
5,glo-txn-06907,glo-sub-046,glo-cel-029,2023-06-01,22,POINT (121.0485949 14.4716356)


In [40]:
proba_per_site = [hr_spent/sum(loc_hr_counter.values()) for hr_spent in loc_hr_counter.values()]

In [41]:
np.sum(proba_per_site)

1.0

In [42]:
activity_entropy = reduce(lambda a, b: a + b, map(lambda p: p*np.log10(1/p), proba_per_site))
print(activity_entropy)

# TODO: add logbase2
# try float dtype

1.2287708523253977


In [43]:
def calc_activity_entropy(traj):
    """Compute the entropy (log base 10) of the time a subscriber spends per cell site.

    Dwell time at a site is the number of hours between a transaction at that
    site and the next transaction in ``traj``, taken from the hour of day only.
    The final transaction has no successor and contributes no dwell time.

    Parameters
    ----------
    traj : pd.DataFrame
        Trajectory with ``cel_uid`` and ``transaction_hr`` columns, rows already
        in chronological order.

    Returns
    -------
    float or None
        ``sum(p * log10(1 / p))`` over the sites with non-zero dwell time, where
        ``p`` is the site's share of the total dwell time. ``None`` when no dwell
        time can be derived (for example fewer than two transactions).
    """
    cels = traj.cel_uid.tolist()
    hrs = traj.transaction_hr.tolist()

    loc_hr_counter = dict.fromkeys(cels, 0)
    # zip stops at the shortest input, so `cel`/`orig_hr` walk every row but the last.
    for cel, orig_hr, dest_hr in zip(cels, hrs, hrs[1:]):
        # Modulo 24: later hour -> plain difference; earlier hour -> the next
        # transaction is on another day (24 - |diff|); same hour -> 0. The original
        # branches left `time_elapsed` stale or undefined in the equal-hour case.
        # NOTE(review): gaps of 24 hours or more are not captured; confirm acceptable.
        loc_hr_counter[cel] += (dest_hr - orig_hr) % 24

    total_hrs = sum(loc_hr_counter.values())
    if total_hrs == 0:
        # Nothing to build a distribution from; keep the original None result
        # without hiding unrelated errors behind a bare `except`.
        return None

    # time spent on a single site vs time spent on all sites
    # Sites with zero dwell time are skipped: p * log(1/p) tends to 0 as p -> 0,
    # and evaluating 1/0 used to void the entropy of the whole trajectory.
    proba_per_site = [
        hr_spent / total_hrs
        for hr_spent in loc_hr_counter.values()
        if hr_spent > 0
    ]

    # sum(p * log(1/p)) where p is the proba for a single site
    return sum(p * np.log10(1 / p) for p in proba_per_site)

In [44]:
# Score every subscriber in scoring_base: fetch the trajectory for the shared
# `date`/`window` and record its activity entropy, in sub_uid row order.
# Note: this loop rebinds the notebook-level `sub`, `traj` and `activity_entropy`.
activity_entropys = []
for sub in scoring_base.sub_uid.tolist():
    traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)
    activity_entropy = calc_activity_entropy(traj)
    activity_entropys.append(activity_entropy)

# The list is positionally aligned with scoring_base rows, so it can be assigned as a column.
scoring_base["activity_entropy"] = activity_entropys

In [45]:
scoring_base.sample(5)

Unnamed: 0,sub_uid,gender,age,name,chi_indicator,ewallet_user_indicator,total_travel_distance,radius_of_gyration,activity_entropy
4,glo-sub-005,male,41,Stephen Barrett,False,Y,59372.71351,1123.953469,1.404239
51,glo-sub-052,female,60,Victoria Davis,True,N,63698.991406,1255.827815,1.274839
73,glo-sub-074,male,59,Austin Shelton,False,Y,54565.860869,437.301958,0.763808
5,glo-sub-006,male,30,Ronald Lester,False,N,49404.280524,750.467398,1.204405
12,glo-sub-013,female,70,Sharon Hogan,False,Y,62534.631257,583.661584,0.884228


In [46]:
scoring_base.total_travel_distance.describe().iloc[1:]

mean    55157.115296
std      7521.101911
min     39499.305833
25%     49938.614556
50%     54628.834575
75%     59577.137135
max     72655.926457
Name: total_travel_distance, dtype: float64

In [47]:
scoring_base.radius_of_gyration.describe().iloc[1:]

mean    1184.597493
std      373.011130
min      437.301958
25%      971.721602
50%     1150.023233
75%     1354.398274
max     2959.760407
Name: radius_of_gyration, dtype: float64

In [48]:
scoring_base.activity_entropy.describe().iloc[1:]

mean    1.255673
std     0.163772
min     0.763808
25%     1.148662
50%     1.283184
75%     1.379630
max     1.526844
Name: activity_entropy, dtype: float64

In [49]:
sub = fake_subscribers.sample(1).sub_uid.item()
traj = get_sub_traj(sub, date, window, fake_transactions, fake_cellsites)

In [50]:
traj.head()

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr,coords
0,glo-txn-00001,glo-sub-001,glo-cel-082,2023-06-01,3,POINT (121.0732848 14.5244576)
5,glo-txn-00002,glo-sub-001,glo-cel-086,2023-06-01,15,POINT (121.0750697 14.521689)
11,glo-txn-00003,glo-sub-001,glo-cel-084,2023-06-01,20,POINT (121.0735688 14.5181671)
6,glo-txn-00004,glo-sub-001,glo-cel-086,2023-06-01,21,POINT (121.0750697 14.521689)
12,glo-txn-00005,glo-sub-001,glo-cel-084,2023-06-01,23,POINT (121.0735688 14.5181671)


In [51]:
calc_activity_entropy(traj)

1.3153954278432864

In [52]:
calc_radius_of_gyration(traj)

1219.4454644797752

In [53]:
calc_total_travel_distance(traj)

55610.046496628296

In [54]:
def get_route_fig(r):
    """Draw a route as a thick line with its endpoints highlighted.

    Parameters
    ----------
    r : geometry exposing ``.xy`` (e.g. a shapely LineString)
        The route to draw.

    Returns
    -------
    matplotlib.figure.Figure
        A figure with the axes hidden, the first vertex marked in red and the
        last vertex in green. The figure is closed before returning, so it is
        not rendered automatically by the notebook.
    """
    fig, ax = plt.subplots(1, 1)

    # Read the coordinate arrays once; the endpoints are their first/last entries.
    xs, ys = r.xy
    start_pt = shapely.geometry.Point([xs[0], ys[0]])
    end_pt = shapely.geometry.Point([xs[-1], ys[-1]])

    # Route underneath (zorder=1), endpoint markers on top (zorder=2).
    gpd.GeoSeries(r).plot(ax=ax, linewidth=5, zorder=1)
    gpd.GeoSeries(start_pt).plot(ax=ax, color="red", markersize=250, zorder=2, alpha=0.8)
    gpd.GeoSeries(end_pt).plot(ax=ax, color="green", markersize=250, zorder=2, alpha=0.8)

    ax.set_axis_off()
    ax.ticklabel_format(useOffset=False)
    plt.close(fig)
    return fig

import os
import shapely
import geopandas as gpd
import matplotlib.pyplot as plt
from PIL import Image
import shutil

In [56]:
# NOTE(review): in the visible notebook `d` is only assigned in a later cell (In [61]),
# so on a fresh Restart & Run All this cell would presumably raise NameError; confirm.
d

Unnamed: 0,txn_uid,sub_uid,cel_uid,transaction_dt,transaction_hr
0,glo-txn-00001,glo-sub-001,glo-cel-082,2023-06-01,3
1,glo-txn-00002,glo-sub-001,glo-cel-086,2023-06-01,15
2,glo-txn-00003,glo-sub-001,glo-cel-084,2023-06-01,20
3,glo-txn-00004,glo-sub-001,glo-cel-086,2023-06-01,21
4,glo-txn-00005,glo-sub-001,glo-cel-084,2023-06-01,23
...,...,...,...,...,...
15085,glo-txn-15086,glo-sub-100,glo-cel-040,2023-06-29,12
15086,glo-txn-15087,glo-sub-100,glo-cel-040,2023-06-30,0
15087,glo-txn-15088,glo-sub-100,glo-cel-049,2023-06-30,3
15088,glo-txn-15089,glo-sub-100,glo-cel-057,2023-06-30,12


In [61]:
# day in a life of a sub
d = fake_transactions.copy()
sample_sub = d.sample(1).sub_uid.item()
d = d.loc[d.sub_uid==sample_sub]
days = d.transaction_dt.unique().tolist()
route_figs = []

In [66]:
# Start from an empty list so re-running this cell does not append duplicate figures.
route_figs = []
for sample_day in days:
    # One combined mask on fake_transactions, instead of chaining .loc with a mask
    # built from a different (unfiltered) frame.
    day_mask = (
        (fake_transactions.sub_uid == sample_sub)
        & (fake_transactions.transaction_dt == sample_day)
    )
    sites = fake_transactions.loc[day_mask, "cel_uid"].tolist()
    # NOTE(review): the recorded output shows "AttributeError: 'DataFrame' object has
    # no attribute 'uid'", presumably raised inside convert_cel_to_point; verify
    # that helper reads the `cel_uid` column of fake_cellsites.
    points = [convert_cel_to_point(site, fake_cellsites) for site in sites]
    if len(points) < 2:
        # A LineString needs at least two points; a day with a single transaction
        # has no route to draw.
        continue
    r = shapely.geometry.LineString(points)
    route_figs.append(get_route_fig(r))

AttributeError: 'DataFrame' object has no attribute 'uid'

In [None]:
# Scratch directory for the rendered routes. os.mkdir fails if it already exists,
# which protects any unrelated content from the cleanup below.
os.mkdir("../sample/")

try:
    for idx, fig in enumerate(route_figs, start=1):
        fname = "../sample/{}_tmp.jpg".format(idx)
        fig.savefig(fname)

    imgs = load_images("../sample")
    plot_images(imgs)
finally:
    # Always remove the scratch directory, even when saving or plotting fails;
    # otherwise the next run would stop at os.mkdir with FileExistsError.
    shutil.rmtree("../sample")