<a href="https://colab.research.google.com/github/kywch/geo-colab/blob/master/Generate_Chicago_Traces.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary packages
import os, os.path
from glob import glob
from shutil import copyfile

import pandas as pd

from secrets import choice


In [2]:
# Mount Google Drive at /content/drive so the files stored under "My Drive"
# become readable through the local filesystem.
# `google.colab` is the Colab runtime package, so this cell is Colab-specific.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Collect every gzipped CSV in the shared synthetic-population folder.
# The pattern is relative, so it only matches when the working directory is the
# parent of the mounted `drive` directory.
files = glob("drive/My Drive/SatTemp/chicago_synthpop_V2/*.csv.gz")
print(files)

['drive/My Drive/SatTemp/chicago_synthpop_V2/persons_v2.csv.gz', 'drive/My Drive/SatTemp/chicago_synthpop_V2/places_v2_256p_lb_v2.csv.gz', 'drive/My Drive/SatTemp/chicago_synthpop_V2/activities_v2.csv.gz', 'drive/My Drive/SatTemp/chicago_synthpop_V2/places_v2.csv.gz', 'drive/My Drive/SatTemp/chicago_synthpop_V2/places_60615.csv.gz', 'drive/My Drive/SatTemp/chicago_synthpop_V2/persons_60615.csv.gz']


In [4]:
# Start from an empty local staging directory so stale copies from an earlier
# run never mix with freshly copied files. Pure Python replaces the shell call,
# and makedirs(exist_ok=True) replaces an existence check that could never be
# true right after the directory had been removed.
from shutil import rmtree

rmtree('synthpop', ignore_errors=True)
os.makedirs('synthpop', exist_ok=True)

In [5]:
# Stage each Drive file on the Colab disk, keeping its base name
for src_path in files:
  dst_path = 'synthpop/' + os.path.basename(src_path)
  print(src_path)
  copyfile(src_path, dst_path)

drive/My Drive/SatTemp/chicago_synthpop_V2/persons_v2.csv.gz
drive/My Drive/SatTemp/chicago_synthpop_V2/places_v2_256p_lb_v2.csv.gz
drive/My Drive/SatTemp/chicago_synthpop_V2/activities_v2.csv.gz
drive/My Drive/SatTemp/chicago_synthpop_V2/places_v2.csv.gz
drive/My Drive/SatTemp/chicago_synthpop_V2/places_60615.csv.gz
drive/My Drive/SatTemp/chicago_synthpop_V2/persons_60615.csv.gz


In [6]:
# look at people
# Load the full persons table and list its columns and dtypes.
# NOTE(review): the saved output of this cell starts with the tail of a warning
# message; presumably pandas' mixed-type DtypeWarning -- confirm on a fresh run.
persons_df = pd.read_csv('synthpop/persons_v2.csv.gz')
persons_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2927761 entries, 0 to 2927760
Data columns (total 23 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   numeric_id                int64  
 1   person_id                 object 
 2   hh_id                     float64
 3   relate                    float64
 4   sex                       int64  
 5   age                       int64  
 6   gq_id                     float64
 7   school_id                 float64
 8   work_id                   float64
 9   daycare_id                float64
 10  gym_id                    float64
 11  hospital_id               float64
 12  jail                      float64
 13  jail_id                   int64  
 14  rank                      int64  
 15  weekday_schedule_id_list  object 
 16  weekend_schedule_id_list  object 
 17  otherhh_id                object 
 18  home_restaurant_id        object 
 19  recreation_id             object 
 20  grocery_id              

In [7]:
# places: load the table and force the id column to int64 in one chained step
places_df = (
  pd.read_csv('synthpop/places_v2.csv.gz')
    .astype({'numeric_id': 'int64'})
)
places_df.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1210722 entries, 0 to 1210721
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   numeric_id  1210722 non-null  int64  
 1   place_id    1210722 non-null  object 
 2   place_type  1210722 non-null  object 
 3   name        1093276 non-null  object 
 4   persons     1077336 non-null  float64
 5   latitude    1210668 non-null  float64
 6   longitude   1210668 non-null  float64
 7   zipcode     1204493 non-null  float64
 8   zip_grp     1194456 non-null  float64
 9   rank        1210722 non-null  int64  
dtypes: float64(5), int64(2), object(3)
memory usage: 92.4+ MB


In [8]:
# make it faster to look-up
# Re-index the places by numeric_id so a place can be fetched by label
# (plidx_df.loc[...]) instead of scanning the column with a boolean mask.
# set_index returns a new frame; places_df keeps its numeric_id column.
# NOTE(review): uniqueness of numeric_id is not checked here -- confirm.
plidx_df = places_df.set_index('numeric_id')
plidx_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1210722 entries, 1 to 1210721
Data columns (total 9 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   place_id    1210722 non-null  object 
 1   place_type  1210722 non-null  object 
 2   name        1093276 non-null  object 
 3   persons     1077336 non-null  float64
 4   latitude    1210668 non-null  float64
 5   longitude   1210668 non-null  float64
 6   zipcode     1204493 non-null  float64
 7   zip_grp     1194456 non-null  float64
 8   rank        1210722 non-null  int64  
dtypes: float64(5), int64(1), object(3)
memory usage: 92.4+ MB


In [9]:
# activities
# Load the activity table. The code further down selects rows by schedule_id
# and reads start_time, stop_time and Social_Act_Loc from them.
activity_df = pd.read_csv('synthpop/activities_v2.csv.gz')
activity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239781 entries, 0 to 239780
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Act_SlNo        239781 non-null  int64 
 1   start_time      239781 non-null  int64 
 2   stop_time       239781 non-null  int64 
 3   Social_Act_Loc  239781 non-null  object
 4   schedule_id     239781 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 9.1+ MB


In [10]:
# Compare the row count with the number of distinct person ids
# (equal numbers mean every person_id occurs exactly once)
n_person_rows = len(persons_df)
n_person_ids = persons_df['person_id'].nunique(dropna=False)
print(n_person_rows, n_person_ids)
persons_df.head()

2927761 2927761


Unnamed: 0,numeric_id,person_id,hh_id,relate,sex,age,gq_id,school_id,work_id,daycare_id,gym_id,hospital_id,jail,jail_id,rank,weekday_schedule_id_list,weekend_schedule_id_list,otherhh_id,home_restaurant_id,recreation_id,grocery_id,worship_id,work_restaurant_id
0,1,1595930,1181524.0,1.0,2,82,,,,579139.0,1181263.0,70.0,2e-06,1194221,100,5969|5974|5983|5911|5963|5943|5953|5898|5930|5971,12237|12229|12172|12145|12178|12207|12203|1216...,,1197516|1200938|1195562|1196781|1195562,1205986|1206926|1206147|1207623|1207411,1210303|1210303|1210303,1208521.0,1196871|1195562|1200643|1200938|1195361
1,28,1597745,1181535.0,1.0,2,82,,,,598594.0,1181263.0,70.0,2e-06,1194221,135,5914|5905|5908|5903|5921|5911|5930|5985|5965|5924,12207|12158|12180|12219|12172|12168|12216|1215...,,1198007|1202973|1201510|1202764|1201711,1204187|1204188|1207623|1204185|1205247,1210303|1210303|1210271,1208491.0,1199052|1201611|1196871|1201222|1200938
2,335,1830168,1181651.0,2.0,2,80,,,,579512.0,1181504.0,67.0,7.5e-05,1194221,194,5904|5963|5898|5921|5933|5902|5962|5965|5960|5903,12179|12200|12180|12152|12214|12216|12186|1214...,1106212.0,1198239|1199738|1201342|1195604|1203026,1206581|1204593|1208218|1204610|1206073,1210268|1210416|1210263,1209887.0,1195893|1203057|1200382|1195061|1199562
3,462,1833607,1181696.0,7.0,2,82,,,,579254.0,1181263.0,70.0,2e-06,1194221,242,5941|5914|5902|5957|5931|5898|5985|5920|5981|5963,12229|12232|12191|12186|12226|12200|12152|1214...,,1199128|1200185|1195781|1196368|1199809,1204184|1206557|1206926|1204185|1205727,1210303|1210271|1210303,1209989.0,1201510|1196631|1194644|1201711|1201711
4,656,1907903,1181820.0,1.0,2,80,,,,598172.0,1181374.0,26.0,0.066281,1194221,84,5898|5929|5930|5913|5982|5924|5915|5903|5916|5900,12185|12193|12189|12178|12144|12152|12206|1216...,,1202359|1198950|1202748|1197743|1194509,1206868|1203252|1206596|1206316|1205545,1210457|1210489|1210268,1208383.0,1201032|1195967|1198994|1199567|1202452


# Things to do
* join zip code to person -- sample 1000 each?
* for each person
  * grab the activity schedule
  * link place coordinates
  * make a row for each hour: 24 rows with coordinates
  * the coordinates for 24 hours will be used to query temperature (or heat stress)


  

In [11]:
# Use the first person (the row Series of the first iterrows() pair) as a test case
person_row = next(first_row for _label, first_row in persons_df.iterrows())
print(person_row)

numeric_id                                                                  1
person_id                                                             1595930
hh_id                                                             1.18152e+06
relate                                                                      1
sex                                                                         2
age                                                                        82
gq_id                                                                     NaN
school_id                                                                 NaN
work_id                                                                   NaN
daycare_id                                                             579139
gym_id                                                            1.18126e+06
hospital_id                                                                70
jail                                                            

In [12]:
# Time three ways of fetching one place by its numeric id:
#   1. boolean mask on the numeric_id column,
#   2. DataFrame.query on the same column,
#   3. label lookup on the frame indexed by numeric_id.
# In the saved run the indexed lookup was the fastest (135 µs vs 2.2 ms and 10.7 ms).
%timeit places_df[places_df['numeric_id'] == 1210303]
%timeit places_df.query('numeric_id == 1210303')
%timeit plidx_df.loc[1210303]

The slowest run took 88.40 times longer than the fastest. This could mean that an intermediate result is being cached.
100 loops, best of 3: 2.2 ms per loop
100 loops, best of 3: 10.7 ms per loop
The slowest run took 616.68 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 135 µs per loop


In [None]:
# Time fetching several places: one .loc call with a list of labels, versus one
# .loc call per label, versus a single scalar read with .iat.
# Note that .iat is positional: 1197517 and 4 are a row position and a column
# position, not a numeric_id label and a column name.
# NOTE(review): whether row position 1197517 holds the place labelled 1197516
# is not checked in this cell -- confirm before relying on positional reads.
%timeit plidx_df.loc[[1197516,1200938,1195562,1196781,1195562]]
%timeit [plidx_df.loc[1197516], plidx_df.loc[1200938], plidx_df.loc[1195562], plidx_df.loc[1196781], plidx_df.loc[1195562]]
%timeit plidx_df.iat[1197517, 4]

1000 loops, best of 3: 463 µs per loop
1000 loops, best of 3: 667 µs per loop
The slowest run took 6.56 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 17.5 µs per loop


In [15]:
def match_activity_with_coord(person_row, activity_df, sch_type='weekday', places=None):
  """Pick one of a person's schedules and attach a place and coordinates to every activity.

  Parameters
  ----------
  person_row : pandas.Series (or any mapping)
    One row of the persons table. It must provide ``<sch_type>_schedule_id_list``
    as a '|'-separated string of schedule ids. The ``*_id`` fields named in
    ``item_to_key`` below are read when the schedule visits that kind of place;
    each may be a single number or a '|'-separated string of candidate ids.
  activity_df : pandas.DataFrame
    Activity table with at least ``schedule_id`` and ``Social_Act_Loc`` columns.
  sch_type : str
    Prefix of the schedule-list column to draw from ('weekday' by default).
  places : pandas.DataFrame, optional
    Place table indexed by numeric place id, with ``latitude`` and ``longitude``
    columns. Defaults to the notebook-level ``plidx_df``.

  Returns
  -------
  pandas.DataFrame
    The rows of the chosen schedule (index reset) with four extra columns:
    ``item``, ``place_numeric``, ``lat`` and ``lng``. The last three are empty
    for activities whose place cannot be resolved.

  Notes
  -----
  The schedule, and the place when several candidates exist, are drawn with
  ``secrets.choice``, so results differ between calls and cannot be seeded.
  """
  if places is None:
    places = plidx_df

  # Activity label -> persons-table column that holds the matching place id(s).
  item_to_key = {
    'Household': 'hh_id',
    'Work Restaurant': 'work_restaurant_id',
    'Other': 'hh_id',
    'Other Household': 'otherhh_id',
    'Place of worship': 'worship_id',
    'Home Restaurant': 'home_restaurant_id',
    'Recreation': 'recreation_id',
    'Grocery store': 'grocery_id',
    'School': 'school_id',
    'Workplace': 'work_id',
    'Gym': 'gym_id',
    'Prison': 'jail_id'
  }

  # retrieve a schedule
  schedule_id = int(choice(person_row[sch_type + '_schedule_id_list'].split('|')))
  curr_activity = activity_df[activity_df['schedule_id'] == schedule_id].reset_index(drop=True)

  place_info = []
  prev_place = None

  for item in curr_activity['Social_Act_Loc'].to_list():
    if place_info and item == prev_place:
      # Consecutive rows of the same activity stay at the place already drawn.
      curr_info = place_info[-1]
    else:
      curr_info = [item, None, None, None]
      key = item_to_key.get(item)
      if key is not None:
        try:
          raw_id = person_row[key]
          if isinstance(raw_id, str):
            idx = int(choice(raw_id.split('|')))
          else:
            idx = int(raw_id)
          # Look the place up by its numeric-id label; the id is not a row position.
          # Assumes the ids in the index are unique -- TODO confirm.
          curr_info = [item, idx, places.at[idx, 'latitude'], places.at[idx, 'longitude']]
        except (KeyError, ValueError, TypeError, OverflowError):
          # Missing column, NaN or malformed id, or an id absent from the place
          # table: keep the unresolved placeholder for this activity.
          pass

    place_info.append(curr_info)
    prev_place = item

  # return the horizontally-concated dataframe
  resolved = pd.DataFrame(place_info, columns=['item', 'place_numeric', 'lat', 'lng'])
  return pd.concat([curr_activity, resolved], axis=1)

# Time one schedule-to-coordinates match for the test person.
%timeit match_activity_with_coord(person_row, activity_df)
# place_info is local to the function above, so it cannot be printed from this cell.

100 loops, best of 3: 2.52 ms per loop


In [16]:
# Each output row holds: hour, weight, social_act_loc, coordinates (lat/lng).
# Using .iat (instead of .iloc) gives a ~10x speed improvement.

def generate_hourly_activity(act_df):
  """Cut every activity into per-hour pieces and merge neighbouring pieces of the same activity.

  Columns of ``act_df`` are read by position: 1 is the start time, 2 the stop
  time, 3 the activity label, and 7 and 8 the two coordinate columns (the
  layout produced by ``match_activity_with_coord``). Times are divided by 60 to
  get the hour, i.e. they are treated as minutes.

  Parameters
  ----------
  act_df : pandas.DataFrame
    One schedule with its places attached, laid out as described above.

  Returns
  -------
  list of list
    ``[hour, weight, activity, column-7 value, column-8 value]`` entries in
    schedule order, where ``weight`` is the fraction of that hour spent on the
    activity. An empty ``act_df`` gives an empty list.
  """
  schedule = []
  for ii in range(act_df.shape[0]):
    # .iat scalar reads are much cheaper than .iloc row access here.
    start = act_df.iat[ii, 1]
    stop = act_df.iat[ii, 2]
    tail = [act_df.iat[ii, 3], act_df.iat[ii, 7], act_df.iat[ii, 8]]

    curr_hour = start // 60
    proc_time = start

    # The activity runs past the end of its first hour: emit the piece that
    # fills the rest of that hour and continue from the next hour boundary.
    next_boundary = 60 * (curr_hour + 1)
    if next_boundary < stop:
      schedule.append([curr_hour, (next_boundary - start) / 60] + tail)
      curr_hour += 1
      proc_time = 60 * curr_hour

    # Emit full hours while more than an hour remains, then the final piece.
    while proc_time < stop:
      if proc_time + 60 < stop:
        schedule.append([curr_hour, 1] + tail)
        proc_time += 60
        curr_hour += 1
      else:
        schedule.append([curr_hour, (stop - proc_time) / 60] + tail)
        proc_time = stop

  # make the schedule compact
  compact_schedule = []
  for item in schedule:
    last = compact_schedule[-1] if compact_schedule else None
    # if the hour and activity are the same, just merge the time
    if last is not None and item[0] == last[0] and item[2] == last[2]:
      last[1] += item[1]
    else:
      compact_schedule.append(item)

  # done
  return compact_schedule


In [17]:
# Draw one random person (no seed is set, so each run picks a different one)
# and time the schedule-to-coordinates match on it.
row = persons_df.sample().iloc[0]
%timeit match_activity_with_coord(row, activity_df)

# Keep one matched schedule and time its expansion into hourly entries.
test = match_activity_with_coord(row, activity_df)
%timeit generate_hourly_activity(test)

100 loops, best of 3: 2.81 ms per loop
100 loops, best of 3: 5.41 ms per loop


## join zip code to person -- sample 1000 each?

In [18]:
# Attach the home zip code to every person: hh_id is matched against the place
# index, and the inner join keeps only people whose hh_id is found there
pp_loc_df = pd.merge(
  persons_df,
  plidx_df['zipcode'],
  how='inner',
  left_on='hh_id',
  right_index=True,
)
pp_loc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2885837 entries, 0 to 2927564
Data columns (total 24 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   numeric_id                int64  
 1   person_id                 object 
 2   hh_id                     float64
 3   relate                    float64
 4   sex                       int64  
 5   age                       int64  
 6   gq_id                     float64
 7   school_id                 float64
 8   work_id                   float64
 9   daycare_id                float64
 10  gym_id                    float64
 11  hospital_id               float64
 12  jail                      float64
 13  jail_id                   int64  
 14  rank                      int64  
 15  weekday_schedule_id_list  object 
 16  weekend_schedule_id_list  object 
 17  otherhh_id                object 
 18  home_restaurant_id        object 
 19  recreation_id             object 
 20  grocery_id              

In [20]:
# Distinct home zip codes among the people that were matched to a place
zipcode = pd.unique(pp_loc_df['zipcode'])
len(zipcode)

56

In [42]:
# Keep only the zip codes that have more than 1000 residents
tmp_zip = pp_loc_df['zipcode'].value_counts()
zipcode = list(tmp_zip.loc[tmp_zip.gt(1000)].index)
len(zipcode)

53

In [43]:
# Draw 100 residents from each retained zip code and stack them in one frame.
# The per-zip samples are collected first and concatenated once: growing a frame
# with DataFrame.append inside a loop copies it on every step, and that method
# no longer exists in pandas 2.x. The comprehension variable also avoids leaving
# a notebook-level name that shadows the builtin `zip`.
# The draw is unseeded, so every run selects different people.
per_zip_samples = [
  pp_loc_df[pp_loc_df['zipcode'] == zip_code].sample(100)
  for zip_code in zipcode
]
# pd.concat rejects an empty list; fall back to an empty frame in that case.
sample = pd.concat(per_zip_samples) if per_zip_samples else pd.DataFrame()

sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5300 entries, 1987922 to 2118326
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   numeric_id                5300 non-null   int64  
 1   person_id                 5300 non-null   object 
 2   hh_id                     5300 non-null   float64
 3   relate                    5300 non-null   float64
 4   sex                       5300 non-null   int64  
 5   age                       5300 non-null   int64  
 6   gq_id                     0 non-null      float64
 7   school_id                 945 non-null    float64
 8   work_id                   2242 non-null   float64
 9   daycare_id                5300 non-null   float64
 10  gym_id                    5300 non-null   float64
 11  hospital_id               5300 non-null   float64
 12  jail                      5300 non-null   float64
 13  jail_id                   5300 non-null   int64  
 14 

In [45]:
# Save the sampled persons back to the shared Drive folder. No compression
# argument is given, so pandas infers gzip from the .gz suffix, and the row
# index is written as the first column.
sample.to_csv('drive/My Drive/SatTemp/chicago_synthpop_V2/sampled_persons.csv.gz')