In [1]:
import os

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from shapely.geometry import Point
from walkscore.api import WalkScoreAPI

In [2]:
# Folder that holds the input data sets read by this notebook
# (the urbanization raster and the walkability shapefiles).
DATA_FOLDER = "../demographics_data/"
# Walk Score API key. Read from the WALKSCORE_API_KEY environment variable so
# the real key never has to be saved inside the notebook; when the variable is
# not set, the original "mykey" placeholder is used.
apikey = os.environ.get("WALKSCORE_API_KEY", "mykey")

In [3]:
# Filtering thresholds applied before the results are written out.
N = 1000                  # minimum number of users a country needs to be kept
MIN_TIME_COVERAGE = 0.7   # users must have a time coverage above this value

# Parquet inputs produced upstream.
DEMOGRAPHICS_FOLDER = "/data/work/user/laura_data/users_scales_urbanization/"
SCALES_FOLDER = "/data/work/user/laura_data/scales_by_user_0.5/"


# 1) Process urbanization data

In [4]:

# Reproject the GHS-SMOD raster to EPSG:4326 and write it to `new_filename`,
# so that it can later be sampled with (lon, lat) coordinates.
filename = DATA_FOLDER + 'GHS_SMOD_POP2015_GLOBE_R2016A_54009_1k_v1_0/GHS_SMOD_POP2015_GLOBE_R2016A_54009_1k_v1_0.tif'
new_filename = DATA_FOLDER + 'wgs_urb.tif'

# `proj` is not referenced below: the source CRS is read from the file itself.
proj = "EPSG:54009"
dst_crs = 'EPSG:4326'

with rasterio.open(filename) as src:
    # Grid of the output raster: affine transform plus pixel dimensions.
    transform, width, height = calculate_default_transform(
        src.crs, dst_crs, src.width, src.height, *src.bounds
    )

    # Reuse the source metadata, overriding only what the reprojection changes.
    kwargs = dict(
        src.meta,
        crs=dst_crs,
        transform=transform,
        width=width,
        height=height,
    )

    with rasterio.open(new_filename, 'w', **kwargs) as dst:
        # Bands are 1-indexed; each one is reprojected into the same band
        # of the destination file.
        for band_index in range(1, src.count + 1):
            reproject(
                source=rasterio.band(src, band_index),
                destination=rasterio.band(dst, band_index),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=dst_crs,
                # Nearest-neighbour resampling: output cells take the value of
                # an input cell, no interpolation between classes.
                resampling=Resampling.nearest,
            )
            

            


# 2) Merge scales and demographics data

In [5]:

# Load the two input tables from their parquet folders.
df_demographics = pd.read_parquet(path=DEMOGRAPHICS_FOLDER)
df_scales = pd.read_parquet(path=SCALES_FOLDER)


In [6]:
# Left-join the demographics onto the scales table by user id, so every row of
# df_scales is kept even when no demographics are available for that user.
df_merged = pd.merge(df_scales, df_demographics, how='left', on="useruuid")

# Store each entry of 'sizes' in ascending order.
df_merged['sizes'] = df_merged['sizes'].apply(sorted)

# n_scales is the number of entries in 'scales' plus one.
df_merged['n_scales'] = df_merged['scales'].apply(lambda scales: len(scales) + 1)

# 3) Add urbanization

In [7]:
# Look up the urbanization raster value at every home location.
# This may take some time.
# Each "home" entry is flipped from [0], [1] order into [1], [0] order
# (presumably (lat, lon) -> (lon, lat); verify against the upstream data).
lon_lat = [[home[1], home[0]] for home in df_merged["home"].values]

with rasterio.open(new_filename) as urbanization_raster:
    # sample() yields one array of band values per point; keep the first band.
    urbanizations = [
        band_values[0] for band_values in urbanization_raster.sample(lon_lat)
    ]


In [8]:
# Store the sampled raster values as a new column; `urbanizations` was built
# from df_merged["home"] in row order, so it aligns with df_merged positionally.
df_merged['urbanization'] = urbanizations

# 4) Add walkability for selected countries

In [9]:
# Client used below to query one Walk Score per census block.
walkscore_api = WalkScoreAPI(api_key = apikey)

In [None]:

# Compute a walkability score for the users of the selected countries.
#
# For each country:
#   1. read the census-block shapefile and compute each block's centroid,
#   2. locate every user's home inside a block (spatial join),
#   3. query the Walk Score API once per distinct block, at the block centroid,
#   4. map the block score back onto the users.
# The result, `walkability_scores`, has one 'useruuid'/'score' row per joined user.

countries = ['United States','Australia','Canada','New Zealand']

shapefile_names = [DATA_FOLDER+'/walkability/us_shapefile.shp',
                   DATA_FOLDER+'/walkability/australia_blocks.shp',
                   DATA_FOLDER+'/walkability/ldb_000a16a_e.shp',
                   DATA_FOLDER+'/walkability/meshblock-2018-generalised.shp']

# Name of the block-identifier column in each shapefile (same order as above).
ids_blocks = ['GEOID','MB_16PID','DBUID','MB2018_V1_']
# CRS assigned to each block layer after reading (same order as above).
crs = [None, None, {'init':'epsg:3347'}, None]

data_countries = []
for country, filename, block_id, initial_crs in zip(countries,
                                                    shapefile_names,
                                                    ids_blocks,
                                                    crs):
    print(country)

    # Read blocks file
    print('Reading shapefile')
    gdf = gpd.read_file(filename)
    blocks = gdf[[block_id,'geometry']].dropna()
    # NOTE(review): assigning .crs only relabels the layer, it does not
    # reproject the coordinates. Home points below are built as (lon, lat);
    # if a shapefile is stored in a projected CRS (e.g. the epsg:3347 entry),
    # the join and the centroids sent to the API would not be in lon/lat.
    # Confirm against the actual files.
    blocks.crs = initial_crs
    blocks['centroid'] = blocks['geometry'].centroid

    # Scales data: one point geometry per user home in this country.
    print('Selecting scales data')
    # .copy() so the new 'geometry' column is not written onto a slice of df_merged.
    data_country = df_merged[df_merged.NAME_0==country][['home','useruuid']].copy()
    data_country['geometry'] = data_country['home'].apply(lambda x:Point(tuple((x[1],x[0]))))
    data_country = gpd.GeoDataFrame(data_country, geometry = 'geometry')

    # Merge the two: a left join keeps users that fall in no block.
    print('Merging geometries')
    # NOTE(review): newer geopandas releases rename `op` to `predicate` — confirm
    # against the installed version before changing.
    data_country = gpd.sjoin(data_country, blocks, op = 'within', how = 'left')

    # Find walkability: one API call per distinct block rather than per user.
    # The block count is deliberately NOT stored in `N`: that name holds the
    # minimum-users-per-country threshold used later when filtering countries.
    unique_id = data_country[[block_id,'centroid']].drop_duplicates(subset = [block_id]).copy()
    scores = []
    for point in unique_id['centroid'].values:
        try:
            # A point's x is the longitude and y the latitude (given lon/lat
            # data); get_score receives y first, then x, exactly as before.
            lon, lat = point.x, point.y
            result = walkscore_api.get_score(lat, lon)
            scores.append(result.walk_score)
        except Exception:
            # Best effort: a failed lookup (API error, missing centroid for
            # users without a block) gives NaN instead of stopping the run.
            # `Exception` rather than a bare except, so Ctrl-C still interrupts.
            scores.append(np.nan)

    unique_id['score'] = scores
    data_country['score'] = data_country[block_id].map(dict(zip(unique_id[block_id], unique_id.score)))
    data_countries.append(data_country)
walkability_scores = pd.concat([i[['useruuid','score']] for i in data_countries])

United States
Reading shapefile
Selecting scales data
Merging geometries


In [None]:
# Attach the walkability scores to the merged table. The outer join keeps
# users that appear in only one of the two tables.
# Note: df_merged is overwritten, so re-running this cell merges the scores again.
df_merged = walkability_scores.merge(df_merged, on='useruuid', how='outer')

# 5) Write to file

In [None]:
# Count users per (country, gender), reshape to one row per country with one
# column per gender, then keep the countries whose FEMALE + MALE count exceeds N.
# A country missing one of the two gender columns sums to NaN and is dropped.
users_per_group = (
    df_merged
    .groupby(['NAME_0','gender'])
    .size()
    .sort_values(ascending = False)
    .reset_index()
)
countries_size = users_per_group.pivot_table(values = 0, index = ['NAME_0'], columns = ['gender'])

has_enough_users = (countries_size['FEMALE'] + countries_size['MALE']) > N
countries = countries_size[has_enough_users].index.values

In [None]:
# Keep only users with enough time coverage who live in a selected country,
# label them as urban/rural and save the columns needed for Figure 2.
def _urban_rural_label(urbanization):
    """Map an urbanization value to 'urban' (== 3), 'rural' (< 2) or NaN (anything else)."""
    if urbanization == 3:
        return 'urban'
    if urbanization < 2:
        return 'rural'
    return np.nan


has_coverage = df_merged['time_coverage'] > MIN_TIME_COVERAGE
in_selected_country = df_merged['NAME_0'].isin(countries)
df_merged_filtered = df_merged[has_coverage & in_selected_country].copy()

df_merged_filtered['urban_rural'] = df_merged_filtered['urbanization'].apply(_urban_rural_label)

figure_columns = ['NAME_0','gender','urban_rural','n_scales','sizes',"score"]
df_merged_filtered[figure_columns].to_pickle("../outputs/Figure2/data.pkl")

In [None]:
#### Remove small groups (for data-sharing purposes): only
#### (country, gender, urban_rural) groups with more than 5 individuals are kept.
group_keys = ['NAME_0','gender','urban_rural']
size_by_group = df_merged_filtered.groupby(group_keys).size()
large_groups = size_by_group[size_by_group > 5].reset_index()

# The inner join drops every row whose group is not among the large groups.
shareable_columns = ['NAME_0','gender','urban_rural','n_scales','sizes']
rows_in_large_groups = df_merged_filtered.merge(large_groups, on = group_keys, how = 'inner')
df_merged_filtered_shareable = rows_in_large_groups[shareable_columns].copy()

df_merged_filtered_shareable.to_pickle("../outputs/Figure2/data_safe.pkl")

In [None]:
# Also export the shareable table as CSV, without the index column.
df_merged_filtered_shareable.to_csv("../outputs/Figure2/data_safe.csv",index=False)