Script to download "2016" (really 2018-2019 but using the images determined by the script that was run on the 2016 data) Malawi images

In [1]:
import pandas as pd
import numpy as np

In [2]:
!ls input/LSMS

malawi				  tanzania
Malawi 2013 LSMS (Cluster).txt	  Tanzania 2013 LSMS (Cluster).txt
Malawi 2013 LSMS (Household).txt  Tanzania 2013 LSMS (Household).txt
malawi_2016			  uganda
Malawi_2016_LSMS_(Cluster).txt	  Uganda 2012 LSMS (Cluster).txt
Malawi_2016_LSMS_(Household).txt  Uganda 2012 LSMS (Household).txt
pooled


In [3]:
df_mw = pd.read_csv('input/LSMS/malawi_2016/candidate_download_locs.txt', sep=' ', header=None, names=['im_lat', 'im_lon', 'clust_lat', 'clust_lon'])

In [4]:
df_mw.shape

(90943, 4)

# Create Datasets

Each country in this folder should have:
1. 'nightlights.npy'
2. 'consumptions.npy'
These are aggregated at a cluster level.

This function will add in these values at the cluster level for each image. That is, we now get a dataframe that has cluster nightlight and consumption values for each image.

In [6]:
def create_df(country, df_orig):
    c_nightlight = np.load('input/LSMS/{}/nightlights.npy'.format(country))
    c_consumption = np.load('input/LSMS/{}/consumptions.npy'.format(country))
    c_groups = df_orig.groupby(['clust_lat', 'clust_lon'])
    counts = c_groups.count()
    counts['nightlight'] = c_nightlight
    counts['consumption'] = c_consumption
    counts = counts.reset_index().drop(['im_lat', 'im_lon'], axis=1)
    df_c = pd.merge(left=df_orig, right=counts, on=['clust_lat', 'clust_lon'])
    return df_c

In [8]:
df_mw = create_df('malawi_2016', df_mw)

In [9]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption
0,-17.05,35.174999,-17.09515,35.217213,0.0,2.039307
1,-17.058333,35.174999,-17.09515,35.217213,0.0,2.039307
2,-17.066666,35.174999,-17.09515,35.217213,0.0,2.039307
3,-17.075,35.174999,-17.09515,35.217213,0.0,2.039307
4,-17.083333,35.174999,-17.09515,35.217213,0.0,2.039307


In [11]:
df_mw.shape

(90943, 6)

In [12]:
df_mw.to_csv('mw_2016_guide.csv', index=False)

In [13]:
# we don't need to download repeat images, as that reduces our download size significantly
# some images can belong to 2+ clusters
print(df_mw.drop_duplicates(['im_lat', 'im_lon']).shape)

(54716, 6)


In [14]:
mw_download = df_mw.drop_duplicates(['im_lat', 'im_lon'])

In [15]:
# most nightlights are 0
# let's download images that have nonzero nightlights to induce variety into the model
print((mw_download['nightlight'] == 0).mean())

0.600025586665692


In [16]:
# let's drop 75% of 0 nightlight images...
def drop_0s(df, frac=0.9):
    z_inds = np.argwhere(df['nightlight'].values == 0).reshape(-1)
    drop = np.random.choice(z_inds, int(frac*len(z_inds))).reshape(-1)
    return df.drop(df.index[drop])

In [17]:
mw_download = drop_0s(mw_download)

NameError: name 'tanz_download' is not defined

In [18]:
(mw_download['nightlight']==0).mean()

0.37748890658778017

In [19]:
mw_download.shape

(35156, 6)

In [20]:
mw_download = drop_0s(mw_download, frac=0.75)

In [21]:
(mw_download['nightlight']==0).mean()

0.22153452139579555

In [22]:
mw_download.shape

(28113, 6)

In [None]:
mw_download.to_csv('mw_2016_download.csv', index=False)

mw_download and mw_guide can be merged, as we didn't bother to redownload the same image, and also we removed some images so that our nightlight value is not just 0's.

# Download Images

Now we actually download images

In [25]:
"""Interface for downloading aerial imagery from Mapbox.
"""

import requests
from PIL import Image
from io import BytesIO
import os.path
import numpy as np
from mapbox import Maps # alternative mapbox api, is MUCH worse than google maps API
import matplotlib
matplotlib.use('PS')
import matplotlib.pyplot as plt
import math

def deg_to_tile(lat_deg, lon_deg, zoom):
    """Converts coordinates into the nearest x,y Slippy Map tile
    Only needed for Mapbox, which is not the default method
    """
    
    lat_rad = math.radians(lat_deg)
    n = 2.0 ** zoom
    xtile = int((lon_deg + 180.0) / 360.0 * n)
    ytile = int((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad)))
                 / math.pi) / 2.0 * n)
    return (xtile, ytile)

class ImageryDownloader:
    def __init__(self, access_token):
        """Initializes the object with an access token"""
        self.access_token = access_token
        self.url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&zoom={}&size=400x400&maptype=satellite&key={}'
#         self.maps = Maps(access_token=access_token)
    
    def download_tile(self, lat, long, zoom):
        """Downloads lat long
        """
        res = requests.get(self.url.format(lat, long, zoom, self.access_token))
        image = Image.open(BytesIO(res.content))
        # x,y = deg_to-tile(lat, long, zoom)
#         response = self.maps.tile("mapbox.satellite", x, y, zoom)
#         image = Image.open(BytesIO(response.content))

        return image
    
import math

In [26]:
access = 'AIzaSyBeJiUKJvtwgFqRJMw2S-cd2k02EkXIwbM' # google
# access = 'pk.eyJ1Ijoiam1hdGhlcjI1IiwiYSI6ImNrMWszNHoxNjI3c3EzZG50MTVheHFiZzgifQ.Kd97iNsyrrGbYCEHRuwAeA' # mapbox
im_downloader = ImageryDownloader(access)

In [30]:
!mkdir ims_malawi_2016

In [None]:
im_names = []
zoom = 16
for i, r in mw_download.iterrows():
    lat = r.im_lat
    long = r.im_lon
    xtile, ytile = deg_to_tile(lat, long, zoom) # max zoom
    try:
        im = im_downloader.download_tile(xtile, ytile, zoom)
        name = str(lat) + '_' + str(long)
        im.save('ims_malawi_2016/{}.png'.format(name))
        im_names.append(name + '.png')
    except:
        im_names.append(np.nan)
    if i % 100 == 0:
        # the counting is kind of off for some reason
        print(i, end=', ')
        
mw_download['images'] = im_names
mw_download.to_csv('mw_2016_download_info.csv', index=False)

0, 200, 400, 800, 900, 1600, 1800, 2500, 3000, 3200, 3600, 3700, 4600, 4800, 5400, 6000, 6200, 6500, 6900, 7000, 7100, 7200, 7700, 7800, 8300, 8400, 8800, 9000, 9200, 9300, 10000, 10500, 11100, 11200, 11300, 11400, 11500, 11700, 11800, 11900, 