In [1]:
import pandas as pd
import numpy as np

In [2]:
!ls input/LSMS

malawi				  Tanzania 2013 LSMS (Cluster).txt
Malawi 2013 LSMS (Cluster).txt	  Tanzania 2013 LSMS (Household).txt
Malawi 2013 LSMS (Household).txt  uganda
pooled				  Uganda 2012 LSMS (Cluster).txt
tanzania			  Uganda 2012 LSMS (Household).txt


In [3]:
df_mw = pd.read_csv('input/LSMS/malawi/candidate_download_locs.txt', sep=' ', header=None, names=['im_lat', 'im_lon', 'clust_lat', 'clust_lon'])
df_tanz = pd.read_csv('input/LSMS/tanzania/candidate_download_locs.txt', sep=' ', header=None, names=['im_lat', 'im_lon', 'clust_lat', 'clust_lon'])
df_ug = pd.read_csv('input/LSMS/uganda/candidate_download_locs.txt', sep=' ', header=None, names=['im_lat', 'im_lon', 'clust_lat', 'clust_lon'])

In [4]:
df_mw.shape, df_tanz.shape, df_ug.shape

((108308, 4), (44262, 4), (74726, 4))

In [5]:
!ls

DHS		   LSMS		    tanz_download.csv	    ug_download.csv
downlod_ims.ipynb  mw_download.csv  tanz_guide.csv	    ug_guide.csv
eval_prog.ipynb    mw_guide.csv     test_im_download.ipynb


# Create Datasets

In [6]:
def create_df(country, df_orig):
    c_nightlight = np.load('input/LSMS/{}/nightlights.npy'.format(country))
    c_consumption = np.load('input/LSMS/{}/consumptions.npy'.format(country))
    c_groups = df_orig.groupby(['clust_lat', 'clust_lon'])
    counts = c_groups.count()
    counts['nightlight'] = c_nightlight
    counts['consumption'] = c_consumption
    counts = counts.reset_index().drop(['im_lat', 'im_lon'], axis=1)
    df_c = pd.merge(left=df_orig, right=counts, on=['clust_lat', 'clust_lon'])
    return df_c

In [7]:
df_mw = create_df('malawi', df_mw)

In [8]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption
0,-16.941666,35.208332,-16.9855,35.2499,0.036855,1.951277
1,-16.95,35.208332,-16.9855,35.2499,0.036855,1.951277
2,-16.958333,35.208332,-16.9855,35.2499,0.036855,1.951277
3,-16.966666,35.208332,-16.9855,35.2499,0.036855,1.951277
4,-16.975,35.208332,-16.9855,35.2499,0.036855,1.951277


In [9]:
df_tanz = create_df('tanzania', df_tanz)

In [10]:
df_ug = create_df('uganda', df_ug)

In [11]:
df_mw.shape, df_tanz.shape, df_ug.shape

((108308, 6), (44262, 6), (74726, 6))

In [12]:
df_mw.to_csv('mw_guide.csv', index=False)
df_tanz.to_csv('tanz_guide.csv', index=False)
df_ug.to_csv('ug_guide.csv', index=False)

In [13]:
# we don't need to download repeat images
print(df_mw.drop_duplicates(['im_lat', 'im_lon']).shape)
print(df_tanz.drop_duplicates(['im_lat', 'im_lon']).shape)
print(df_ug.drop_duplicates(['im_lat', 'im_lon']).shape)

(49145, 6)
(22782, 6)
(48435, 6)


In [14]:
mw_download = df_mw.drop_duplicates(['im_lat', 'im_lon'])
tanz_download = df_tanz.drop_duplicates(['im_lat', 'im_lon'])
ug_download = df_ug.drop_duplicates(['im_lat', 'im_lon'])

In [15]:
print((mw_download['nightlight'] == 0).mean())
print((tanz_download['nightlight'] == 0).mean())
print((ug_download['nightlight'] == 0).mean())

0.5755214162173161
0.6453779299446932
0.7126664602043976


In [16]:
# let's drop 75% of 0 nightlight images...
def drop_0s(df, frac=0.9):
    z_inds = np.argwhere(df['nightlight'].values == 0).reshape(-1)
    drop = np.random.choice(z_inds, int(frac*len(z_inds))).reshape(-1)
    return df.drop(df.index[drop])

In [17]:
mw_download = drop_0s(mw_download)
tanz_download = drop_0s(tanz_download)
ug_download = drop_0s(ug_download)

In [18]:
(mw_download['nightlight']==0).mean(), (tanz_download['nightlight']==0).mean(), (ug_download['nightlight']==0).mean()

(0.35592330729568683, 0.4253094323516859, 0.5006458557588805)

In [19]:
mw_download.shape, tanz_download.shape, ug_download.shape

((32389, 6), (14058, 6), (27870, 6))

In [20]:
mw_download = drop_0s(mw_download, frac=0.75)
tanz_download = drop_0s(tanz_download, frac=0.75)
ug_download = drop_0s(ug_download, frac=0.75)

In [21]:
(mw_download['nightlight']==0).mean(), (tanz_download['nightlight']==0).mean(), (ug_download['nightlight']==0).mean()

(0.20894164043836033, 0.26050343249427915, 0.32366234144919087)

In [22]:
mw_download.shape, tanz_download.shape, ug_download.shape

((26371, 6), (10925, 6), (20577, 6))

# Create Download Datasets

Let's sample 40 clusters from each country.

In [27]:
mw_download.to_csv('mw_download.csv', index=False)
tanz_download.to_csv('tanz_download.csv', index=False)
ug_download.to_csv('ug_download.csv', index=False)

# Download Class

In [28]:
"""Interface for downloading aerial imagery from Mapbox.
"""

import requests
from PIL import Image
from io import BytesIO
import os.path
import numpy as np
from mapbox import Maps
import matplotlib
matplotlib.use('PS')
import matplotlib.pyplot as plt

class ImageryDownloader:

    def __init__(self, access_token, source='google'):
        """Initializes the object with a Mapbox access token"""
        self.access_token = access_token
        self.url = 'https://maps.googleapis.com/maps/api/staticmap?center={},{}&zoom={}&size=400x400&maptype=satellite&key={}'
#         self.maps = Maps(access_token=access_token)
    
    def download_tile(self, lat, long, zoom):
        """Downloads lat long
        """
        res = requests.get(self.url.format(lat, long, zoom, self.access_token))
        image = Image.open(BytesIO(res.content))
#         response = self.maps.tile("mapbox.satellite", x, y, zoom)
#         image = Image.open(BytesIO(response.content))

        return image
    
import math

def deg_to_tile(lat_deg, lon_deg, zoom):
    """Converts coordinates into the nearest x,y Slippy Map tile"""
    lat_rad = math.radians(lat_deg)
    n = 2.0 ** zoom
    xtile = int((lon_deg + 180.0) / 360.0 * n)
    ytile = int((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad)))
                 / math.pi) / 2.0 * n)
    return (xtile, ytile)

In [29]:
access = 'AIzaSyBeJiUKJvtwgFqRJMw2S-cd2k02EkXIwbM'
# access = 'pk.eyJ1Ijoiam1hdGhlcjI1IiwiYSI6ImNrMWszNHoxNjI3c3EzZG50MTVheHFiZzgifQ.Kd97iNsyrrGbYCEHRuwAeA'
im_downloader = ImageryDownloader(access)

In [30]:
import math

def deg_to_tile(lat_deg, lon_deg, zoom):
    """Converts coordinates into the nearest x,y Slippy Map tile"""
    lat_rad = math.radians(lat_deg)
    n = 2.0 ** zoom
    xtile = int((lon_deg + 180.0) / 360.0 * n)
    ytile = int((1.0 - math.log(math.tan(lat_rad) + (1 / math.cos(lat_rad)))
                 / math.pi) / 2.0 * n)
    return (xtile, ytile)

In [31]:
import time
# time.sleep(5) # sleep for 5 secs

In [32]:
mw_download.shape

(26371, 6)

In [None]:
im_names = []
zoom = 16
for i, r in mw_download.iterrows():
    lat = r.im_lat
    long = r.im_lon
    xtile, ytile = deg_to_tile(lat, long, zoom) # max zoom
    try:
        im = im_downloader.download_tile(xtile, ytile, zoom)
        name = str(lat) + '_' + str(long)
        im.save('ims/{}.png'.format(name))
        im_names.append(name)
    except:
        im_names.append(np.nan)
    if i % 100 == 0:
        print(i, end=', ')
        
mw_download['images'] = im_names
mw_download.to_csv('mw_download_info.csv', index=False)

0, 100, 200, 300, 500, 700, 1500, 1700, 2400, 2500, 2600, 

In [None]:
ug_download.shape

In [None]:
im_names = []
zoom = 16
for i, r in ug_download.iterrows():
    lat = r.im_lat
    long = r.im_lon
    xtile, ytile = deg_to_tile(lat, long, zoom) # max zoom
    try:
        im = im_downloader.download_tile(xtile, ytile, zoom)
        name = str(lat) + '_' + str(long)
        im.save('ims/{}.png'.format(name))
        im_names.append(name)
    except:
        im_names.append(np.nan)
    if i % 100 == 0:
        print(i, end=', ')
        
ug_download['images'] = im_names
ug_download.to_csv('ug_download_info.csv', index=False)

In [None]:
tanz_download.shape

In [None]:
im_names = []
zoom = 16
for i, r in tanz_download.iterrows():
    lat = r.im_lat
    long = r.im_lon
    xtile, ytile = deg_to_tile(lat, long, zoom) # max zoom
    try:
        im = im_downloader.download_tile(xtile, ytile, zoom)
        name = str(lat) + '_' + str(long)
        im.save('ims/{}.png'.format(name))
        im_names.append(name)
    except:
        im_names.append(np.nan)
        time.sleep(0.25) # prevent exceptions from just stacking, give server a rest
    if i % 100 == 0:
        print(i, end=', ')
        
tanz_download['images'] = im_names
tanz_download.to_csv('tanz_download_info.csv', index=False)