Uss the files created using the scripts in `data` and sets up the whole training script.

In [1]:
import pandas as pd
import numpy as np
df_mw_download = pd.read_csv('data/mw_2016_download_info.csv')
df_mw_guide = pd.read_csv('data/mw_2016_guide.csv')

In [2]:
df_mw_download.shape, df_mw_guide.shape

((28113, 7), (90943, 6))

In [4]:
len(df_mw_guide.groupby(['clust_lat', 'clust_lon'])) # number of clusters

780

In [5]:
# merges the images downloaded with the original dataframe
df_sub = df_mw_download[['im_lat', 'im_lon', 'images']]
df_mw = pd.merge(left=df_mw_guide, right=df_sub, on=['im_lat', 'im_lon'])

In [7]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,images
0,-17.05,35.174999,-17.09515,35.217213,0.0,2.039307,-17.05_35.174999.png
1,-17.125,35.174999,-17.09515,35.217213,0.0,2.039307,-17.125_35.174999.png
2,-17.05,35.183332,-17.09515,35.217213,0.0,2.039307,-17.05_35.183332.png
3,-17.058333,35.183332,-17.09515,35.217213,0.0,2.039307,-17.058332999999998_35.183332.png
4,-17.1,35.183332,-17.09515,35.217213,0.0,2.039307,-17.1_35.183332.png


In [8]:
# I didn't download all the images and also I got rid of some image repeats, hence the difference
df_mw_guide.shape, df_mw.shape

((90943, 6), (54404, 7))

In [11]:
# adds a "cluster number" to the dataframe, kind of unnecessary because
# a groupby of "clust_lat" and "clust_lon" suffices
clust_group = df_mw.groupby(['clust_lat', 'clust_lon'])
clust_group = clust_group.first().reset_index()[['clust_lat', 'clust_lon']]
clust_numbers = np.arange(len(clust_group))
clust_group['clust_num'] = clust_numbers

In [12]:
clust_group.head()

Unnamed: 0,clust_lat,clust_lon,clust_num
0,-17.09515,35.217213,0
1,-17.092351,35.114643,1
2,-17.016698,35.079629,2
3,-16.977243,35.205706,3
4,-16.956385,35.168967,4


In [13]:
df_mw = pd.merge(df_mw, clust_group, on=['clust_lat', 'clust_lon'])
df_mw.head()

In [15]:
df_mw.shape

(54404, 8)

I'm gonna add an _ to the images to show which cluster they come from

In [16]:
df_mw['images'] = df_mw.apply(lambda x: x.images[:-4] + '_' + str(x.clust_num) + '.png', axis=1)

In [17]:
df_mw.head()

Unnamed: 0,im_lat,im_lon,clust_lat,clust_lon,nightlight,consumption,images,clust_num
0,-17.05,35.174999,-17.09515,35.217213,0.0,2.039307,-17.05_35.174999_0.png,0
1,-17.125,35.174999,-17.09515,35.217213,0.0,2.039307,-17.125_35.174999_0.png,0
2,-17.05,35.183332,-17.09515,35.217213,0.0,2.039307,-17.05_35.183332_0.png,0
3,-17.058333,35.183332,-17.09515,35.217213,0.0,2.039307,-17.058332999999998_35.183332_0.png,0
4,-17.1,35.183332,-17.09515,35.217213,0.0,2.039307,-17.1_35.183332_0.png,0


In [85]:
!mkdir ims

In [86]:
import shutil

In [87]:
# this will copy images into a folder called ims
# this folder is helpful because the original folder has all unique images
# now, we need to duplicate those images and distinguish them by their new name (as made previously)
def create_im_renamed(x):
    orig_name = x.split('_') # gets the original name of the image as it appears in the downloaded image folder
    shutil.copy('data/ims_malawi_2016/{}'.format(orig_name[0] + '_' + orig_name[1] + '.png'), 'ims/{}'.format(x))

In [88]:
df_mw['images'].apply(create_im_renamed)

0        None
1        None
2        None
3        None
4        None
5        None
6        None
7        None
8        None
9        None
10       None
11       None
12       None
13       None
14       None
15       None
16       None
17       None
18       None
19       None
20       None
21       None
22       None
23       None
24       None
25       None
26       None
27       None
28       None
29       None
         ... 
74022    None
74023    None
74024    None
74025    None
74026    None
74027    None
74028    None
74029    None
74030    None
74031    None
74032    None
74033    None
74034    None
74035    None
74036    None
74037    None
74038    None
74039    None
74040    None
74041    None
74042    None
74043    None
74044    None
74045    None
74046    None
74047    None
74048    None
74049    None
74050    None
74051    None
Name: images, Length: 74052, dtype: object

In [18]:
# a naive way to bin nightlights
(df_mw['nightlight'] < 1).mean()

0.5358061907212704

In [19]:
((df_mw['nightlight'] >= 1) & (df_mw_guide['nightlight'] < 10)).mean()

0.23951266177715713

In [20]:
((df_mw['nightlight'] >= 10)).mean()

0.10780457319314757

In [94]:
df_mw['nightlight_bin'] = (df_mw['nightlight'] < 1)*1 + \
                                ((df_mw['nightlight'] >= 1) & (df_mw['nightlight'] < 10))*2 + \
                                (df_mw['nightlight'] >= 10)*3

In [21]:
df_mw.to_csv('all_ims_guide.csv', index=False)

In [23]:
df_mw = pd.read_csv('all_ims_guide.csv')

In [24]:
# uses a GMM to determine the groups
# these should be balanced somewhat, so one class doesn't totally dominate
from sklearn.mixture import GaussianMixture as GMM
X = df_mw['nightlight'].values.reshape(-1,1)
gmm = GMM(n_components=3).fit(X)
labels = gmm.predict(df_mw['nightlight'].values.reshape(-1,1))

In [25]:
(labels==0).mean(), (labels==1).mean(), (labels==2).mean()

(0.43862583633556357, 0.054389383133593115, 0.5069847805308433)

In [10]:
df_mw['nightlight_bin'] = labels + 1

Create train/valid folders by copying images from `ims`

The model will use these directly.

In [11]:
import os, shutil
import numpy as np

In [12]:
pic_list = df_mw['images'].values.tolist()
to_pick = int(0.8*len(pic_list)); to_pick

59241

In [13]:
inds = np.arange(len(pic_list))
train_ind = np.random.choice(np.arange(len(pic_list)), to_pick, replace=False)
valid_ind = np.delete(inds, train_ind)

In [14]:
pic_list = np.array(pic_list)
train_im = pic_list[train_ind]
valid_im = pic_list[valid_ind]

In [16]:
os.makedirs('train', exist_ok=False)
os.makedirs('valid', exist_ok=False)

In [17]:
t = df_mw.iloc[train_ind]
v = df_mw.iloc[valid_ind]

In [18]:
for fi, l in zip(t['images'], t['nightlight_bin']):
    os.makedirs('train/{}'.format(l), exist_ok=True)
    shutil.copy('ims/{}'.format(fi), 'train/{}'.format(l))
    
for fi, l in zip(v['images'], v['nightlight_bin']):
    os.makedirs('valid/{}'.format(l), exist_ok=True)
    shutil.copy('ims/{}'.format(fi), 'valid/{}'.format(l))

In [19]:
# shows count distribution in train folder
for i in range(1,4):
    print(len(os.listdir('train/{}'.format(str(i)))))

30795
7197
21249


In [20]:
# shows count distribution in valid folder
for i in range(1,4):
    print(len(os.listdir('valid/{}'.format(str(i)))))

7706
1760
5345
