In [2]:
import torch
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.datasets import ImageFolder
import torch.nn as nn
import torch.optim as optim
import torch.utils.data.dataset as dataset
import numpy as np
from osgeo import gdal
import os
import torch.nn.functional as F
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

from dataset import *
from model import *

In [3]:
# Run on the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [4]:
# Absolute, machine-specific folder path (presumably the image data root —
# confirm). The same folder also holds the images.h5 file opened further down.
IMG_FOLDER = 'E:/xplore_data/data/'

## Extract features

In [4]:
# Load trained model
# The classifier head has to be rebuilt with the same layout it was trained
# with so that the checkpoint's state_dict keys and shapes line up.
n_features = 20  # width of the second hidden layer == size of the extracted feature vector
net = initialize_model2()
net.classifier = nn.Sequential(
    nn.Linear(net.n_features, 100),
    nn.Sigmoid(),
    nn.Linear(100, n_features),
    nn.Sigmoid(),
    nn.Linear(n_features, 7)
)

SAVED_MODEL_PATH = 'checkpoints/vgg11bn_4_e2e_all'
# map_location='cpu' lets a checkpoint that was saved from a GPU run be loaded
# on a machine without CUDA; the model is moved to `device` later, right
# before inference.
net.load_state_dict(torch.load(SAVED_MODEL_PATH, map_location='cpu'))

# Freeze layers
for param in net.parameters():
    param.requires_grad = False

# We just want to apply the feature extractor for now: replace the last
# Sigmoid (index 3) and the final Linear (index 4) with no-ops so the network
# outputs the raw n_features-dimensional activations of classifier[2].
net.classifier[3] = nn.Identity()
net.classifier[4] = nn.Identity()

# Switch to inference mode; as the last expression of the cell this also
# displays the module structure.
net.eval()

VGG(
  (features): Sequential(
    (0): Conv2d(9, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU(inplace=True)
    (14): MaxPool2d(ke

In [5]:
# Build the evaluation dataset and a loader over it.
# shuffle=False keeps the batches in dataset order, which the extraction loop
# relies on when it writes results into consecutive row ranges.
DATA_FILE = 'E:/xplore_data/data/images.h5'
HEALTH_FILE = 'data/dhs_gps.csv'
dimages = TestDataset(DATA_FILE, HEALTH_FILE)
dimagesloader = torch.utils.data.DataLoader(
    dataset=dimages,
    batch_size=64,
    shuffle=False,
    num_workers=0,
)

In [6]:
# Apply feature extractor to the dataset
# Results are written into pre-allocated buffers, one row per sample, in the
# order the loader yields them.
n = len(dimages)
extracted_features = torch.zeros(n, n_features)
c_ids = np.zeros(n)
vac_rates = np.zeros((n, 11))
i = 0
net.to(device)
# Inference only, so autograd is disabled for the whole pass.
with torch.no_grad():
    for x, _, z in dimagesloader:
        x = x.to(device)
        j = i + x.shape[0]
        outputs = net(x)
        # Flatten per sample instead of torch.squeeze so that a final batch
        # holding a single sample keeps its batch dimension.
        extracted_features[i:j, :] = outputs.reshape(x.shape[0], -1).cpu()
        # Column 0 of z is stored as the id and columns 14:25 as the 11 rates
        # (presumably cluster id / vaccination rates — confirm against TestDataset).
        c_ids[i:j] = z[:, 0]
        vac_rates[i:j] = z[:, 14:25]
        i = j
extracted_features = extracted_features.numpy()
# The `np.int` alias was removed in NumPy 1.24; the builtin `int` is the
# exact type it used to point to.
c_ids = c_ids.astype(int)

In [7]:
# Cache the extracted features on disk so they can be reloaded without
# re-running the network. At this point `extracted_features` has already been
# converted to a NumPy array, so torch.save is storing a NumPy array here.
torch.save(extracted_features, 'data/features_e2e.pt')

## Build dataset of built environment

In [8]:
# Reload the cached features written by the torch.save call above.
# NOTE(review): the saved object is a NumPy array; newer PyTorch releases that
# default torch.load to weights_only=True may refuse to load it — confirm
# against the installed version.
extracted_features = torch.load('data/features_e2e.pt')

In [9]:
# Count how often each class value occurs in every built-environment raster.
# Row i of `counts` belongs to the i-th file listed; column k holds the count
# of raster value k + 1.
BUILT_FOLDER = 'E:/xplore_data/built/'
# NOTE(review): os.listdir returns entries in arbitrary order. The later cells
# pair row i of `counts` with row i of the extracted features, so confirm the
# listing order matches the dataset order.
built_files = os.listdir(BUILT_FOLDER)
# Size the table from the actual number of files instead of a hard-coded 889.
# The `np.int` alias was removed in NumPy 1.24; the builtin `int` is equivalent.
counts = np.zeros((len(built_files), 6), dtype=int)
for i, file in enumerate(built_files):
    img = load_file(BUILT_FOLDER, file)
    val, ct = torch.unique(img, return_counts=True)
    # Shift raster values down by one to get column indices.
    # NOTE(review): a raster value of 0 would become -1 and land in the last
    # column — confirm the rasters only contain values 1..6.
    val = val.numpy().astype(int) - 1
    ct = ct.numpy()
    counts[i, val] = ct

In [10]:
# Sanity check: both tables should have the same number of rows.
for table in (counts, extracted_features):
    print(table.shape)

(889, 6)
(889, 20)


In [11]:
# Column meaning in `counts` -> 0: water, 1: not built, 2-5: built from various times
water = counts[:, 0]
recent = counts[:, 2]
# Total built count per row: sum over all "built" columns (2 through 5).
built = counts[:, 2:].sum(axis=1)
print(built.shape)

(889,)


In [56]:
from sklearn.linear_model import LinearRegression

def show_r2(x, y):
    """Fit an ordinary least-squares regression of y on x and print its R^2.

    Parameters
    ----------
    x : array-like, shape (n_samples,) or (n_samples, n_predictors)
        Predictor(s). A 1-D input is treated as a single predictor column.
    y : array-like, shape (n_samples,) or (n_samples, n_targets)
        Target(s). A 1-D input is treated as a single target column.

    The score is computed on the same data the model was fitted on (in-sample
    R^2) and printed with three decimals; nothing is returned.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # Derive the row count from each input rather than hard-coding 889, so the
    # helper works for any number of samples. A length mismatch between x and
    # y is then reported by scikit-learn instead of being reshaped away.
    x = x.reshape(x.shape[0], -1)
    y = y.reshape(y.shape[0], -1)
    reg = LinearRegression()
    reg.fit(x, y)
    print('%.3f' % reg.score(x, y))

## Check how features, built area, water, latitude and vaccination rates relate

In [59]:
# R^2 of a one-variable linear fit from the `built` totals to each of the 20
# extracted features (one score printed per feature).
from scipy.stats import pearsonr
for feature_idx in range(20):
    feature_column = extracted_features[:, feature_idx]
    show_r2(built, feature_column)

0.320
0.357
0.422
0.376
0.306
0.358
0.396
0.323
0.379
0.399
0.344
0.350
0.303
0.363
0.329
0.418
0.379
0.319
0.339
0.378


In [60]:
# Same check with the water counts as the predictor: one R^2 per feature.
for feature_idx in range(20):
    feature_column = extracted_features[:, feature_idx]
    show_r2(water, feature_column)

0.000
0.001
0.000
0.000
0.000
0.001
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.000
0.001
0.000
0.001
0.002
0.001


In [61]:
# Read the survey table directly from the CSV (first row is a header).
stats = np.loadtxt('data/dhs_gps.csv', delimiter=',', skiprows=1)
lat = stats[:, 1]           # column 1 is used as latitude
vac_rates = stats[:, 14:]   # columns 14 onward hold the rate columns
# Offsets into `vac_rates` of the rate columns that are analysed below.
vaccs = [0, 1, 2, 3, 4, 5, 9]

In [62]:
# R^2 of the `built` totals against each selected vaccination-rate column.
for vacc_col in vaccs:
    show_r2(built, vac_rates[:, vacc_col])

0.116
0.119
0.116
0.127
0.134
0.147
0.124


In [69]:
# Same check with the water counts as the predictor.
for vacc_col in vaccs:
    show_r2(water, vac_rates[:, vacc_col])

0.000
0.000
0.000
0.000
0.000
0.000
0.000


In [64]:
# How much of each extracted feature does latitude alone explain?
for feature_idx in range(20):
    feature_column = extracted_features[:, feature_idx]
    show_r2(lat, feature_column)

0.532
0.492
0.382
0.444
0.574
0.497
0.465
0.501
0.457
0.406
0.496
0.475
0.515
0.485
0.525
0.459
0.381
0.570
0.559
0.493


In [65]:
# How much of each selected vaccination rate does latitude alone explain?
for vacc_col in vaccs:
    show_r2(lat, vac_rates[:, vacc_col])

0.517
0.398
0.510
0.493
0.448
0.346
0.462


In [66]:
# Two-column predictor matrix: column 0 = latitude, column 1 = built total.
# The two vectors are stacked as rows and then transposed to (n_samples, 2).
lat_built = np.vstack([lat, built]).T

In [67]:
# R^2 of each extracted feature regressed on latitude and built total jointly.
for feature_idx in range(20):
    feature_column = extracted_features[:, feature_idx]
    show_r2(lat_built, feature_column)

0.731
0.726
0.686
0.700
0.758
0.731
0.735
0.706
0.713
0.686
0.719
0.704
0.703
0.724
0.732
0.748
0.648
0.765
0.770
0.743


In [68]:
# Latitude and built total together vs. each selected vaccination rate.
for vacc_col in vaccs:
    show_r2(lat_built, vac_rates[:, vacc_col])

0.565
0.455
0.558
0.550
0.512
0.427
0.519


In [78]:
# Regress each selected vaccination rate on (latitude, built) and keep the part
# the regression does not explain. The CSV is re-read so columns 0-13 carry
# over unchanged; every column from 14 on is zeroed first and only the columns
# listed in `vaccs` are then overwritten with residuals.
resids = np.loadtxt('data/dhs_gps.csv', delimiter=',', skiprows=1)
resids[:, 14:] = 0
for vacc_col in vaccs:
    y = vac_rates[:, vacc_col]
    reg = LinearRegression().fit(lat_built, y)
    res = y - reg.predict(lat_built)
    resids[:, 14 + vacc_col] = res

In [80]:
# Write the residual table to disk as comma-separated text.
# NOTE(review): no header row is written here, whereas the source CSV is read
# with skiprows=1 — confirm that whatever consumes residuals.csv does not skip
# its first data row.
np.savetxt('data/residuals.csv', resids, delimiter=',')

[0, 1, 2, 3, 4, 5, 9]