In [1]:
from torchvision import transforms, models
import matplotlib.pyplot as plt
from sklearn import metrics
from copy import deepcopy
from PIL import Image
import pandas as pd
import numpy as np
import torchvision
import random
import torch

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [3]:
tfs = transforms.Compose([
        transforms.ToTensor(),
        transforms.CenterCrop(256),
        transforms.RandomHorizontalFlip(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

In [4]:
class Dataloader():
    
    def __init__(self, country, imagery_direc, scores_df, split, batch_size, tfs = None):
        """
        Arguments:
            country: one of ['mex', 'slv', 'peru', 'phl']
            imagery_direc: path to folder containing school imagery
            scores_df: path to CSV file with school IDs and test scroes
            split: train/test split, should be between .01 and 1, recommended is between .65 and .8
            batch_size: number of images in a batch
        """
        self.country = country
        self.imagery_direc = imagery_direc
        self.imagery = os.listdir(self.imagery_direc)
        self.imagery = [i for i in self.imagery if self.country in i]
        self.scores_df = pd.read_csv(scores_df)
        self.scores_df = self.scores_df[self.scores_df['country'] == self.country]
        self.split = split
        self.batch_size = batch_size
        
        if tfs is None:
            self.tfs = transforms.ToTensor()
        else:
            self.tfs = tfs
        
        # Load the data into a list with the format [(school_image, school_test_score), ...]
        self.data = self.load_data()
        
        
    def load_data(self):
        """
        Load the imagery into a list in the format: [(imager_tensor, test_score), ...]
        """
        data = []
        for col, row in self.scores_df.iterrows():
            school_id = str(row.school_id)
            test_score = row.y
            impath = [i for i in self.imagery if school_id in i]
            if len(impath) > 0:
                image = np.array(Image.open(self.imagery_direc + impath[0]))
                image = self.tfs(image)
                data.append((image, test_score, school_id))
        return data

In [5]:
COUNTRY = "bra"
BATCH_SIZE = 4
SPLIT = .75
IMAGERY_DIREC = "../../CCI/hmbaier/"
SCORES_DF = "./cci_final.csv"

In [6]:
data = Dataloader(country = COUNTRY, 
                  imagery_direc = IMAGERY_DIREC, 
                  scores_df = SCORES_DF,
                  split = SPLIT,
                  batch_size = BATCH_SIZE,
                  tfs = tfs)
all_data = data.data

In [7]:
# We'll use the to keep track of our training stastics (i.e. running training loss, running validation loss, etc...)
class AverageMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n = 1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = round(self.sum / self.count, 4)

In [8]:
# Set up a basic off the shelf 
model = models.resnet18(pretrained = True)
model.fc = torch.nn.Linear(512, 2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

checkpoint = torch.load("./trained_phl_model.torch")["model_state_dict"]

model.load_state_dict(checkpoint)
model = model.to(device)

In [9]:
sm = torch.nn.Softmax()
trues, preds, ids = [], [], []

for (inputs, targets, school_id) in all_data:
        
    inputs, targets = inputs.unsqueeze(0).to(device), torch.tensor(targets)
    outputs = model(inputs)
    
    _, pred = torch.max(sm(outputs), 1)
        
    trues.append(targets.item())
    preds.append(pred.item())
    ids.append(school_id)

  if __name__ == '__main__':


In [11]:
preds_df = pd.DataFrame([preds, ids]).T
preds_df.columns = ["Predicted", "Id"]
preds_df = preds_df[["Id", "Predicted"]]
preds_df

Unnamed: 0,Id,Predicted
0,35399197,1
1,31350664,1
2,31349720,1
3,33062633,1
4,31128074,1
...,...,...
6736,13020811,1
6737,15048489,1
6738,50031112,1
6739,12015326,1


In [12]:
preds_df.to_csv("./predicted_brazil_pytorch.csv")

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score


cm = confusion_matrix(trues, preds)
cm

array([[   0, 2140],
       [   0, 4601]])

In [14]:
accuracy_score(trues, preds)

0.6825396825396826