In [2]:
import os
import random
import shutil
import time
import warnings
from datetime import datetime
import argparse

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim as optim

from torch.cuda.amp import autocast, GradScaler

import torch.utils.data
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('logs')

SEED=1
random.seed(SEED)
torch.manual_seed(SEED)
cudnn.deterministic = True

GPU = 'cuda:0'
START_EPOCH = 0
ARCH = 'densenet'
EPOCHS = 10
LR = .0005
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4
PRINT_FREQ = 50
BATCH_SIZE = 100
WORKERS=2
#TRAINDIR="data/training/DEWP_class"
#VALDIR="data/test/DEWP_class"
imagenet_mean_RGB = [0.47889522, 0.47227842, 0.43047404]
imagenet_std_RGB = [0.229, 0.224, 0.225]

In [3]:
def accuracy(output, target, topk=(1,)):
    """Computes the accuracy over the k top predictions for the specified values of k"""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self, name, fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)

class ProgressMeter(object):
    def __init__(self, num_batches, meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.prefix = prefix

    def display(self, batch):
        entries = [self.prefix + self.batch_fmtstr.format(batch)]
        entries += [str(meter) for meter in self.meters]
        print('\t'.join(entries))

    def _get_batch_fmtstr(self, num_batches):
        num_digits = len(str(num_batches // 1))
        fmt = '{:' + str(num_digits) + 'd}'
        return '[' + fmt + '/' + fmt.format(num_batches) + ']'

def train(train_loader, model, criterion, optimizer, scaler, epoch):

    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top2 = AverageMeter('Acc@2', ':6.2f')
    progress = ProgressMeter(
        len(train_loader),
        [batch_time, data_time, losses, top1, top2],
        prefix="Epoch: [{}]".format(epoch))

    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
                
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        optimizer.zero_grad()
        with autocast():
            output = model(images)
            loss = criterion(output, target)

        acc1, acc2 = accuracy(output, target, topk=(1, 2))
        losses.update(loss.item(), images.size(0))
        top1.update(acc1[0], images.size(0))
        top2.update(acc2[0], images.size(0))

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_time.update(time.time() - end)
        end = time.time()
        
        if i % PRINT_FREQ == 0:
            progress.display(i)

            writer.add_scalar('Train Acc@1',
                acc1[0],
                epoch * len(train_loader) + i)

def validate(val_loader, model, criterion, epoch):

    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top2 = AverageMeter('Acc@2', ':6.2f')
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top2],
        prefix='Test: ')

    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            
            output = model(images)
            loss = criterion(output, target)

            acc1, acc2 = accuracy(output, target, topk=(1, 2))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top2.update(acc2[0], images.size(0))
            batch_time.update(time.time() - end)
            end = time.time()

            if i % PRINT_FREQ == 0:
                progress.display(i)

                writer.add_scalar('Validation Acc@1',
                    acc1[0],
                    epoch * len(val_loader) + i)

def save_checkpoint(state, filename='checkpoint.pth.tar'):
    torch.save(state, filename)

In [4]:
##From PyTorch docs https://pytorch.org/tutorials/beginner/finetuning_torchvision_models_tutorial.html 

def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

def initialize_model(model_name, num_classes, feature_extract, use_pretrained=True):
    # Initialize these variables which will be set in this if statement. Each of these
    #   variables is model specific.
    model_ft = None
    input_size = 0

    if model_name == "resnet":
        """ Resnet18
        """
        model_ft = models.resnet18(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "alexnet":
        """ Alexnet
        """
        model_ft = models.alexnet(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
        input_size = 224

    elif model_name == "vgg":
        """ VGG11_bn
        """
        model_ft = models.vgg11_bn(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier[6].in_features
        model_ft.classifier[6] = nn.Linear(num_ftrs,num_classes)
        input_size = 224

    elif model_name == "squeezenet":
        """ Squeezenet
        """
        model_ft = models.squeezenet1_0(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        model_ft.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1,1), stride=(1,1))
        model_ft.num_classes = num_classes
        input_size = 224

    elif model_name == "densenet":
        """ Densenet
        """
        model_ft = models.densenet121(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        num_ftrs = model_ft.classifier.in_features
        model_ft.classifier = nn.Linear(num_ftrs, num_classes)
        input_size = 224

    elif model_name == "inception":
        """ Inception v3
        Be careful, expects (299,299) sized images and has auxiliary output
        """
        model_ft = models.inception_v3(pretrained=use_pretrained)
        set_parameter_requires_grad(model_ft, feature_extract)
        # Handle the auxilary net
        num_ftrs = model_ft.AuxLogits.fc.in_features
        model_ft.AuxLogits.fc = nn.Linear(num_ftrs, num_classes)
        # Handle the primary net
        num_ftrs = model_ft.fc.in_features
        model_ft.fc = nn.Linear(num_ftrs,num_classes)
        input_size = 299

    else:
        print("Invalid model name, exiting...")
        exit()

    return model_ft, input_size

In [5]:
def single_init(train_dir, val_dir, num_classes):
    model_ft, IMG_SIZE = initialize_model(ARCH, num_classes, False, use_pretrained=False)

    transform_train = transforms.Compose([
        transforms.RandomResizedCrop(IMG_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean_RGB, imagenet_std_RGB)
    ])

    transform_val = transforms.Compose([
        transforms.Resize(IMG_SIZE),
        transforms.CenterCrop(IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean_RGB, imagenet_std_RGB)
    ])

    torch.cuda.set_device(GPU)
    model_ft.cuda(GPU)
    criterion = nn.CrossEntropyLoss().cuda(GPU)
    scaler = GradScaler()

    optimizer = optim.SGD(
        model_ft.parameters(),
        lr=LR,
        weight_decay=WEIGHT_DECAY,
        momentum=MOMENTUM
    )
    
    model = model_ft

    # use CosineAnnealingLR
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=EPOCHS)
    
    train_dataset = torchvision.datasets.ImageFolder(train_dir, transform=transform_train)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,batch_size=BATCH_SIZE,shuffle=True)

    val_dataset = torchvision.datasets.ImageFolder(val_dir, transform=transform_val)
    val_loader = torch.utils.data.DataLoader(val_dataset,batch_size=BATCH_SIZE, shuffle=True) 

    return (model, criterion, optimizer, scheduler, scaler, train_loader, val_loader)

In [6]:
TEMP_CHECKPOINT_PATH = "./checkpoint_temp.pth.tar"
DEWP_CHECKPOINT_PATH = "./checkpoint_dewp.pth.tar"
VISIB_CHECKPOINT_PATH = "./checkpoint_visib.pth.tar"
WDSP_CHECKPOINT_PATH = "./checkpoint_wdsp.pth.tar"

TEMP_training_dir = "../../../data/training/TEMP_class"
TEMP_test_dir = "../../../data/test/TEMP_class"
DEWP_training_dir = "../../../data/training/DEWP_class"
DEWP_test_dir = "../../../data/test/DEWP_class"
VISIB_training_dir = "../../../data/training/VISIB_class"
VISIB_test_dir = "../../../data/test/VISIB_class"
WDSP_training_dir = "../../../data/training/WDSP_class"
WDSP_test_dir = "../../../data/test/WDSP_class"

In [7]:
device = torch.device(GPU)

dewp_model, _, _, _, _, _, _ = single_init(DEWP_training_dir, DEWP_test_dir, num_classes=3)
dewp_checkpoint = torch.load(DEWP_CHECKPOINT_PATH)
dewp_model.load_state_dict(dewp_checkpoint['state_dict'])
dewp_model.to(device)

temp_model, _, _, _, _, _, _ = single_init(TEMP_training_dir, TEMP_test_dir, num_classes=4)
temp_checkpoint = torch.load(TEMP_CHECKPOINT_PATH)
temp_model.load_state_dict(temp_checkpoint['state_dict'])
temp_model.to(device)

visib_model, _, _, _, _, _, _ = single_init(VISIB_training_dir, VISIB_test_dir, num_classes=3)
visib_checkpoint = torch.load(VISIB_CHECKPOINT_PATH)
visib_model.load_state_dict(visib_checkpoint['state_dict'])
visib_model.to(device)

wdsp_model, _, _, _, _, _, _ = single_init(WDSP_training_dir, WDSP_test_dir, num_classes=3)
wdsp_checkpoint = torch.load(WDSP_CHECKPOINT_PATH)
wdsp_model.load_state_dict(wdsp_checkpoint['state_dict'])
wdsp_model.to(device)

#Setting model to eval mode
dewp_model.eval()
temp_model.eval()
visib_model.eval()
wdsp_model.eval()




DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu

In [70]:
##From: https://towardsdatascience.com/how-to-train-an-image-classifier-in-pytorch-and-use-it-to-perform-basic-inference-on-single-images-99465a1e9bf5
opts = {
    'image_size':224,
    #'rgb_mean':[0.3820, 0.4122, 0.4279],
    #'rgb_std':[0.2998, 0.2836, 0.2907],
    'rgb_mean':[0.47889522, 0.47227842, 0.43047404],
    'rgb_std':[0.229, 0.224, 0.225],
}


test_transforms = transforms.Compose([
        transforms.Resize(opts['image_size']),
        transforms.CenterCrop(opts['image_size']),
        transforms.ToTensor(),
        transforms.Normalize(opts['rgb_mean'], opts['rgb_std'])
    ])

def predict_image(model, image, top_n = 2):
    image_tensor = test_transforms(image).float()
    input = image_tensor.unsqueeze_(0).to(device)
    output = torch.topk(model(input), top_n).indices
    return (int(x) for x in output[0])

def get_random_images(num,directory):
    data = datasets.ImageFolder(directory, transform=test_transforms)
    indices = list(range(len(data)))
    np.random.shuffle(indices)
    idx = indices[:num]
    from torch.utils.data.sampler import SubsetRandomSampler
    sampler = SubsetRandomSampler(idx)
    loader = torch.utils.data.DataLoader(data, 
                   sampler=sampler, batch_size=num)
    dataiter = iter(loader)
    images, labels = next(dataiter)
    return images, labels

def get_all_images(directory):
    data = datasets.ImageFolder(directory, transform=test_transforms)
    #print("Total files:", len(data))
    loader = torch.utils.data.DataLoader(
        dataset=data, 
        batch_size=len(data),
        shuffle=False
    )
    dataiter = iter(loader)
    images, labels = next(dataiter)
    return images, data.imgs

In [100]:
import pandas as pd
import datetime

class WeatherModelResult:
    def __init__(self, weather_path = "./weather_class.pkl"):
        self.weather = pd.read_pickle(weather_path)
    
    """
    Get score based on identified weather
    Second top classes are optional
    0.25 for every matching top 1 class
    0.15 for every matching top 2 class
    if any of the weather attribute is missing for the location & date combination, -1 is returned
    """
    def get_score(self, date, lat, lon, dewp1, temp1, visib1, wdsp1, dewp2=98, temp2=98, visib2=98, wdsp2=98):
        #Get actual weather by matching geocode and date
        actual_weather = self.weather[(self.weather.date==date) & (self.weather.LAT==lat) & (self.weather.LON==lon)]

        #missing data is represented as class 99. 
        a_dewp = a_temp = a_visib = a_wdsp = 99
        #default score is -1 if any weather class is missing
        score1 = score2 = -1
        if actual_weather.shape[0] > 0:
            a_dewp = actual_weather.DEWP_class.values[0]
            a_temp = actual_weather.TEMP_class.values[0]
            a_visib = actual_weather.VISIB_class.values[0]
            a_wdsp = actual_weather.WDSP_class.values[0]

        if a_dewp<99 and a_temp<99 and a_visib<99 and a_wdsp<99:
            #score 0.25 if top class matches
            #score 0.15 if second top class matches
            score1 = (0.25 if a_dewp == dewp1 else 0) + \
                    (0.25 if a_temp == temp1 else 0) + \
                    (0.25 if a_visib == visib1 else 0) + \
                    (0.25 if a_wdsp == wdsp1 else 0)
            score2 = score1 + \
                    (0.15 if a_dewp == dewp2 else 0) + \
                    (0.15 if a_temp == temp2 else 0) + \
                    (0.15 if a_visib == visib2 else 0) + \
                    (0.15 if a_wdsp == wdsp2 else 0)
        
        return score1, score2

In [101]:
weather_result = WeatherModelResult()

In [102]:
weather_result.get_score(
    date = datetime.datetime.strptime("2007-01-01", "%Y-%m-%d").date(),
    lat = 60,
    lon = 5,
    dewp1 = 2,
    temp1 = 3,
    visib1 = 1,
    wdsp1 = 1,
    wdsp2 = 0
)

(0.75, 0.9)

In [103]:
weather_result.get_score(
    date = datetime.datetime.strptime("2007-01-01", "%Y-%m-%d").date(),
    lat = 60,
    lon = 5,
    dewp1 = 2,
    temp1 = 3,
    visib1 = 1,
    wdsp1 = 0,
    dewp2 = 1,
    temp2 = 0,
    visib2 = 2,
    wdsp2 = 1
)

(1.0, 1.0)

In [104]:
weather_result.get_score(
    date = datetime.datetime.strptime("2007-01-01", "%Y-%m-%d").date(),
    lat = 60,
    lon = 5,
    dewp1 = 1,
    temp1 = 0,
    visib1 = 2,
    wdsp1 = 1,
    dewp2 = 2,
    temp2 = 3,
    visib2 = 1,
    wdsp2 = 0
)

(0, 0.6)

In [105]:
weather_result.get_score(
    date = datetime.datetime.strptime("2007-01-01", "%Y-%m-%d").date(),
    lat = 60,
    lon = 50,
    dewp1 = 1,
    temp1 = 0,
    visib1 = 2,
    wdsp1 = 1,
    dewp2 = 2,
    temp2 = 3,
    visib2 = 1,
    wdsp2 = 0
)

(-1, -1)

In [89]:
weather_result.get_score(
    date = datetime.datetime.strptime("2022-01-01", "%Y-%m-%d").date(),
    lat = 60,
    lon = 5,
    dewp1 = 1,
    temp1 = 0,
    visib1 = 2,
    wdsp1 = 1,
    dewp2 = 2,
    temp2 = 3,
    visib2 = 1,
    wdsp2 = 0
)

(-1, -1)

In [73]:
df_images = pd.read_pickle("../../../data/images_2007_2012.pickle")
print("total img count", df_images.shape[0])

#filter for outdoor images
df_labels = pd.read_csv('../../../data/meta_all.csv', usecols = ['id'])
df_images = df_images[df_images.id.isin(df_labels.id)]
print("outdoor img count", df_images.shape[0])

#df_images['path'] = df_images.id.apply(lambda img_id: f"../../../data/download/1/{img_id}.jpg")
#df_images['exists'] = df_images.path.apply(os.path.isfile)
#df_images = df_images[df_images.exists==True]
#print("available outdoor images", df_images.shape[0])

df_images.head(2)

total img count 13181139
outdoor img count 228357


Unnamed: 0,id,latitude,longitude,date_taken
43,1469526786,44,-80,2007-10-01
220,1399637907,48,-123,2007-09-15


In [96]:
to_pil = transforms.ToPILImage()

images, files = get_all_images("../../../data/final_test")

#print(np.shape(images))
data = []



for img, file in zip(images, files):
    image = to_pil(img)
    img_id = int(file[0].split('/')[-1].replace('.jpg',''))
    
    (dewp1, dewp2) = predict_image(dewp_model, image, top_n = 2)
    (temp1, temp2) = predict_image(temp_model, image, top_n = 2)
    (visib1, visib2) = predict_image(visib_model, image, top_n = 2)
    (wdsp1, wdsp2) = predict_image(wdsp_model, image, top_n = 2)
    
    data.append([img_id, dewp1, dewp2, temp1, temp2, visib1, visib2, wdsp1, wdsp2])
    

data_df = pd.DataFrame(data, columns=['img_id', 'dewp1', 'dewp2', 'temp1', 'temp2', 'visib1', 'visib2', 'wdsp1', 'wdsp2'])

In [97]:
data_df = pd.merge(data_df, df_images,  how='inner', left_on=['img_id'], right_on = ['id'])[['id', 'latitude','longitude','date_taken', 'dewp1', 'dewp2', 'temp1', 'temp2', 'visib1', 'visib2', 'wdsp1', 'wdsp2']]
data_df.head(5)

Unnamed: 0,id,latitude,longitude,date_taken,dewp1,dewp2,temp1,temp2,visib1,visib2,wdsp1,wdsp2
0,1023892839,30,-97,2007-08-05,0,1,1,0,0,1,1,2
1,1031393875,50,22,2007-08-06,1,0,1,2,0,1,1,2
2,1031706514,51,6,2007-08-06,1,0,0,1,0,1,1,2
3,1040162470,60,5,2007-08-07,1,0,1,0,0,1,1,2
4,1047202726,49,2,2007-08-07,1,0,1,0,0,1,1,2


In [98]:
weather_result = WeatherModelResult()

data_df[['score1', 'score2']] = data_df.apply(lambda r: weather_result.get_score(
        date = r['date_taken'],
        lat = r['latitude'],
        lon = r['longitude'],
        dewp1 = r['dewp1'],
        temp1 = r['temp1'],
        visib1 = r['visib1'],
        wdsp1 = r['wdsp1'],
        dewp2 = r['dewp2'],
        temp2 = r['temp2'],
        visib2 = r['visib2'],
        wdsp2 = r['wdsp2']
    ), axis=1, result_type="expand")

In [99]:
data_df

Unnamed: 0,id,latitude,longitude,date_taken,dewp1,dewp2,temp1,temp2,visib1,visib2,wdsp1,wdsp2,score1,score2
0,1023892839,30,-97,2007-08-05,0,1,1,0,0,1,1,2,0.50,0.80
1,1031393875,50,22,2007-08-06,1,0,1,2,0,1,1,2,0.50,0.65
2,1031706514,51,6,2007-08-06,1,0,0,1,0,1,1,2,0.50,0.80
3,1040162470,60,5,2007-08-07,1,0,1,0,0,1,1,2,0.50,0.50
4,1047202726,49,2,2007-08-07,1,0,1,0,0,1,1,2,0.75,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,4596863498,49,0,2010-04-10,1,0,1,0,0,1,1,2,0.50,0.50
126,4597068385,48,-122,2010-05-10,1,0,1,0,0,1,1,2,0.50,0.50
127,4600096475,37,-122,2010-05-11,0,1,2,1,0,1,1,2,0.75,0.75
128,4600939893,35,-112,2010-05-08,1,0,1,0,0,1,1,2,0.25,0.25
