In [1]:
import iris
import os
import numpy as np
from matplotlib import pyplot as plt

# Opt in to Iris's "netcdf_promote" FUTURE behaviour before any file is loaded.
# NOTE(review): presumably this flag only exists on older Iris releases -
# confirm against the installed Iris version before re-running the notebook.
iris.FUTURE.netcdf_promote = True
%matplotlib inline

In [2]:
def list_mogreps_uk(folder, years, months, days, hours, fcsts):
    """Build MOGREPS-UK file paths for every combination of the given values.

    Paths are produced with ``years`` varying slowest and ``fcsts`` varying
    fastest. Nothing here checks that the files actually exist.

    Args:
        folder: directory that each generated file name is joined onto.
        years, months, days, hours, fcsts: iterables of integers that are
            substituted, in that order, into the file-name template.

    Returns:
        A list of path strings, one per combination.
    """
    template = 'prods_op_mogreps-uk_{:04d}{:02d}{:02d}_{:02d}_00_{:03d}.nc'
    paths = []
    for year in years:
        for month in months:
            for day in days:
                for hour in hours:
                    for fcst in fcsts:
                        name = template.format(year, month, day, hour, fcst)
                        paths.append(os.path.join(folder, name))
    return paths

class PrecipUpscaleDataset():
    """Pairs the block-averaged value at one grid cell with its full-resolution value.

    Each item loads one file, block-averages the 2-D field with square blocks
    of ``scale_factor`` cells, and returns the coarse value covering
    ``(lat_idx, lon_idx)`` together with the original value at that cell.

    Note: despite the "Precip" name, the cube that is loaded is
    'low_type_cloud_area_fraction'.

    Args:
        filenames: iterable of file paths; a sorted copy is stored.
        scale_factor: block edge length, in grid cells.
        lat_idx: index into the first axis of the 2-D field.
        lon_idx: index into the second axis of the 2-D field.
    """

    def __init__(self, filenames, scale_factor, lat_idx, lon_idx):
        # sorted() makes a copy, so the caller's list is not reordered in place.
        self.filenames = sorted(filenames)
        self.scale_factor = scale_factor
        self.lat_idx = lat_idx
        self.lon_idx = lon_idx

    def __len__(self):
        """Number of files in the dataset."""
        return len(self.filenames)

    @staticmethod
    def _block_mean_upsampled(field, scale_factor):
        """Block-average a 2-D array and repeat it back to the input shape.

        Blocks on the trailing edges may hold fewer than ``scale_factor ** 2``
        cells when the shape is not a multiple of ``scale_factor``; each block
        is divided by its own cell count so those edge means are not
        underestimated, and the repeated result is cropped to ``field.shape``.
        """
        n_rows, n_cols = field.shape
        row_starts = list(range(0, n_rows, scale_factor))
        col_starts = list(range(0, n_cols, scale_factor))
        block_sums = np.add.reduceat(np.add.reduceat(field, row_starts),
                                     col_starts, axis=1)
        row_counts = np.array([min(scale_factor, n_rows - start) for start in row_starts])
        col_counts = np.array([min(scale_factor, n_cols - start) for start in col_starts])
        low_res = block_sums / (row_counts.reshape(-1, 1) * col_counts)
        upsampled = low_res.repeat(scale_factor, 0).repeat(scale_factor, 1)
        return upsampled[:n_rows, :n_cols]

    def __getitem__(self, idx):
        """Return ``(coarse_value, true_value)`` for file ``idx``.

        Raises:
            IndexError: if ``idx`` is outside the file list.
        """
        # First matching cube, then index 0 along its leading dimension.
        # NOTE(review): presumably the leading dimension is realization or
        # time - confirm against the files.
        precip = iris.load(self.filenames[idx],
                           'low_type_cloud_area_fraction')[0][0]

        low_res_hr = self._block_mean_upsampled(precip.data, self.scale_factor)
        val = low_res_hr[self.lat_idx][self.lon_idx]

        return val, precip[self.lat_idx][self.lon_idx].data.item()
    

class PrecipQuartilesDataset():
    """Pairs the four quadrant means of a field with the value at one grid cell.

    Note: despite the "Precip" name, the cube that is loaded is
    'low_type_cloud_area_fraction'.

    Args:
        filenames: iterable of file paths; a sorted copy is stored.
        lat_idx: index into the first axis of the 2-D field.
        lon_idx: index into the second axis of the 2-D field.
    """

    def __init__(self, filenames, lat_idx, lon_idx):
        # sorted() makes a copy, so the caller's list is not reordered in place.
        self.filenames = sorted(filenames)
        self.lat_idx = lat_idx
        self.lon_idx = lon_idx

    def __len__(self):
        """Number of files in the dataset."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Return ``(quadrant_means, true_value)`` for file ``idx``.

        ``quadrant_means`` is a flat array of four means in the order
        (first-half rows, first-half cols), (first-half rows, second-half
        cols), (second-half rows, first-half cols), (second-half rows,
        second-half cols). ``true_value`` is the 0-d array at
        ``(lat_idx, lon_idx)``.

        Raises:
            IndexError: if ``idx`` is outside the file list.
            ValueError: if the field has fewer than two rows or columns.
        """
        # First matching cube, then index 0 along its leading dimension.
        # NOTE(review): presumably the leading dimension is realization or
        # time - confirm against the files.
        precip = iris.load(self.filenames[idx],
                           'low_type_cloud_area_fraction')[0][0]

        field = precip.data
        n_rows, n_cols = field.shape
        if n_rows < 2 or n_cols < 2:
            raise ValueError('field must have at least 2 rows and 2 columns '
                             'to be split into quadrants')

        # reduceat needs integer split points; shape * 0.5 is a float and is
        # rejected by NumPy.
        row_split = n_rows // 2
        col_split = n_cols // 2
        quadrant_sums = np.add.reduceat(
            np.add.reduceat(field, [0, row_split]),
            [0, col_split],
            axis=1)

        # Divide each quadrant by its own cell count so odd-sized grids, whose
        # quadrants are unequal, still give true means.
        row_counts = np.array([row_split, n_rows - row_split]).reshape(-1, 1)
        col_counts = np.array([col_split, n_cols - col_split])
        quarts = quadrant_sums / (row_counts * col_counts)

        return quarts.flatten(), precip[self.lat_idx][self.lon_idx].data
    
    
class PrecipZoomoutDataset():
    """Stacks several block-averaged versions of a field with the field itself.

    Each item loads one file, crops it with ``subset``, and returns an array
    whose leading axis holds one block-averaged copy per scale factor (each
    repeated back to full resolution) followed by the cropped field.

    Note: despite the "Precip" name, the cube that is loaded is
    'low_type_cloud_area_fraction'.

    Args:
        filenames: iterable of file paths; a sorted copy is stored.
        scale_factors: block edge lengths, in grid cells. Defaults to the
            values that were previously hard-coded.
        subset: index applied to the loaded 2-D cube before averaging.
            Defaults to the previously hard-coded crop ``[-480:, :384]``.
    """

    def __init__(self, filenames, scale_factors=(8, 16, 24, 32, 48),
                 subset=(slice(-480, None), slice(0, 384))):
        # sorted() makes a copy, so the caller's list is not reordered in place.
        self.filenames = sorted(filenames)
        self.scale_factors = list(scale_factors)
        self.subset = subset

    def __len__(self):
        """Number of files in the dataset."""
        return len(self.filenames)

    @staticmethod
    def _block_mean_upsampled(field, scale_factor):
        """Block-average a 2-D array and repeat it back to the input shape.

        Trailing blocks that hold fewer than ``scale_factor ** 2`` cells are
        divided by their own cell count, and the repeated result is cropped to
        ``field.shape`` so every layer can be stacked with the field.
        """
        n_rows, n_cols = field.shape
        row_starts = list(range(0, n_rows, scale_factor))
        col_starts = list(range(0, n_cols, scale_factor))
        block_sums = np.add.reduceat(np.add.reduceat(field, row_starts),
                                     col_starts, axis=1)
        row_counts = np.array([min(scale_factor, n_rows - start) for start in row_starts])
        col_counts = np.array([min(scale_factor, n_cols - start) for start in col_starts])
        low_res = block_sums / (row_counts.reshape(-1, 1) * col_counts)
        upsampled = low_res.repeat(scale_factor, 0).repeat(scale_factor, 1)
        return upsampled[:n_rows, :n_cols]

    def __getitem__(self, idx):
        """Return an array of shape ``(len(scale_factors) + 1, rows, cols)``.

        Raises:
            IndexError: if ``idx`` is outside the file list.
        """
        # First matching cube, then index 0 along its leading dimension.
        # NOTE(review): presumably the leading dimension is realization or
        # time - confirm against the files.
        precip = iris.load(self.filenames[idx],
                           'low_type_cloud_area_fraction')[0][0]
        precip = precip[self.subset]
        field = precip.data

        lrs = [self._block_mean_upsampled(field, scale_factor)
               for scale_factor in self.scale_factors]

        return np.array(lrs + [field])
    
    
class ArrayZoomoutDataset():
    """Stacks block-averaged versions of in-memory 2-D fields.

    Each item takes ``array[idx]``, crops it with ``subset`` and returns one
    block-averaged copy per scale factor, each repeated back to the cropped
    field's resolution.

    Args:
        array: indexable collection of 2-D arrays (``array[idx]`` must accept
            ``subset`` as an index).
        scale_factors: block edge lengths, in grid cells. A copy is stored, so
            instances never share (or mutate) the default.
        subset: index applied to each field before averaging.
    """

    def __init__(self, array, scale_factors=(8, 16, 24, 32, 48),
                 subset=(slice(-480, None), slice(0, 384))):
        self.array = array
        # Copy into a fresh list: a mutable default stored by reference would
        # be shared by every instance created without an explicit value.
        self.scale_factors = list(scale_factors)
        self.subset = subset

    def __len__(self):
        """Number of fields in the dataset."""
        return len(self.array)

    @staticmethod
    def _block_mean_upsampled(field, scale_factor):
        """Block-average a 2-D array and repeat it back to the input shape.

        Trailing blocks that hold fewer than ``scale_factor ** 2`` cells are
        divided by their own cell count, and the repeated result is cropped to
        ``field.shape`` so layers for different scale factors can be stacked.
        """
        n_rows, n_cols = field.shape
        row_starts = list(range(0, n_rows, scale_factor))
        col_starts = list(range(0, n_cols, scale_factor))
        block_sums = np.add.reduceat(np.add.reduceat(field, row_starts),
                                     col_starts, axis=1)
        row_counts = np.array([min(scale_factor, n_rows - start) for start in row_starts])
        col_counts = np.array([min(scale_factor, n_cols - start) for start in col_starts])
        low_res = block_sums / (row_counts.reshape(-1, 1) * col_counts)
        upsampled = low_res.repeat(scale_factor, 0).repeat(scale_factor, 1)
        return upsampled[:n_rows, :n_cols]

    def __getitem__(self, idx):
        """Return an array of shape ``(len(scale_factors), rows, cols)``."""
        array = self.array[idx][self.subset]

        lrs = [self._block_mean_upsampled(array, scale_factor)
               for scale_factor in self.scale_factors]

        return np.array(lrs)
    
    
class CloudDataset():
    """Indexable collection that loads one cloud-fraction field per file.

    Args:
        filenames: iterable of file paths; a sorted copy is stored.
    """

    def __init__(self, filenames):
        # sorted() makes a copy, so the caller's list is not reordered in place.
        self.filenames = sorted(filenames)

    def __len__(self):
        """Number of files in the dataset."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Return the data of the 'low_type_cloud_area_fraction' cube in file ``idx``.

        Raises:
            IndexError: if ``idx`` is outside the file list (this is also what
                ends iteration, e.g. ``list(dataset)``).
        """
        # First matching cube, then index 0 along its leading dimension.
        # NOTE(review): presumably the leading dimension is realization or
        # time - confirm against the files.
        cube = iris.load(self.filenames[idx],
                         'low_type_cloud_area_fraction')[0][0]

        return cube.data
    
    
class PrecipOffsetsDataset():
    """Stacks a block-averaged field, four shifted copies of it, and the field.

    Each item loads one file, crops it to ``[-540:, :420]``, block-averages it
    with ``scale_factor`` x ``scale_factor`` blocks and returns an array whose
    leading axis holds, in order:

    0. the block-averaged field,
    1. the coarse grid shifted one block towards higher row indices,
    2. the coarse grid shifted one block towards lower row indices,
    3. the coarse grid shifted one block towards higher column indices,
    4. the coarse grid shifted one block towards lower column indices,
    5. the cropped full-resolution field.

    Layers 0-4 are repeated back to full resolution and cropped to the field's
    shape; cells shifted in from outside the grid are zero.

    Note: despite the "Precip" name, the cube that is loaded is
    'low_type_cloud_area_fraction'.

    Args:
        filenames: iterable of file paths; a sorted copy is stored.
    """

    # Block edge length, in grid cells.
    scale_factor = 8
    # How many randomly chosen replacement files to try when one fails to load.
    max_retries = 10

    def __init__(self, filenames):
        # sorted() makes a copy, so the caller's list is not reordered in place.
        self.filenames = sorted(filenames)

    def __len__(self):
        """Number of files in the dataset."""
        return len(self.filenames)

    def _load(self, filename):
        """Load one file and build its stacked layers (no error handling)."""
        # First matching cube, then index 0 along its leading dimension.
        # NOTE(review): presumably the leading dimension is realization or
        # time - confirm against the files.
        precip = iris.load(filename, 'low_type_cloud_area_fraction')[0][0]
        precip = precip[-540:, :420]
        field = precip.data

        scale_factor = self.scale_factor
        n_rows, n_cols = field.shape
        row_starts = list(range(0, n_rows, scale_factor))
        col_starts = list(range(0, n_cols, scale_factor))
        block_sums = np.add.reduceat(np.add.reduceat(field, row_starts),
                                     col_starts, axis=1)
        # The crop is not a multiple of the block size, so trailing blocks are
        # partial; divide each block by its own cell count.
        row_counts = np.array([min(scale_factor, n_rows - start) for start in row_starts])
        col_counts = np.array([min(scale_factor, n_cols - start) for start in col_starts])
        low_res = block_sums / (row_counts.reshape(-1, 1) * col_counts)

        zero_row = np.zeros([1, low_res.shape[1]])
        zero_col = np.zeros([low_res.shape[0], 1])
        # Each shift drops the row/column that falls off one edge and pads the
        # opposite edge with zeros. (Padding the same edge that was dropped
        # would only blank that edge instead of shifting the grid.)
        offsets = [
            low_res,
            np.concatenate([zero_row, low_res[:-1]]),
            np.concatenate([low_res[1:], zero_row]),
            np.concatenate([zero_col, low_res[:, :-1]], axis=1),
            np.concatenate([low_res[:, 1:], zero_col], axis=1),
        ]

        # Crop the repeated layers back to the field's shape so that all six
        # layers stack into one regular array.
        lrs = [offset.repeat(scale_factor, 0).repeat(scale_factor, 1)[:n_rows, :n_cols]
               for offset in offsets]

        return np.array(lrs + [field])

    def __getitem__(self, idx):
        """Return the stacked layers for file ``idx``.

        If the file cannot be loaded or processed, a randomly chosen file is
        used instead (best effort), for at most ``max_retries`` replacements.

        Raises:
            IndexError: if ``idx`` is outside the file list. This is raised
                before any fallback so that iteration (``list(dataset)``)
                terminates.
            RuntimeError: if the requested file and every replacement failed.
        """
        # Resolve the index outside the try block: an out-of-range index is a
        # caller error (and the end-of-iteration signal), not a bad file.
        filename = self.filenames[idx]

        last_error = None
        for _ in range(self.max_retries + 1):
            try:
                return self._load(filename)
            except Exception as err:
                last_error = err
                filename = self.filenames[np.random.randint(0, len(self.filenames))]

        raise RuntimeError(
            'could not load a usable file after {} attempts'.format(self.max_retries + 1)
        ) from last_error

In [5]:
# Files for days 1-21 of each selected month; the loaded fields are later
# saved under the 'data' key.
c = CloudDataset(
    list_mogreps_uk(
        folder='/home/ubuntu/sss/mogreps-uk',
        years=[2016],
        months=[5, 6, 7, 8, 9, 10],
        days=list(range(1, 22)),
        hours=[3, 9, 15, 21],
        fcsts=[3],
    )
)

In [6]:
# Files for days 24-28 of each selected month; the loaded fields are later
# saved under the 'test' key.
ct = CloudDataset(
    list_mogreps_uk(
        folder='/home/ubuntu/sss/mogreps-uk',
        years=[2016],
        months=[5, 6, 7, 8, 9, 10],
        days=list(range(24, 29)),
        hours=[3, 9, 15, 21],
        fcsts=[3],
    )
)

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [8]:
%%time
# Load every field of `c` into memory, then stack them along a new leading axis.
inp = [field for field in c]
inp_array = np.stack(inp, axis=0)

CPU times: user 2min 57s, sys: 1.7 s, total: 2min 59s
Wall time: 20min 34s


In [9]:
%%time
# Load every field of `ct` into memory, then stack them along a new leading axis.
inpt = [field for field in ct]
inpt_array = np.stack(inpt, axis=0)

CPU times: user 42.5 s, sys: 380 ms, total: 42.9 s
Wall time: 4min 48s


In [10]:
# Make sure the output directory exists first: the save would otherwise fail
# on a missing 'data/' folder after the long loading cells have already run.
os.makedirs('data', exist_ok=True)
# Writes 'data/cloud_data.npz' (NumPy appends the .npz extension) holding the
# stacked fields under the keys 'data' and 'test'.
np.savez_compressed('data/cloud_data', data=inp_array, test=inpt_array)