# Importing Packages

* `rasterio` and `matplotlib` are used to open, read, and visualize the S2 images; `numpy`, `PIL`, and `pandas` handle array manipulation, resampling, and the CDL lookup table.

In [4]:
import rasterio as rio
import matplotlib.pyplot as plt
import numpy as np
np.set_printoptions(suppress=True)
from PIL import Image
import os
import pandas as pd
import itertools as it

# Image Preprocessing: Streamlined Functions

In [5]:
def save_cdl_img(path, size=(294, 331)):
    """Read the first band of a CDL raster and resample it to a fixed grid.

    Despite the name, nothing is written to disk here; the resampled label
    array is returned to the caller.

    Parameters
    ----------
    path : str
        Path to a raster file that rasterio can open.
    size : tuple of int, optional
        Target ``(width, height)`` in pixels, as expected by ``Image.resize``.
        Defaults to ``(294, 331)``.

    Returns
    -------
    numpy.ndarray
        2-D array of shape ``(height, width)`` holding the resampled labels.
    """
    # The context manager closes the dataset handle even if reading fails.
    with rio.open(path) as cdl:
        # rasterio band indexes are 1-based; read only the band that is used.
        cdl_band = cdl.read(1)

    cdl_im = Image.fromarray(cdl_band)
    # Nearest neighbour keeps the labels categorical: no new (blended) class
    # codes can be introduced by the resampling.
    cdl_im = cdl_im.resize(size, resample=Image.NEAREST)
    return np.array(cdl_im)

In [10]:
# np array data to image, running interpolation, converting back to array
def interp_pix_arrays(img_bands, band: int, size=(294, 331), resample=Image.BICUBIC):
    """Resample one band of a band-first image stack to a fixed grid.

    Parameters
    ----------
    img_bands : numpy.ndarray
        Image stack indexed as ``[band, row, col]``.
    band : int
        Index (0-based) of the band to resample.
    size : tuple of int, optional
        Target ``(width, height)`` in pixels, as expected by ``Image.resize``.
        Defaults to ``(294, 331)``.
    resample : int, optional
        PIL resampling filter. Defaults to bicubic, which can overshoot and
        produce negative values; pass ``Image.BILINEAR`` to avoid that.

    Returns
    -------
    numpy.ndarray
        2-D array of shape ``(height, width)`` with the resampled band.
    """
    # The band is scaled by 255 before it is handed to PIL and scaled back
    # afterwards, so the returned values are in the input's original units.
    im = Image.fromarray(img_bands[band, :, :] * 255)
    im = im.resize(size, resample=resample)
    return np.array(im) / 255

In [6]:
def s2_to_array(path, n_bands: int = 21):
    """Load an S2 raster and resample its leading bands to the common grid.

    Parameters
    ----------
    path : str
        Path to a raster file that rasterio can open.
    n_bands : int, optional
        Number of leading bands to keep. Defaults to 21, which drops the
        bands at indexes 21, 22 and 23.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_bands, height, width)`` built from the bands
        resampled by ``interp_pix_arrays``.

    Raises
    ------
    IndexError
        If the raster has fewer than ``n_bands`` bands.
    """
    # The context manager closes the dataset handle once the pixels are read.
    with rio.open(path) as s2_10:
        s2_allbands = s2_10.read()

    data = np.stack(
        [interp_pix_arrays(s2_allbands, band) for band in range(n_bands)]
    )
    print(data.shape)

    return data

In [7]:
# Saving CDL data
cdl_imgs = 'data/ee/cdl_imgs/'

# os.listdir() returns names in arbitrary order. The tiles are concatenated
# row-wise below and later paired with S2 pixels by position, so the order
# has to be deterministic: sort the file names.
cdl_data = [
    save_cdl_img(os.path.join(cdl_imgs, filename))
    for filename in sorted(os.listdir(cdl_imgs))
    if filename.endswith(".tif")
]

# (n_tiles, rows, cols) -> tiles laid end to end along the row axis.
cdl_data = np.stack(cdl_data)
cdl_data_final = np.concatenate(cdl_data, axis=0)
print(cdl_data_final.shape)

np.save('cdl-array.npy', cdl_data_final)

(3310, 294)


In [11]:
def s2_difftimes(month):
    """Build the S2 mosaic for one month from every ``.tif`` tile on disk.

    Parameters
    ----------
    month : str
        Name of the sub-directory of ``data/ee/s2_imgs/`` holding the tiles
        (for example ``'oct'``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(bands, n_tiles * rows, cols)``: the tiles are laid
        end to end along the row axis and NaNs are replaced by
        ``np.nan_to_num``.

    Raises
    ------
    ValueError
        If the directory contains no ``.tif`` files.
    """
    s2_imgs = 'data/ee/s2_imgs/' + month + '/'

    # os.listdir() returns names in arbitrary order. Tiles are paired with
    # the label raster by position, so the order must be deterministic.
    tile_names = sorted(
        filename for filename in os.listdir(s2_imgs) if filename.endswith(".tif")
    )
    if not tile_names:
        raise ValueError('no .tif files found in {}'.format(s2_imgs))

    s2_month_data = [
        s2_to_array(os.path.join(s2_imgs, filename)) for filename in tile_names
    ]
    print('S2 data for month of {} compiled'.format(month))

    # Join the per-tile (bands, rows, cols) arrays along the row axis.
    s2_month_data = np.concatenate(s2_month_data, axis=1)
    s2_month_data = np.nan_to_num(s2_month_data)

    return s2_month_data

In [12]:
# One mosaic per month, stacked along the band axis in the order listed here.
months = ['oct', 'jan', 'apr', 'jul']
data = [s2_difftimes(month_name) for month_name in months]

s2_data = np.concatenate(data, axis=0)
s2_data.shape

(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
S2 data for month of oct compiled
(21, 3310, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
S2 data for month of jan compiled
(21, 3310, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
S2 data for month of apr compiled
(21, 3310, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
(21, 331, 294)
S2 data for month of jul compiled
(21, 3310, 294)


(84, 3310, 294)

In [13]:
# Sanity check: shape of the stacked S2 array (month-bands, rows, cols).
s2_data.shape

(84, 3310, 294)

In [19]:
# Number of values stored per pixel (length of the leading axis).
s2_data.shape[0]

84

In [20]:
# Persist the stacked S2 array so the steps below can restart from disk.
np.save('training-data-draft.npy', s2_data)

# Finalize Data

In [21]:
# Load the S2 array written by the np.save('training-data-draft.npy', ...) cell
pix_3darray = np.load('training-data-draft.npy')
# Display the shape to confirm the expected array was loaded.
pix_3darray.shape

(84, 3310, 294)

In [22]:
# Load the CDL label raster written by the np.save('cdl-array.npy', ...) cell
cdl_array = np.load('cdl-array.npy')
# List the distinct label codes present before any regrouping.
np.unique(cdl_array)

array([  1,   2,   3,   4,   5,   6,  10,  12,  13,  14,  21,  23,  24,
        26,  27,  28,  29,  31,  32,  33,  35,  36,  37,  41,  42,  43,
        44,  46,  48,  49,  51,  52,  53,  57,  58,  59,  61,  66,  67,
        68,  69,  70,  74,  77, 111, 121, 122, 123, 124, 131, 141, 142,
       143, 152, 176, 190, 195, 205, 206, 208, 220, 225, 228, 229, 236,
       238, 240, 242, 246], dtype=uint8)

In [23]:
# Lookup table from original CDL codes ('old') to my grouped codes ('new');
# the two lists are parallel, one entry per row of the CSV.
df = pd.read_csv('data/CDL Numerical Codes + New Groups.csv')
grps_dict = {
    key: df[column].tolist()
    for key, column in (('old', 'CDL Value'), ('new', 'New Value'))
}

In [24]:
# Remap every original CDL code to its grouped code.
#
# The comparison runs against an untouched snapshot of the labels. Matching
# against the array being modified would be order-dependent: a pixel already
# rewritten to new code X would be remapped a second time whenever X also
# appears later in the 'old' list.
# NOTE: this cell still rewrites cdl_array in place, so re-run the cell that
# loads 'cdl-array.npy' before running it again.
cdl_original = cdl_array.copy()
for old_value, new_value in zip(grps_dict['old'], grps_dict['new']):
    cdl_array[cdl_original == old_value] = new_value

np.unique(cdl_array, return_counts=True)

(array([  1,   2,   3,   5,   6,  22,  44,  88, 200], dtype=uint8),
 array([191014,  46570,  61282, 191287,  16717, 160132, 132476, 146687,
         26975]))

In [27]:
# Flatten the (values, rows, cols) S2 array plus the (rows, cols) label
# raster into one record per pixel: all S2 values followed by the label.
n_values, n_rows, n_cols = pix_3darray.shape
if cdl_array.shape != (n_rows, n_cols):
    raise ValueError(
        'label raster shape {} does not match S2 grid {}'.format(
            cdl_array.shape, (n_rows, n_cols)
        )
    )

# Records are ordered column by column (col outer, row inner), so the table
# is filled through a (cols, rows, values + 1) view. The dtype follows the
# usual NumPy promotion of the two inputs.
pix_table = np.empty(
    (n_cols, n_rows, n_values + 1),
    dtype=np.result_type(pix_3darray.dtype, cdl_array.dtype),
)
pix_table[..., :n_values] = pix_3darray.transpose(2, 1, 0)
pix_table[..., n_values] = cdl_array.T

# Keep the list-of-1-D-arrays form that the following cell stacks.
pix_1d_rows = list(pix_table.reshape(n_cols * n_rows, n_values + 1))

len(pix_1d_rows)

973140

In [28]:
# Final training table: one row per pixel, S2 values first, label last.
pix_2darray_final = np.stack(pix_1d_rows, axis=0)
# The former np.unique(..., return_counts=True) call here was removed: its
# result was neither stored nor displayed, so it only cost a full sort.
np.save('training-data-final.npy', pix_2darray_final)