# Import packages

In [1]:
# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# Some Utility Functions

You do not need to modify this part.

In [2]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [3]:
def random_spatial_sequence(split_rate, full_sequence):
    counts = len(full_sequence)
    len1 = int(counts * split_rate)
    len2 = counts - len1
    index1 = np.random.choice(full_sequence, len1, replace=False)
    index2 = np.setdiff1d(full_sequence, index1)
    print('**************************Data Spliting***************************')
    print('Spliting Rate: ', split_rate)
    print(len1, 'of Dataset1: ',index1)
    print(len2, 'of Dataset2: ',index2)
    print('**************************Data Spliting***************************')
    return index1, index2

In [4]:
# spatial_sequence = [2, 8, 9, 10, 11, 12, 15, 20, 25, 30, 34, 40, 44]
# print(len(spatial_sequence))
# random_spatial_sequence(0.7, spatial_sequence)

In [5]:
def collate_fn(batch):
#     print(batch)
#     print(len(batch[0]))
    # 从每个样本的字典中获取处理结果、标签和其他数据，并将它们存储在同一个字典中
    processed_data = [torch.FloatTensor(sample["processed_data"]) for sample in batch]
    label_data = [sample["label_data"] for sample in batch]
    
    # 将所有需要传递的数据都保存在同一个字典中
    batch_dict = {"processed_data": torch.stack(processed_data),
                  "label_data": label_data}
    
    return batch_dict

In [6]:
def calculate_upscaling_sm(sm_bar, sm_bar_sd, ati, ati_bar, ati_bar_sd):
    return sm_bar + sm_bar_sd * (ati - ati_bar) / ati_bar_sd

In [7]:
def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    for x in tqdm(test_loader):
        x = x.to(device)                        
        with torch.no_grad():                   
            pred = model(x)                     
            preds.append(pred.detach().cpu())   
    preds = torch.cat(preds, dim=0).numpy()  
    return preds

# Dataset

In [8]:
class SMAPDataset(Dataset):
    '''
    root: root of input data
    temperal_sequence: the sequence of valid day
    spatial_sequence: the sequence of valid SMAPID
    test: the flag to identify if it is the test dataset
    '''
    def __init__(self, root, temperal_sequence, spatial_sequence):
        # variables for input
        self.smap = []
        self.texture = []
        
        # variables for output
        self.sm = []
#         self.smap_unorm = []
        self.ati = [] # contains [ati, atim, atisd] in each element
        
        print('***************************Load data path******************************')
        for i in temperal_sequence: # for example: 20151015
            print('_______________________________' + str(i) + '_______________________________')
            for j in spatial_sequence: # for example: 1
                print('_____________________________smap cell: ' + str(j) + '_____________________________')
                # add path for input variables
                self.smap.append(root + 'INPUT\\SMAP\\' + i + '\\' + str(j) + '.npy')
                self.texture.append(root + 'INPUT\\TEXTURE\\' + str(j) + '.npy')
                # display adding path
                print((root + 'INPUT\\SMAP\\' + i + '\\' + str(j) + '.npy'))
                print((root + 'INPUT\\TEXTURE\\' + str(j) + '.npy'))
#                 print(os.path.exists(root + 'INPUT\\SMAP\\' + i + '\\' + str(j) + '.npy'))
#                 print(os.path.exists(root + 'INPUT\\TEXTURE\\' + str(j) + '.npy'))
                      
                # one smap to many in-situ sm
                smap_to_insitu = np.load(root + "LABEL\\SMAPID2INSITUID\\" + str(j) + '.npy')
                insitu_sm_list = []
                insitu_ati_list = []
                for _id in smap_to_insitu:
                    insitu_sm_list.append(root + "LABEL\\SM\\" + i + "\\" + str(_id) + ".npy")
                    insitu_ati_list.append(root + "LABEL\\ATI\\" + i + "\\" + str(_id) + ".npy")
                    # display adding path
                    print((root + "LABEL\\SM\\" + i + "\\" + str(_id) + ".npy"))
                    print((root + "LABEL\\ATI\\" + i + "\\" + str(_id) + ".npy"))
#                     print(os.path.exists(root + "LABEL\\SM\\" + i + "\\" + str(_id) + ".npy"))
#                     print(os.path.exists(root + "LABEL\\ATI\\" + i + "\\" + str(_id) + ".npy"))
                      
                # add the data of insitu in insitu_list
                self.sm.append(insitu_sm_list)
                self.ati.append(insitu_ati_list)    
                      
    def __getitem__(self, idx):
        smap_path = self.smap[idx]
        texture_path = self.texture[idx]
        smap = np.load(smap_path)
        texture = np.load(texture_path)
        
        data_pkg = {'processed_data': [], 'label_data': []}
        
        # choose flatten as the way to concatenate the input feature
        x = self.__flatten__(smap, texture)
        data_pkg['processed_data'] = x
        
        sm_list = self.sm[idx]
        ati_list = self.ati[idx]
        y = [] # y -> [[sm, smap, ati], ...], sm -> [float], smap -> [float],1 ati -> [ati, atim, atisd]
        for i in range(len(sm_list)):
            sm_path = sm_list[i]
            ati_path = ati_list[i]
            sm = np.load(sm_path, allow_pickle=True)
            ati = np.load(ati_path, allow_pickle=True)
            data_pkg['label_data'].append([sm, smap, ati])      # other_data -> [[sm, smap, ati], ...], 
                                                                    # sm -> [float]
                                                                    # smap -> [float], 
                                                                    # ati -> [ati, atim, atisd]
            
            # test: each element in list of batch should be of equal size
#             break
        return data_pkg

    def __len__(self):
        return len(self.smap)

    ### the way to concatenate input data
    def __flatten__(self, smap, texture):
        # normalization is done before loading
        texture_flat = texture.flatten()
        return  np.concatenate((smap, texture_flat), axis=0)
    
    def get_input_shape(self, idx):
        data_pkg = self.__getitem__(0)
        return data_pkg['processed_data'].shape

In [9]:
# def flatten(smap, texture):
#     texture_flat = np.ravel(texture)
#     print(texture_flat)
#     return  np.concatenate((smap, texture_flat), axis=0)

# root = 'E:\\downscaling\\Soil_moisture_downscale_czt\\DATASET\\INPUT\\'
# smap = np.load(root + 'SMAP\\20151015\\0.npy')
# texture = np.load(root + 'TEXTURE\\0.npy')
# print(smap.shape)
# print(texture.shape)
# print(flatten(smap, texture).shape)

In [10]:
# root = 'E:\\downscaling\\Soil_moisture_downscale_czt\\DATASET\\'
# directory = root + 'LABEL\\SMAPID2INSITUID\\'
# full_spatial_sequence_smap = sorted([int(f.split('.')[0]) for f in os.listdir(directory) if f.endswith('.npy')]) # !!!! read out of order
# print(len(full_spatial_sequence_smap), 'of Full Spatial Sequence: ', full_spatial_sequence_smap)

# train_spatial_seq, test_spatial_seq = random_spatial_sequence(0.7, full_spatial_sequence_smap)
# train_spatial_seq, valid_spatial_seq = random_spatial_sequence(0.8, train_spatial_seq)

# temperal_seq = ['20151015']
# train_dataset = SMAPDataset(root, temperal_seq, train_spatial_seq)
# valid_dataset = SMAPDataset(root, temperal_seq, valid_spatial_seq)
# test_dataset = SMAPDataset(root, temperal_seq, test_spatial_seq)
# train_dataset.get_input_shape()[0]

# Neural Network Model
Try out different model architectures by modifying the class below.

In [11]:
class My_Model(nn.Module):
    def __init__(self, input_dim):
        super(My_Model, self).__init__()
        # TODO: modify model's structure, be aware of dimensions. 
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1)
        )

    def forward(self, x):
        x = self.layers(x)
        x = x.squeeze(1) # (B, 1) -> (B)
        return x

# Self-defined Loss Function


In [12]:
def my_Loss(pred, label_data):
    criterion = nn.MSELoss(reduction='mean')
    loss = 0
    for i, y in enumerate(pred):      # each calculation in a batch
        # calculate the high resolution sm for each in situ
        for situ_pkg in label_data[i]: # each calculation in a smap
            sm_situ = situ_pkg[0][0]
            sm_bar = situ_pkg[1][0]
            ati =  situ_pkg[2][0]
            atim = situ_pkg[2][1]
            atisd = situ_pkg[2][2]
            sm_pred = calculate_upscaling_sm(sm_bar, y, ati, atim, atisd)
            _loss = criterion(sm_pred, torch.FloatTensor([sm_situ]))
            loss = (loss + _loss)
    loss = loss / config['batch_size']
    return loss

# Training Loop

In [13]:
def trainer(train_loader, valid_loader, model, config, device):

    criterion = my_Loss # Define your loss function, do not modify this.
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learning_rate'], momentum=0.9) 

    if not os.path.isdir(config['root'] + '\\OUTPUT\\MODELS\\'):
        os.mkdir(config['root'] + '\\OUTPUT\\MODELS\\') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        for data_pkg in train_loader:
            x = data_pkg['processed_data']
            optimizer.zero_grad()               # Set gradient to zero.
#             x, y = x.to(device), y.to(device)   # Move your data to device. 
            pred = model(x)
            loss = criterion(pred, data_pkg['label_data'])
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())
            
        mean_train_loss = sum(loss_record)/len(loss_record)
#         print('Mean Loss for Epoch {}: {}'.format(epoch, mean_train_loss))

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        for data_pkg in valid_loader:
            x = data_pkg['processed_data']
#             x, y = x.to(device), y.to(device)   # Move your data to device. 
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, data_pkg['label_data'])
            loss_record.append(loss.item())
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['root'] + '\\OUTPUT\\MODELS\\' + str(best_loss) + '.ckpt') # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

# Configurations
`config` contains hyper-parameters for training and the path to save your model.

In [15]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 5201314,      # Your seed number, you can pick your lucky number. :)
    'test_ratio': 0.3,
    'valid_ratio': 0.2,   # validation_size = train_size * valid_ratio
    'n_epochs': 30,     # Number of epochs.            
    'batch_size': 5, 
    'learning_rate': 1e-5,              
    'early_stop': 10,    # If model has not improved for this many consecutive epochs, stop training.     
    'root': 'D:\\1GRADUATED\\paper\\downscaling_data\\Soil_moisture_downscale_czt\\DATASET\\'
}

# Dataloader
Read data from files and set up training, validation, and testing sets. You do not need to modify this part.

In [16]:
# Set seed for reproducibility
same_seed(config['seed'])

# Get the id sequence of all valid SMAP
directory = config['root'] + 'LABEL\\SMAPID2INSITUID\\'
full_spatial_sequence_smap = sorted([int(f.split('.')[0]) for f in os.listdir(directory) if f.endswith('.npy')]) # !!!! read out of order
print(len(full_spatial_sequence_smap), 'of Full Spatial Sequence: ', full_spatial_sequence_smap)

# Split the train\valid\test dataset by spatial dimension
train_spatial_seq, test_spatial_seq = random_spatial_sequence(1-config['test_ratio'], full_spatial_sequence_smap)
train_spatial_seq, valid_spatial_seq = random_spatial_sequence(1-config['valid_ratio'], train_spatial_seq)

# Initialize the dataset
temperal_seq = ['20151015']
train_dataset = SMAPDataset(config['root'], temperal_seq, train_spatial_seq)
valid_dataset = SMAPDataset(config['root'], temperal_seq, valid_spatial_seq)
test_dataset = SMAPDataset(config['root'], temperal_seq, test_spatial_seq)

# Pytorch data loader loads pytorch dataset into batches.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True, collate_fn=collate_fn)

21 of Full Spatial Sequence:  [2, 8, 9, 10, 15, 16, 17, 18, 20, 26, 27, 29, 31, 32, 33, 40, 41, 43, 44, 45, 51]
**************************Data Spliting***************************
Spliting Rate:  0.7
14 of Dataset1:  [ 9 45 31 40 16 27 18 32 43 29 33 44 10 26]
7 of Dataset2:  [ 2  8 15 17 20 41 51]
**************************Data Spliting***************************
**************************Data Spliting***************************
Spliting Rate:  0.8
11 of Dataset1:  [27 31 16 18 44 45 33 40 26 10 29]
3 of Dataset2:  [ 9 32 43]
**************************Data Spliting***************************
***************************Load data path******************************
_______________________________20151015_______________________________
_____________________________smap cell: 27_____________________________
D:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\SMAP\20151015\27.npy
D:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\TEXTURE

# Start training!

In [17]:
model = My_Model(input_dim=train_dataset.get_input_shape(0)[0]).to(device) # put your model and data on the same computation device.
print(train_dataset.get_input_shape(0)[0])
print(model)
trainer(train_loader, valid_loader, model, config, device)

1937
My_Model(
  (layers): Sequential(
    (0): Linear(in_features=1937, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=8, bias=True)
    (3): ReLU()
    (4): Linear(in_features=8, out_features=1, bias=True)
  )
)
Epoch [1/30]: Train loss: 0.3157, Valid loss: 0.3761
Saving model with loss 0.376...
Epoch [2/30]: Train loss: 0.3157, Valid loss: 0.3760
Saving model with loss 0.376...
Epoch [3/30]: Train loss: 0.3157, Valid loss: 0.3760
Saving model with loss 0.376...
Epoch [4/30]: Train loss: 0.3157, Valid loss: 0.3758
Saving model with loss 0.376...
Epoch [5/30]: Train loss: 0.3157, Valid loss: 0.3757
Saving model with loss 0.376...
Epoch [6/30]: Train loss: 0.3156, Valid loss: 0.3755
Saving model with loss 0.376...
Epoch [7/30]: Train loss: 0.3156, Valid loss: 0.3754
Saving model with loss 0.375...
Epoch [8/30]: Train loss: 0.3156, Valid loss: 0.3753
Saving model with loss 0.375...
Epoch [9/30]: Train loss: 0.3155, Valid loss: 0.3751
Saving mode

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [11/30]: Train loss: 0.3154, Valid loss: 0.3749
Saving model with loss 0.375...
Epoch [12/30]: Train loss: 0.3154, Valid loss: 0.3748
Saving model with loss 0.375...
Epoch [13/30]: Train loss: 0.3154, Valid loss: 0.3747
Saving model with loss 0.375...
Epoch [14/30]: Train loss: 0.3153, Valid loss: 0.3746
Saving model with loss 0.375...
Epoch [15/30]: Train loss: 0.3153, Valid loss: 0.3745
Saving model with loss 0.374...
Epoch [16/30]: Train loss: 0.3152, Valid loss: 0.3744
Saving model with loss 0.374...
Epoch [17/30]: Train loss: 0.3152, Valid loss: 0.3742
Saving model with loss 0.374...
Epoch [18/30]: Train loss: 0.3152, Valid loss: 0.3741
Saving model with loss 0.374...
Epoch [19/30]: Train loss: 0.3151, Valid loss: 0.3739
Saving model with loss 0.374...
Epoch [20/30]: Train loss: 0.3151, Valid loss: 0.3738
Saving model with loss 0.374...
Epoch [21/30]: Train loss: 0.3151, Valid loss: 0.3737
Saving model with loss 0.374...
Epoch [22/30]: Train loss: 0.3150, Valid loss: 0.3736


# Testing
The predictions of your model on testing set will be stored at `pred.csv`.

In [18]:
def save_pred(preds, file):
    ''' Save predictions to specified file '''
    with open(file, 'w') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        for i, p in enumerate(preds):
            writer.writerow([i, p])

model = My_Model(input_dim=x_train.shape[1]).to(device)
model.load_state_dict(torch.load(config['save_path']))
preds = predict(test_loader, model, device) 
save_pred(preds, 'pred.csv')         

NameError: name 'x_train' is not defined

# Reference
This notebook uses code written by Heng-Jui Chang @ NTUEE (https://github.com/ga642381/ML2021-Spring/blob/main/HW01/HW01.ipynb)