# Import packages

In [1]:
# Reading/Writing Data
import os
import glob
import numpy as np
import math 

from sklearn.ensemble import RandomForestRegressor

# Self-Defined Package
from Preprocessing import random_spatial_sequence
from SMAPDataset import SMAPDataset

# Some Utility Functions

In [2]:
def same_seed(seed): 
    '''Fixes random number generator seeds for reproducibility.'''
    np.random.seed(seed)

# Configurations
`config` contains hyper-parameters for training and the path to save your model.

In [3]:
config = {
    'seed': 123456789,      # Your seed number, you can pick your lucky number. :)
    'test_ratio': 0.1,
    'root': 'D:\\1GRADUATED\\paper\\downscaling_data\\Soil_moisture_downscale_czt\\DATASET\\'
}

# Dataloader
Read data from files and set up training, validation, and testing sets.

In [4]:
# # Set seed for reproducibility
same_seed(config['seed'])

##### 考虑到需要获取空间上均匀分布的站点数据，所以在每帧中随即挑选站点作为训练集，其余的作为验证集和测试集。因此使用一个字典来保存每天被筛选出来的站点list

In [5]:
train_sequence = {}
test_sequence = {}
full_sequence = {}
s2s_dir = config['root'] + 'LABEL\\SMAPID2INSITUID'
subdir_list = sorted(os.listdir(s2s_dir))

# 遍历指定路径下的所有子目录
for subdir in subdir_list:
    day = int(subdir.split('2015')[-1])
    if day>=121 and day<=151:
        continue
    if day>=182 and day<=243:
        continue
    subdir_path = os.path.join(s2s_dir, subdir)
    if os.path.isdir(subdir_path):
        # 如果子目录是目录而不是文件，则将子目录加入字典
        full_spatial_sequence_smap = sorted([int(f.split('.')[0]) for f in os.listdir(subdir_path) if f.endswith('.npy')]) # !!!! read out of order
        print(len(full_spatial_sequence_smap), 'of Full Spatial Sequence for', subdir, ':')
        print(full_spatial_sequence_smap)
        
        # Split the train\valid\test dataset by spatial dimension
        train_sequence[subdir], test_sequence[subdir] = random_spatial_sequence(1-config['test_ratio'], full_spatial_sequence_smap)
#         full_sequence[subdir], _ = random_spatial_sequence(1, full_spatial_sequence_smap)

10 of Full Spatial Sequence for 2015104 :
[9, 15, 17, 18, 20, 26, 29, 31, 40, 43]
**************************Data Spliting***************************
Spliting Rate:  0.9
9 of Dataset1:  [31  9 29 18 20 26 43 15 17]
1 of Dataset2:  [40]
**************************Data Spliting***************************
6 of Full Spatial Sequence for 2015106 :
[15, 16, 20, 26, 27, 29]
**************************Data Spliting***************************
Spliting Rate:  0.9
5 of Dataset1:  [20 16 29 15 27]
1 of Dataset2:  [26]
**************************Data Spliting***************************
4 of Full Spatial Sequence for 2015114 :
[9, 33, 44, 51]
**************************Data Spliting***************************
Spliting Rate:  0.9
3 of Dataset1:  [33 44  9]
1 of Dataset2:  [51]
**************************Data Spliting***************************
14 of Full Spatial Sequence for 2015115 :
[15, 16, 17, 18, 20, 26, 29, 31, 32, 33, 40, 41, 43, 44]
**************************Data Spliting***************************

In [6]:
# print(full_sequence)

In [7]:
# Initialize the dataset
train_dataset = SMAPDataset(config['root'], train_sequence)
test_dataset = SMAPDataset(config['root'], test_sequence)

***************************Load data path******************************
_______________________________2015104_______________________________
_____________________________smap cell: 31_____________________________
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\SMAP\2015104\31.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\TEXTURE\31.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\SM\2015104\10.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\ATI\2015104\10.npy[0m
_____________________________smap cell: 9_____________________________
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\SMAP\2015104\9.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\TEXTURE\9.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABE

[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\SMAP\2015290\26.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\TEXTURE\26.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\SM\2015290\40.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\ATI\2015290\40.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\SM\2015290\25.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\ATI\2015290\25.npy[0m
_____________________________smap cell: 45_____________________________
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\SMAP\2015290\45.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\TEXTURE\45.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LA

[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\SMAP\2015280\31.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\TEXTURE\31.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\SM\2015280\10.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\ATI\2015280\10.npy[0m
_______________________________2015282_______________________________
_____________________________smap cell: 9_____________________________
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\SMAP\2015282\9.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\INPUT\TEXTURE\9.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\SM\2015282\55.npy[0m
[32mD:\1GRADUATED\paper\downscaling_data\Soil_moisture_downscale_czt\DATASET\LABEL\ATI\2015282\55.npy[0m
_____________

In [8]:
print(len(train_dataset))
print(len(test_dataset))

203
35


# Dataset Generation

In [9]:
def calculate_standard_deviation(theta_ij, theta_bar, ati, ati_mean, ati_sd):
    return (theta_ij-theta_bar)/(ati-ati_mean)*ati_sd

In [10]:
# train_dataset = [x for x in train_dataset if '40' in x['meta_data']['insituid']]
# test_dataset = train_dataset[0:7]
# train_dataset = train_dataset[7:]

In [11]:
# 去掉站点值为0的数据
train_dataset = [x for x in train_dataset if x['label_data'][0][0]!=0]
test_dataset = [x for x in test_dataset if x['label_data'][0][0]!=0]
print(len(train_dataset))
print(len(test_dataset))

157
26


In [None]:
# 筛选某些站点 06 37 42
train_dataset = [x for x in train_dataset if x['meta_data']['insituid'][0]=='6'
                                        or x['meta_data']['insituid'][0]=='37'
                                        or x['meta_data']['insituid'][0]=='42']
test_dataset = [x for x in test_dataset if x['meta_data']['insituid'][0]=='6'
                                        or x['meta_data']['insituid'][0]=='37'
                                        or x['meta_data']['insituid'][0]=='42']
print(len(train_dataset))
print(len(test_dataset))

In [12]:
train_process_data = [x['processed_data'] for x in train_dataset]
train_label_data = [x['label_data'] for x in train_dataset]
train_meta_data = [x['meta_data'] for x in train_dataset]

test_process_data = [x['processed_data'] for x in test_dataset]
test_label_data = [x['label_data'] for x in test_dataset]
test_meta_data = [x['meta_data'] for x in test_dataset]

In [13]:
x = np.asarray([[[1.1,1.2,1.3],[2.1,2.2,2.3]],[[3.1,3.2,3.3],[4.1,4.2,4.3]]])
print(x)
print(x.reshape((4,3)))

[[[1.1 1.2 1.3]
  [2.1 2.2 2.3]]

 [[3.1 3.2 3.3]
  [4.1 4.2 4.3]]]
[[1.1 1.2 1.3]
 [2.1 2.2 2.3]
 [3.1 3.2 3.3]
 [4.1 4.2 4.3]]


In [14]:
X_train = [x.ravel() for x in train_process_data]
X_train = np.asarray(X_train)
print(X_train.shape)

train_sd_list = []
for data_list in train_label_data:
    sd_i_list = []
    for data in data_list:
        sd = calculate_standard_deviation(data[0][0], data[1][0], data[2][0], data[2][1], data[2][2])
        train_sd_list.append(sd)
        # 只取一个站点
        break
#     train_sd_list.append(sd_i_list)
train_sd_list = np.asarray(train_sd_list)
print(train_sd_list)

(157, 2299)
[-7.87583345e-01  4.03777750e-01 -1.03470827e-01  9.50151604e-02
 -9.04444448e-02 -1.53884065e-01 -5.81478058e-01  1.99460778e-01
 -3.09556265e-02  1.12752355e+00  3.54027636e-02 -2.48613393e-01
 -8.49563676e-02  5.49586911e-03  6.02822954e-01 -9.27167267e-03
  4.36423046e-02 -1.10147217e-01  1.58852464e-02 -9.02380475e-02
 -1.14459697e-03 -1.43191197e-01 -1.79203588e+00  4.71677299e-01
  6.13055663e-02 -4.63703379e-02  1.62178584e-01 -8.43111090e-01
  6.66583645e-01  3.20070859e+01  1.39939516e-01 -2.18588841e-01
  2.70505609e-02  6.89989718e-03 -1.64530291e-01  6.19198966e-01
 -5.74009476e+00  3.48188992e-02  2.29418943e-01 -3.51789456e-01
 -1.75137200e-01  5.23156703e-01  4.32193146e-02  3.42526676e-02
 -3.73244162e-01  4.01356857e-02 -9.44572387e-03 -1.80555891e-01
 -3.69425137e-01  5.29255669e-02 -3.50416147e-02 -1.56733779e-01
 -7.33458374e-02 -3.83645619e-02 -6.50310543e-02  3.98281342e-01
  5.11289694e-01 -2.14572705e-01  1.56193112e-01 -2.08352482e-02
 -1.74850605e

In [15]:
X_test = [x.ravel() for x in test_process_data]
X_test = np.asarray(X_test)
print(X_test.shape)

test_sd_list = []
for data_list in test_label_data:
    sd_i_list = []
    for data in data_list:
        sd = calculate_standard_deviation(data[0][0], data[1][0], data[2][0], data[2][1], data[2][2])
        test_sd_list.append(sd)
        break
#     test_sd_list.append(sd_i_list)
test_sd_list = np.asarray(test_sd_list)
print(test_sd_list.shape)

(26, 2299)
(26,)


# Start training!

In [36]:
rf = RandomForestRegressor(n_estimators=1, oob_score=False)
rf.fit(X_train, train_sd_list)
train_accuracy = rf.score(X_train, train_sd_list)
print(train_accuracy)

0.6461989650057324


  warn("Some inputs do not have OOB scores. "


## Testing

In [37]:
y_pred = rf.predict(X_test)
accuracy = rf.score(X_test, test_sd_list)
print(accuracy)
print(y_pred)

-266.6076352614256
[-2.96150239e-01 -1.06206401e-02 -3.83317800e-02 -1.06206401e-02
 -1.34482400e-01  4.71677299e-01 -1.73125405e-01  8.05201768e-02
 -2.48439977e-01 -5.74009476e+00 -2.96150239e-01  6.89989718e-03
  7.40374430e-02 -3.09556265e-02 -7.87583345e-01 -2.71882259e-01
  3.98281342e-01 -3.83317800e-02 -1.54318516e+00  3.20070859e+01
 -1.83429431e-02 -3.09556265e-02 -1.34482400e-01  3.98281342e-01
 -1.19009789e-01 -3.83645619e-02]
