In [1]:
import xarray as xr
import numpy as np
import os
import arrow
import pandas as pd

In [2]:
# Open the source HDF5 collection read-only and list the keys it contains.
# The handle stays open because later cells read frames from it.
file_hdf = "H:/github/python/seafog/data/collection/ec201810-202205.hdf"
store_hdf = pd.HDFStore(path=file_hdf, mode='r')
print(store_hdf.keys())

['/df_000', '/df_003', '/df_006', '/df_009', '/df_012', '/df_015', '/df_018', '/df_021', '/df_024', '/df_027', '/df_030', '/df_033', '/df_036', '/df_039', '/df_042', '/df_045', '/df_048', '/df_051', '/df_054', '/df_057', '/df_060', '/df_063', '/df_066', '/df_069', '/df_072', '/df_078', '/df_084', '/df_090', '/df_096', '/df_102', '/df_108', '/df_114', '/df_120', '/df_126', '/df_132', '/df_138', '/df_144', '/df_150', '/df_156', '/df_162', '/df_168', '/df_174', '/df_180', '/df_186', '/df_192', '/df_198', '/df_204', '/df_210', '/df_216', '/df_222', '/df_228', '/df_234', '/df_240']


In [8]:
# Inspect the column dtypes of the first stored frame.
store_hdf['df_000'].dtypes

visi                      float32
v100                      float32
v10m                      float32
u100                      float32
u10m                      float32
t2mm                      float32
t2md                      float32
sstk                      float32
actual_time        datetime64[ns]
init_time          datetime64[ns]
year                        int64
month                       int64
day                         int64
hour                        int64
year_sin                  float64
year_cos                  float64
day_sin                   float64
day_cos                   float64
station_vis               float64
station_rain1             float64
fc_hour                     int64
station_vis_cat             int64
dtype: object

In [3]:
# Forecast hours to load: every 3 from 0 through 72, then every 6 from 78 through 168.
# The store also holds keys for 174-240; those are not loaded here.
fcHour_list = [*range(0, 72+1, 3), *range(78, 168+1, 6)]

# Read one frame per forecast hour from its zero-padded 'df_XXX' key.
df_list = [store_hdf.get(f'df_{iHour:0>3d}') for iHour in fcHour_list]


In [4]:
# Stack the per-forecast-hour frames row-wise under a fresh 0..N-1 index,
# then print the (truncated) combined frame.
df_all = pd.concat(objs=df_list, axis=0, ignore_index=True)
print(df_all)

                visi      v100      v10m      u100      u10m       t2mm  \
0                NaN -4.216614 -3.825562 -5.980148 -5.197968  26.704132   
1                NaN -1.952545 -1.704575 -8.517914 -7.353165  26.063812   
2                NaN -4.839691 -4.271530 -5.275482 -4.495422  27.667206   
3                NaN -2.585587 -2.226852 -8.449860 -7.285568  27.022125   
4                NaN -3.704269 -3.301163 -7.487183 -6.585602  26.959686   
...              ...       ...       ...       ...       ...        ...   
104288  24816.580078  1.745605  1.309647 -3.343811 -2.905838  29.508759   
104289  30597.847656  6.154083  5.305450  5.172195  4.246368  29.167603   
104290  11988.353516  1.679657  1.414215  9.496231  7.925568  28.753448   
104291  10508.394531  4.651825  4.102783  5.518433  4.695557  28.230042   
104292  16544.072266 -0.748978 -0.719742  3.466263  3.241425  28.059601   

             t2md       sstk         actual_time           init_time  ...  \
0       21.493073  28.

In [5]:
# Show the column labels of the combined frame.
df_all.columns

Index(['visi', 'v100', 'v10m', 'u100', 'u10m', 't2mm', 't2md', 'sstk',
       'actual_time', 'init_time', 'year', 'month', 'day', 'hour', 'year_sin',
       'year_cos', 'day_sin', 'day_cos', 'station_vis', 'station_rain1',
       'fc_hour', 'station_vis_cat'],
      dtype='object')

In [6]:
# Partition the combined frame by the station visibility category code:
# 0 -> fog, 1 -> mist, 2 -> clear.
df_fog, df_mist, df_clear = (
    df_all[df_all['station_vis_cat'] == cat_code] for cat_code in (0, 1, 2)
)

In [6]:
# Row counts per category, one per line: fog, mist, clear.
print(len(df_fog), len(df_mist), len(df_clear), sep='\n')

960
29001
74332


In [7]:
# Keep a random half of the clear-category rows (presumably to reduce how much
# this class outnumbers fog and mist -- confirm the intended ratio).
# A fixed random_state makes the draw identical on every run, so the dataset
# written further down is reproducible after a kernel restart.
df_clear_sample = df_clear.sample(frac=0.5, random_state=42)
print(len(df_clear_sample))

37166


In [8]:
# Train / valid / test proportions.
# Each category (fog, mist, clear) is shuffled and split on its own, so every
# subset keeps the same class proportions; the per-category pieces are merged
# afterwards.
# train : valid : test = 6 : 1 : 2

partial_train = 6.0/9.0
partial_valid = 1.0/9.0
partial_test = 2.0/9.0


def _split_by_ratio(df, train_frac, valid_frac, seed=42):
    """Shuffle ``df`` reproducibly and cut it into train, valid and test frames.

    Rows are shuffled before slicing because the per-category frames are
    positionally ordered: fog and mist inherit the concatenation order of the
    per-forecast-hour frames, so slicing them unshuffled would put only the
    largest forecast hours into the test slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single category.
    train_frac, valid_frac : float
        Fractions of ``len(df)`` assigned to the train and valid slices.
        Sizes are truncated with ``int()``; all remaining rows go to test.
    seed : int
        ``random_state`` passed to ``DataFrame.sample`` for the shuffle.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, valid, test)``, disjoint and together covering ``df``.
    """
    shuffled = df.sample(frac=1, random_state=seed)
    n_train = int(len(shuffled) * train_frac)
    n_valid = int(len(shuffled) * valid_frac)
    return (
        shuffled.iloc[:n_train],                   # training set
        shuffled.iloc[n_train:n_train + n_valid],  # validation set
        shuffled.iloc[n_train + n_valid:],         # test set
    )


df_fog_train, df_fog_valid, df_fog_test = _split_by_ratio(df_fog, partial_train, partial_valid)
df_mist_train, df_mist_valid, df_mist_test = _split_by_ratio(df_mist, partial_train, partial_valid)
df_clear_train, df_clear_valid, df_clear_test = _split_by_ratio(df_clear_sample, partial_train, partial_valid)


In [9]:
# Merge the fog, mist and clear pieces of each subset into one frame,
# renumbering rows from 0 in every result.
df_train, df_valid, df_test = (
    pd.concat(class_parts, ignore_index=True)
    for class_parts in (
        [df_fog_train, df_mist_train, df_clear_train],
        [df_fog_valid, df_mist_valid, df_clear_valid],
        [df_fog_test, df_mist_test, df_clear_test],
    )
)

In [10]:
# Shuffle the merged train and valid frames so the three categories are
# interleaved rather than stored as consecutive blocks. A fixed random_state
# keeps the row order identical across runs.
df_train = df_train.sample(frac=1, random_state=42)
df_valid = df_valid.sample(frac=1, random_state=42)
# df_test is left in its concatenated order (fog, then mist, then clear).


In [11]:
# Row counts of the final subsets, one per line: train, valid, test.
print(len(df_train), len(df_valid), len(df_test), sep='\n')

44751
7457
14919


In [14]:
# Persist the three subsets to a fresh HDF5 file under the keys
# 'train', 'valid' and 'test'.
fog_dataset_hdf = "H:/github/python/seafog/data/collection/fog_dataset.hdf"

# mode='w' truncates any existing file. The context manager guarantees the
# store is flushed and closed even if a write fails, so the file is never left
# open (and locked) by the kernel.
with pd.HDFStore(fog_dataset_hdf, mode='w') as store_dataset:
    df_train.to_hdf(store_dataset, key='train', mode='a')
    df_valid.to_hdf(store_dataset, key='valid', mode='a')
    df_test.to_hdf(store_dataset, key='test', mode='a')

# Also export CSV copies into the current working directory.
df_train.to_csv('train.csv')
df_valid.to_csv('valid.csv')
df_test.to_csv('test.csv')

In [None]:
# Extract the required variables to form x, and extract vis_cat (the station_vis_cat column) as y

In [13]:
# Model inputs: wind components, 2 m temperatures, SST-named field and the
# cyclic time encodings. The target is the station visibility category.
x_columns = [
    'v100', 'v10m', 'u100', 'u10m',
    't2mm', 't2md', 'sstk',
    'year_sin', 'year_cos', 'day_sin', 'day_cos',
]

# For each subset, pull the feature frame (x) and the label series (y).
df_train_x, df_train_y = df_train[x_columns], df_train['station_vis_cat']
df_valid_x, df_valid_y = df_valid[x_columns], df_valid['station_vis_cat']
df_test_x, df_test_y = df_test[x_columns], df_test['station_vis_cat']



In [14]:
# Convert every feature frame / label series pair to plain NumPy arrays.
train_x, train_y = df_train_x.to_numpy(), df_train_y.to_numpy()
valid_x, valid_y = df_valid_x.to_numpy(), df_valid_y.to_numpy()
test_x, test_y = df_test_x.to_numpy(), df_test_y.to_numpy()


In [15]:
# Shapes of the training features and labels, one per line.
print(train_x.shape, train_y.shape, sep='\n')

(44751, 11)
(44751,)


In [12]:
# Convert the whole training frame (all columns) to one array for inspection.
# The frame mixes float, integer and datetime64 columns, so the result is an
# object-dtype array rather than a numeric one.
df_train.to_numpy()

array([[4046.435546875, -3.0660858154296875, -2.6463775634765625, ...,
        0.0, 12, 2],
       [13515.86328125, 8.642013549804688, 5.7971038818359375, ..., 0.0,
        39, 2],
       [9900.552734375, -3.7729339599609375, -3.2535247802734375, ...,
        0.0, 24, 1],
       ...,
       [9310.4619140625, 4.059478759765625, 2.4014892578125, ..., 0.0,
        30, 1],
       [21499.46875, -3.150909423828125, -2.913055419921875, ..., 0.0,
        57, 1],
       [3296.634033203125, -0.6565093994140625, -0.273406982421875, ...,
        0.0, 45, 2]], dtype=object)