In [2]:
import os
import xarray as xr
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import time

# Some constants:

In [3]:
# Root folder holding the competition CSV files and the grid-info file.
data_dir = '/home/hawbecker/kaggle/LEAP'

# Sizes used throughout the notebook.
frame_size = 384      # From the test file
total_len = 625000    # size of training set
train_size = 400000   # amount of data from training set we use for training
test_size = total_len - train_size   # training rows left over after train_size
batch_size = 10 * frame_size

# CSV locations, all directly under data_dir.
train_data_file, test_data_file, submission_file = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Load in the data:

In [4]:
#### Sample data:
# Grid-info file: supplies the coordinates and the per-level coefficients.
grid_info_file = os.path.join(data_dir, 'ClimSim_low-res_grid-info.nc')
sample_ds = xr.open_dataset(grid_info_file)

lat, lon = sample_ds.lat, sample_ds.lon
# hyam / hybm are combined in get_point as ak*p0 + bk*state_ps (one value per level).
ak, bk = sample_ds.hyam, sample_ds.hybm
# Number of entries along the first dimension of lat.
stride = len(lat)

In [5]:
# Sample 100 rows of data to determine dtypes.
# (Despite its name, df_test is read from the training file.)
df_test = pd.read_csv(train_data_file, nrows=100)

# Columns inferred as float64 are mapped to float32; the mapping is passed as
# `dtype=` when the full CSV files are read in batches.
float_cols = [name for name, kind in df_test.dtypes.items() if kind == "float64"]
float32_cols = dict.fromkeys(float_cols, np.float32)

# Get the 2D and 3D variable names:

In [6]:
# Get 2d / 3d variables:
# Only the header line of the training file is needed; the first column is skipped.
with open(train_data_file,'r') as f:
    header = f.readline().split(',')

Xvars_2d, Xvars_3d = [], []
Yvars_2d, Yvars_3d = [], []

for col in header[1:]:
    col = col.replace('\n','').split('_')
    is_3d = len(col) == 3 and col[-1].isnumeric()
    if is_3d:
        # Three tokens ending in a number: drop the numeric suffix to get the
        # variable name. A leading 'ptend' marks a target, anything else a feature.
        name = '_'.join(col[:2])
        (Yvars_3d if col[0] == 'ptend' else Xvars_3d).append(name)
    else:
        # Everything else is kept whole; 'out' as second token marks a target.
        name = '_'.join(col)
        (Yvars_2d if col[1] == 'out' else Xvars_2d).append(name)

# Each 3-D name was appended once per level: collapse to sorted unique names.
Xvars_3d = list(np.unique(Xvars_3d))
Yvars_3d = list(np.unique(Yvars_3d))
print(Xvars_3d)
print(Yvars_3d)

['pbuf_CH4', 'pbuf_N2O', 'pbuf_ozone', 'state_q0001', 'state_q0002', 'state_q0003', 'state_t', 'state_u', 'state_v']
['ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_t', 'ptend_u', 'ptend_v']


In [7]:
# Show the 2-D feature names first, then the 2-D target names.
for names_2d in (Xvars_2d, Yvars_2d):
    print(names_2d)

['state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS', 'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP', 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHLAND']
['cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECSC', 'cam_out_PRECC', 'cam_out_SOLS', 'cam_out_SOLL', 'cam_out_SOLSD', 'cam_out_SOLLD']


In [8]:
# Combined name lists: features first, targets second.
vars_3d = [*Xvars_3d, *Yvars_3d]
vars_2d = [*Xvars_2d, *Yvars_2d]

# Convert each row of the table into an xarray Dataset with a `level` dimension:

In [13]:
def get_point(point_df,point_count,x_or_y=None):
    """Turn one table row into an xarray Dataset with dims (point, level).

    Relies on the module-level names ``ak``, ``bk``, ``vars_3d``, ``vars_2d``,
    ``Xvars_3d``, ``Xvars_2d``, ``Yvars_3d`` and ``Yvars_2d``.

    Parameters
    ----------
    point_df : pandas.DataFrame
        Exactly one row. Per-level columns are named ``<variable>_<level>``;
        other columns carry the plain variable name. Must contain
        ``state_ps``. The frame is not modified.
    point_count : hashable
        Label stored as the coordinate of the new ``point`` dimension.
    x_or_y : str, optional
        ``'x'`` returns the feature variables plus the derived ``state_p``;
        any other value returns the target variables.

    Returns
    -------
    xarray.Dataset
        One variable per selected name, each with ``p_levels`` entries along
        ``level``. Values from single-level columns are repeated on every
        level; levels of a 3-D variable that have no column stay 0.

    Raises
    ------
    ValueError
        If ``point_df`` does not contain exactly one row.
    """
    p0 = 100000    # multiplies the ak coefficients (presumably a reference pressure in Pa - confirm)
    p_levels = 60  # number of entries along 'level'

    if len(point_df) != 1:
        raise ValueError('get_point expects a single-row DataFrame, got {} rows'.format(len(point_df)))
    row = point_df.iloc[0]

    # 3-D variables: one zero-initialised profile per name, filled from the
    # columns whose name is exactly "<variable>_<level>".
    fields = {varn: np.zeros([p_levels]) for varn in vars_3d}
    for col in point_df.columns:
        base, _, lvl = str(col).rpartition('_')
        if base in fields and lvl.isnumeric():
            fields[base][int(lvl)] = row[col]

    # Pressure on every level: ak*p0 + bk*state_ps. The arithmetic is done on
    # the frame's own values so the result keeps the precision of the input.
    surface_p = point_df['state_ps'].values
    pressure = np.zeros([p_levels])
    for pp in range(p_levels):
        pressure[pp] = (float(ak[pp])*p0 + float(bk[pp])*surface_p)[0]
    fields['state_p'] = pressure

    # 2-D variables: fill 2-D data so it looks 3D for convolution.
    for col in vars_2d:
        if col in point_df.columns:
            fields[col] = np.ones([p_levels])*point_df[col].values

    # Build every DataArray first and merge once instead of once per variable.
    # NOTE(review): dims is passed as a dict as in the original code; xarray
    # appears to use only its key as the dimension name - confirm.
    arrays = [xr.DataArray(data=data,name=name,dims={'level':np.arange(0,p_levels)})
              for name,data in fields.items()]
    data_ds = xr.merge(arrays)

    data_ds = data_ds.expand_dims({'point':[point_count]})
    if x_or_y == 'x':
        data_ds = data_ds[Xvars_3d + Xvars_2d + ['state_p']]
    else:
        data_ds = data_ds[Yvars_3d + Yvars_2d]

    return(data_ds)

In [43]:
def process_data(df,get_y=True,save_dir='./',batch_num=None):
    """Convert every row of ``df`` with ``get_point`` and write the batch to NetCDF.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per point; each index label becomes that point's coordinate.
    get_y : bool, default True
        When True, targets are built as well and the files are named
        ``train_data_X_NNNN.nc`` / ``train_data_Y_NNNN.nc``. When False only
        ``test_data_X_NNNN.nc`` is written.
    save_dir : str, default './'
        Output directory; created if it does not exist.
    batch_num : int
        Batch number used for the ``NNNN`` part of the file names. Required
        despite the ``None`` default.

    Returns
    -------
    (X_ds, Y_ds) when ``get_y`` is True, otherwise X_ds.

    Raises
    ------
    TypeError
        If ``batch_num`` is None (checked before any processing is done).
    ValueError
        If ``df`` has no rows.
    """
    # Fail before the slow per-row work rather than at the final write.
    if batch_num is None:
        raise TypeError('batch_num must be an integer; it numbers the output files')
    if len(df.index) == 0:
        raise ValueError('df contains no rows to process')
    if save_dir:
        os.makedirs(save_dir,exist_ok=True)

    # Collect the per-row datasets and join them once at the end. Merging into
    # a growing dataset on every row re-aligns everything processed so far.
    X_points = []
    Y_points = []
    for loc in df.index:
        point_df = pd.DataFrame(df.loc[loc]).T
        X_points.append(get_point(point_df,loc,x_or_y='x'))
        if get_y:
            Y_points.append(get_point(point_df,loc,x_or_y='y'))

    # Every per-row dataset has a length-1 'point' dimension; concatenate in row order.
    X_ds = xr.concat(X_points,dim='point')

    if get_y:
        Y_ds = xr.concat(Y_points,dim='point')
        file_name = os.path.join(save_dir,'train_data_')
        X_ds.to_netcdf(file_name + 'X_{0:04d}.nc'.format(batch_num))
        Y_ds.to_netcdf(file_name + 'Y_{0:04d}.nc'.format(batch_num))
        return(X_ds,Y_ds)

    file_name = os.path.join(save_dir,'test_data_')
    X_ds.to_netcdf(file_name + 'X_{0:04d}.nc'.format(batch_num))
    return(X_ds)

# Process the data:

### Train data:

In [56]:
# Process the training CSV in batches of max_locs rows. A batch is skipped
# when both of its output files already exist.
max_locs = 5000
save_dir = os.path.join(data_dir,'processed_data')
for ss,skip_rows in enumerate(range(0,total_len,max_locs)):
    # Check both outputs; file_name is left pointing at the Y file.
    got_xy = []
    for dim in ['X','Y']:
        file_name = os.path.join(save_dir,'train_data_{0}_{1:04d}.nc'.format(dim,ss))
        got_xy.append(os.path.exists(file_name))

    if not all(got_xy):
        print('Loading data batch: {}/{}'.format(ss+1,int(total_len/max_locs)))
        load_s = time.time()
        # Keep the header (line 0) and skip the data rows of earlier batches.
        train_df = pd.read_csv(train_data_file,
                               engine='c',
                               header=0,
                               dtype=float32_cols,
                               nrows=max_locs,
                               skiprows=range(1,skip_rows+1))
        # Shift the index so labels equal the row position in the full file.
        train_df.index = train_df.index + skip_rows
        load_e = time.time()
        load_time = load_e - load_s
        print('\t\tLoaded in {}s.\n\t\tStart processing...'.format(load_time))
        # Drop the first column before converting the rows.
        train_df = train_df.iloc[:, 1:]
        last_X,last_Y = process_data(train_df,
                                     save_dir=save_dir,
                                     batch_num=ss)
        proc_e = time.time()
        proc_time = proc_e - load_e
        print('\t\tProcessed in {}s'.format(proc_time))
    # Printed for every batch, whether it was skipped or just processed.
    print('{} already generated'.format(file_name))

/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0000.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0001.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0002.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0003.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0004.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0005.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0006.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0007.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0008.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0009.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0010.nc already generated
/home/hawbecker/kaggle/LEAP/processed_data/train_data_Y_0011.nc already generated
/home/hawbecker/

### Test data

In [11]:
# Count the lines of the test file and subtract one, so a header line is not counted.
with open(test_data_file,'r') as f:
    total_len_test = sum(1 for _ in f) - 1

In [12]:
# Line count of the test file minus one (the counter started at -1).
total_len_test

625000

In [40]:
def get_point(point_df,point_count,x_or_y=None):
    """Turn one table row into an xarray Dataset with dims (point, level).

    This is a re-definition of ``get_point``: it replaces the definition from
    the earlier cell for everything executed after this cell. It relies on the
    module-level names ``ak``, ``bk``, ``vars_3d``, ``vars_2d``, ``Xvars_3d``,
    ``Xvars_2d``, ``Yvars_3d`` and ``Yvars_2d``.

    Parameters
    ----------
    point_df : pandas.DataFrame
        Exactly one row. Per-level columns are named ``<variable>_<level>``;
        other columns carry the plain variable name. Must contain
        ``state_ps``. The frame is not modified.
    point_count : hashable
        Label stored as the coordinate of the new ``point`` dimension.
    x_or_y : str, optional
        ``'x'`` returns the feature variables plus the derived ``state_p``;
        any other value returns the target variables.

    Returns
    -------
    xarray.Dataset
        One variable per selected name, each with ``p_levels`` entries along
        ``level``. Values from single-level columns are repeated on every
        level; levels of a 3-D variable that have no column stay 0.

    Raises
    ------
    ValueError
        If ``point_df`` does not contain exactly one row.
    """
    p0 = 100000    # multiplies the ak coefficients (presumably a reference pressure in Pa - confirm)
    p_levels = 60  # number of entries along 'level'

    if len(point_df) != 1:
        raise ValueError('get_point expects a single-row DataFrame, got {} rows'.format(len(point_df)))
    values = point_df.iloc[0]

    # 3-D variables. Columns are matched on the exact "<variable>_<level>"
    # pattern, so a name that merely contains a variable name (such as
    # state_ps) can never be picked up by accident.
    columns_3d = {varn: np.zeros([p_levels]) for varn in vars_3d}
    for col in point_df.columns:
        base, _, lvl = str(col).rpartition('_')
        if base in columns_3d and lvl.isnumeric():
            columns_3d[base][int(lvl)] = values[col]

    # Derived pressure profile: ak*p0 + bk*state_ps, computed on the frame's
    # own values so the result keeps the precision of the input.
    surface_p = point_df['state_ps'].values
    state_p = np.zeros([p_levels])
    for pp in range(p_levels):
        state_p[pp] = (float(ak[pp])*p0 + float(bk[pp])*surface_p)[0]

    # 2-D variables: fill 2-D data so it looks 3D for convolution.
    columns_2d = {col: np.ones([p_levels])*point_df[col].values
                  for col in vars_2d if col in point_df.columns}

    # One DataArray per variable, merged in a single call.
    # NOTE(review): dims is passed as a dict as in the original code; xarray
    # appears to use only its key as the dimension name - confirm.
    named_data = list(columns_3d.items()) + [('state_p',state_p)] + list(columns_2d.items())
    data_ds = xr.merge([xr.DataArray(data=data,name=name,dims={'level':np.arange(0,p_levels)})
                        for name,data in named_data])

    data_ds = data_ds.expand_dims({'point':[point_count]})
    if x_or_y == 'x':
        data_ds = data_ds[Xvars_3d + Xvars_2d + ['state_p']]
    else:
        data_ds = data_ds[Yvars_3d + Yvars_2d]

    return(data_ds)

In [42]:
# Debug check: show the output path most recently assigned by a processing loop.
file_name

'/home/hawbecker/kaggle/LEAP/processed_data/test_data_X_0000.nc'

In [47]:
# Process the test CSV in batches of max_locs rows. A batch is skipped when
# its output file already exists.
max_locs = 5000
# Iterate over the length of the *test* file. The loop previously used
# total_len (the training row count) while reporting progress against
# total_len_test.
for ss,skip_rows in enumerate(range(0,total_len_test,max_locs)):
    got_x = [False]
    for dd,dim in enumerate(['X']):
        save_dir = os.path.join(data_dir,'processed_data')
        file_name = os.path.join(save_dir,'test_data_{0}_{1:04d}.nc'.format(dim,ss))
        if os.path.exists(file_name):
            got_x[dd] = True
    if not np.all(got_x):
        print('Loading data batch: {}/{}'.format(ss+1,int(total_len_test/max_locs)))
        load_s = time.time()
        # Keep the header (line 0) and skip the data rows of earlier batches.
        test_df = pd.read_csv(test_data_file, engine='c',
                              header=0,
                              dtype=float32_cols,
                              nrows=max_locs,
                              skiprows=range(1,skip_rows+1))
        # Shift the index so labels equal the row position in the full file.
        test_df.index += skip_rows
        load_e = time.time()
        load_time = load_e - load_s
        print('\t\tLoaded in {}s.\n\t\tStart processing...'.format(load_time))
        # Drop the first column before converting the rows.
        test_df = test_df[test_df.columns[1:]]
        last_X = process_data(test_df,
                              get_y=False,
                              save_dir=os.path.join(data_dir,'processed_data'),
                              batch_num=ss)
        proc_e = time.time()
        proc_time = proc_e - load_e
        print('\t\tProcessed in {}s'.format(proc_time))
    # Printed for every batch, whether it was skipped or just processed.
    print('{} already generated'.format(file_name))

/home/hawbecker/kaggle/LEAP/processed_data/test_data_X_0000.nc already generated
Loading data batch: 2/125
		Loaded in 0.43969178199768066s.
		Start processing...
		Processed in 364.28883719444275s
/home/hawbecker/kaggle/LEAP/processed_data/test_data_X_0001.nc already generated
Loading data batch: 3/125
		Loaded in 0.5694880485534668s.
		Start processing...
		Processed in 346.95068645477295s
/home/hawbecker/kaggle/LEAP/processed_data/test_data_X_0002.nc already generated
Loading data batch: 4/125
		Loaded in 0.7698884010314941s.
		Start processing...
		Processed in 344.6807074546814s
/home/hawbecker/kaggle/LEAP/processed_data/test_data_X_0003.nc already generated
Loading data batch: 5/125
		Loaded in 0.7953975200653076s.
		Start processing...
		Processed in 345.64590096473694s
/home/hawbecker/kaggle/LEAP/processed_data/test_data_X_0004.nc already generated
Loading data batch: 6/125
		Loaded in 0.9725310802459717s.
		Start processing...
		Processed in 343.4448938369751s
/home/hawbecker/

In [45]:
# Debug check: show the output path most recently assigned by a processing loop.
file_name

'/home/hawbecker/kaggle/LEAP/processed_data/test_data_X_0001.nc'