In [14]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
torch.cuda.is_available()

True

In [8]:
# load data
filepath = '/glade/work/psturm/ice-mp-su24/saved_trajectory_data/trajs_5100_7200_Ns10000.csv' # obin's latest csv with combined data
df = pd.read_csv(filepath)
# sort df
sort_cols = ['time', 'rk_deact']
df_sort = df.sort_values(by=sort_cols)
df_sort['volume[m3]'] = (4/3)*np.pi*(df_sort['radius_eq(ice)[m]'])**3
df_sort['mass[kg]'] = df_sort['volume[m3]']*df_sort['density(droplet/ice)[kg/m3]']
# df of current timestep
df1 = df_sort 
n_unique_ids = len(df1['rk_deact'].unique())
len_df = len(df1) - n_unique_ids
df1 = df_sort[:len_df]
df1.reset_index(inplace=True)
# df of next timestep
df2 = df_sort
df2 = df2[n_unique_ids:]
df2.reset_index(inplace=True)
# join dataframes
df = df1.join(df2, lsuffix='_1', rsuffix='_2')
df['delta_mass'] = df['mass[kg]_2'] - df['mass[kg]_1'] # add change in mass
df['delta_r'] = df['radius_eq(ice)[m]_2'] - df['radius_eq(ice)[m]_1'] # add change in radius
# split into training and test data
ids = df['rk_deact_1'].unique()
ids_train, ids_test = train_test_split(ids, test_size=0.2, random_state=666)
df_train = df[df['rk_deact_1'].isin(ids_train)]
df_test = df[df['rk_deact_1'].isin(ids_test)]

In [9]:
df_train

Unnamed: 0,level_0_1,Unnamed: 0_1,rk_deact_1,x[m]_1,y[m]_1,z[m]_1,vz[m]_1,radius(droplet)[m]_1,mass_of_aerosol_in_droplet/ice(1:01)[g]_1,radius_eq(ice)[m]_1,...,out14_2,deactrat_2,T [K]_2,RH_ice_2,RH_liquid_2,RH_diff_2,volume[m3]_2,mass[kg]_2,delta_mass,delta_r
0,4360,4360,13957,187.02308,131.37815,9362.9834,0.0,0.0,1.048439e-14,0.000013,...,14.001214,-6.621117e-09,232.110924,0.997001,0.664628,0.008388,8.598998e-15,3.603907e-12,-1.909286e-13,-2.205870e-07
1,4361,4361,150328,280.41348,123.98104,9479.3545,0.0,0.0,1.776257e-14,0.000021,...,9.352058,-2.457631e-09,231.159666,1.000650,0.660578,0.008478,3.841255e-14,1.376532e-11,-2.537302e-14,-7.098000e-09
2,4362,4362,206362,362.41535,226.61712,9679.0052,0.0,0.0,2.233854e-14,0.000026,...,6.072549,1.721107e-09,229.309070,1.003703,0.650070,0.008626,7.294642e-14,1.544887e-11,3.051831e-14,1.873300e-08
4,4364,4364,218111,123.32099,369.23415,9486.0378,0.0,0.0,8.156556e-14,0.000018,...,6.739596,-2.079506e-09,231.189258,1.001454,0.661310,0.008486,2.371593e-14,7.891735e-12,-4.648998e-14,-3.493000e-08
5,4365,4365,245083,169.88960,2.44935,9368.9605,0.0,0.0,3.741452e-14,0.000021,...,12.275088,-1.745512e-08,232.103511,0.993846,0.662474,0.008361,3.610925e-14,1.107960e-11,-6.131562e-13,-3.714710e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347055,343778,343778,1015230609,11655.36400,11895.40400,9597.7637,0.0,0.0,1.902701e-14,0.000022,...,13.616765,-6.029041e-08,229.940011,0.983420,0.641098,0.008381,3.787968e-14,7.576020e-12,-1.412306e-12,-1.221584e-06
347056,343779,343779,1015266378,11743.64800,11799.43100,9758.7617,0.0,0.0,5.646258e-15,0.000013,...,5.706291,-3.655712e-08,228.075858,0.989389,0.632679,0.008552,7.778861e-15,1.555800e-12,-2.521423e-13,-6.310640e-07
347057,343780,343780,1015305678,11771.92800,11686.20800,8977.0945,0.0,0.0,1.904751e-13,0.000087,...,0.015625,1.539362e-09,235.261644,1.017356,0.700312,0.008286,2.799120e-12,5.628925e-10,1.454879e-11,7.641930e-07
347058,340080,340080,1015329135,11473.41500,11976.99200,9335.0350,0.0,0.0,8.063607e-14,0.000056,...,19.641491,7.995893e-09,231.851583,1.010303,0.671707,0.008491,7.223066e-13,1.444621e-10,-1.412450e-12,-1.808140e-07


In [13]:
test = df_train[df_train['rk_deact_1'] == 13957]
test.shape

(35, 90)

In [15]:
# set up data
X_cols = ['RH_ice_1', 'mass[kg]_1', 'radius_eq(ice)[m]_1', 'density(droplet/ice)[kg/m3]_1',
         'rhod [kg/m3]_1', 'prs_1', 'qv_1', 'T [K]_1']
y_cols = ['delta_mass']
X_train = test[X_cols]
y_train = test[y_cols]
# remove suffix from column names
X_train.columns = X_train.columns.str.replace('_1', '')
# X_test.columns = X_test.columns.str.replace('_1', '')
# Standardize data
y_scaler = StandardScaler().set_output(transform='pandas').fit(y_train)
X_scaler = StandardScaler().set_output(transform='pandas').fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
y_train_scaled = y_scaler.transform(y_train)
# X_test_scaled = X_scaler.transform(X_test)

In [36]:
X_temp = X_train_scaled.reset_index()
y_temp = y_train_scaled.reset_index().values

In [39]:
# split a multivariate sequence past, future samples (X and y)
def split_sequences(input_sequences, output_sequence, n_steps_in, n_steps_out):
    X, y = list(), list() # instantiate X and y
    for i in range(len(input_sequences)):
        # find the end of the input, output sequence
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out - 1
        # check if we are beyond the dataset
        if out_end_ix > len(input_sequences): break
        # gather input and output of the pattern
        seq_x, seq_y = input_sequences[i:end_ix], output_sequence[end_ix-1:out_end_ix, -1]
        X.append(seq_x), y.append(seq_y)
    return np.array(X), np.array(y)

X_ss, y_mm = split_sequences(X_temp, y_temp, 5, 1)
print(X_ss.shape, y_mm.shape)

(31, 5, 9) (31, 1)


In [42]:
df_train.columns

Index(['level_0_1', 'Unnamed: 0_1', 'rk_deact_1', 'x[m]_1', 'y[m]_1', 'z[m]_1',
       'vz[m]_1', 'radius(droplet)[m]_1',
       'mass_of_aerosol_in_droplet/ice(1:01)[g]_1', 'radius_eq(ice)[m]_1',
       'radius_pol(ice)[m]_1', 'density(droplet/ice)[kg/m3]_1',
       'rhod [kg/m3]_1', 'multiplicity[-]_1', 'status[-]_1', 'index_1',
       'rime_mass[kg]_1', 'num_of_monomers[-]_1', 'time_1', 'xi gridbox_1',
       'yk gridbox_1', 'zh gridbox_1', 'time index_1', 'rh_1', 'th_1', 'prs_1',
       'qv_1', 'uinterp_1', 'vinterp_1', 'winterp_1', 'out8_1', 'out9_1',
       'out10_1', 'out11_1', 'out12_1', 'out13_1', 'out14_1', 'deactrat_1',
       'T [K]_1', 'RH_ice_1', 'RH_liquid_1', 'RH_diff_1', 'volume[m3]_1',
       'mass[kg]_1', 'level_0_2', 'Unnamed: 0_2', 'rk_deact_2', 'x[m]_2',
       'y[m]_2', 'z[m]_2', 'vz[m]_2', 'radius(droplet)[m]_2',
       'mass_of_aerosol_in_droplet/ice(1:01)[g]_2', 'radius_eq(ice)[m]_2',
       'radius_pol(ice)[m]_2', 'density(droplet/ice)[kg/m3]_2',
       'rhod

In [45]:
len(df_train['rk_deact_1'].unique())

7932