# Train-val-test split

In [1]:
import os

import numpy as np

In [2]:
# Locations of the preprocessed volume / label files, grouped per dataset.
# Every path is derived from one shared root so it only has to change in one place.
_PREPROCESSED_ROOT = '../data/preprocessed'

# f3_interpretation dataset
_INTERPRETATION_ROOT = f'{_PREPROCESSED_ROOT}/f3_interpretation'
INTERPRETATION_VOL_PATH_PREPROCESSED = f'{_INTERPRETATION_ROOT}/inline_vol.npy'
INTERPRETATION_LABEL_PATH_PREPROCESSED = f'{_INTERPRETATION_ROOT}/inline_label.npy'

# faciesmark dataset (the input files live in the `raw` sub-folder)
_FACIESMARK_RAW_ROOT = f'{_PREPROCESSED_ROOT}/faciesmark/raw'
FACIESMARK_VOL_PATH_PREPROCESSED = f'{_FACIESMARK_RAW_ROOT}/seismic_entire_volume.npy'
FACIESMARK_LABEL_PATH_PREPROCESSED = f'{_FACIESMARK_RAW_ROOT}/labels_entire_volume.npy'

# stdata12 dataset
_STDATA_ROOT = f'{_PREPROCESSED_ROOT}/stdata12'
STDATA_VOL_PATH_PREPROCESSED = f'{_STDATA_ROOT}/stdata_12_amplitude.npy'
STDATA_LABEL_PATH_PREPROCESSED = f'{_STDATA_ROOT}/stdata_12_labels.npy'

In [3]:

def summary_stats(vol):
    """Print a short statistical summary of an array to stdout.

    The report lists, in order: shape, mean, standard deviation, minimum,
    median, the 99th percentile of the absolute values, maximum and the
    number of NaN entries of ``vol``. Nothing is returned.
    """
    report = (
        ('Shape', vol.shape),
        ('Mean', np.mean(vol)),
        ('Stdev', np.std(vol)),
        ('Min', np.min(vol)),
        ('Median', np.median(vol)),
        ('99th Percentile of Abs', np.quantile(np.abs(vol), 0.99)),
        ('Max', np.max(vol)),
        ('Null values', np.sum(np.isnan(vol))),
    )
    rows = [f'    {label} : {value}' for label, value in report]

    # The report is framed by one leading blank line and two trailing
    # whitespace-only lines.
    print('\n'.join(['', *rows, '    ', '    ']))



In [4]:
# Default (train, val, test) fractions of the first axis handed to split_dset.
split_factor = [0.8, 0.1, 0.1]


def split_dset(vol, label, split_factor):
    """Split a volume and its labels into contiguous train/val/test parts.

    The split is made along the first axis, in order: the first block of
    lines is the training set, the next block the validation set, and
    everything that remains is the test set. A summary of the resulting
    shapes is printed.

    Parameters
    ----------
    vol : array with a ``shape`` attribute, sliceable along axis 0
        Data volume to split.
    label : array with a ``shape`` attribute, sliceable along axis 0
        Labels for ``vol``; must have the same length along axis 0.
    split_factor : sequence of float
        ``split_factor[0]`` is the training fraction and ``split_factor[1]``
        the validation fraction. Further entries are not read: the test set
        always receives the remaining lines, so rounding never drops a line.

    Returns
    -------
    tuple
        ``(train_dset, train_labels, val_dset, val_labels, test_dset,
        test_labels)``. These are slices of the inputs (for NumPy arrays,
        basic slicing returns views rather than copies).

    Raises
    ------
    ValueError
        If ``vol`` and ``label`` differ in length along axis 0, or if the
        train/val fractions are negative or sum to more than 1.
    """
    nlines = vol.shape[0]

    # Mismatched inputs would otherwise be sliced silently into splits whose
    # volume and label parts no longer line up.
    if label.shape[0] != nlines:
        raise ValueError(
            f'vol and label must have the same number of lines along axis 0, '
            f'got {nlines} and {label.shape[0]}'
        )

    train_frac, val_frac = split_factor[0], split_factor[1]
    # Negative fractions turn into negative slice indices (wrap-around) and
    # over-unity fractions yield empty splits; reject both up front. The small
    # tolerance absorbs floating-point noise in sums such as 0.7 + 0.3.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError(
            f'train/val fractions must be non-negative and sum to at most 1, '
            f'got {train_frac} and {val_frac}'
        )

    train_inlines_end = int(train_frac * nlines)
    val_inlines_end = train_inlines_end + int(nlines * val_frac)

    train_dset = vol[:train_inlines_end]
    train_labels = label[:train_inlines_end]

    val_dset = vol[train_inlines_end:val_inlines_end]
    val_labels = label[train_inlines_end:val_inlines_end]

    # The test split takes whatever is left after train and val.
    test_dset = vol[val_inlines_end:]
    test_labels = label[val_inlines_end:]

    print(f'''
    Dataset splits : 
    Train Vol : {train_dset.shape}
    Train_Labels : {train_labels.shape}

    Val Vol : {val_dset.shape}
    Val Labels : {val_labels.shape}

    Test Vol : {test_dset.shape}
    Test Labels : {test_labels.shape}
    ''')

    return train_dset, train_labels, val_dset, val_labels, test_dset, test_labels




## INTERPRETATION

In [5]:
# Read the interpretation volume and its labels from disk (volume first,
# labels second), then print the volume's summary statistics.
vol_int, label_int = (
    np.load(path)
    for path in (
        INTERPRETATION_VOL_PATH_PREPROCESSED,
        INTERPRETATION_LABEL_PATH_PREPROCESSED,
    )
)

summary_stats(vol_int)


    Shape : (601, 951, 361)
    Mean : 0.002223885850980878
    Stdev : 0.1940867155790329
    Min : -0.9734795689582825
    Median : 0.0035521062090992928
    99th Percentile of Abs : 0.43072309792041796
    Max : 1.0
    Null values : 0
    
    


In [6]:
# Split the interpretation volume/labels using the notebook-level split_factor.
(
    train_dset, train_labels,
    val_dset, val_labels,
    test_dset, test_labels,
) = split_dset(vol_int, label_int, split_factor)


    Dataset splits : 
    Train Vol : (480, 951, 361)
    Train_Labels : (480, 951, 361)

    Val Vol : (60, 951, 361)
    Val Labels : (60, 951, 361)

    Test Vol : (61, 951, 361)
    Test Labels : (61, 951, 361)
    


In [7]:
# `os` is imported in the notebook's top import cell so that every cell that
# builds output paths works regardless of whether this cell has been run.

# Directory that receives the interpretation train/val/test splits.
INTERPRETATION_DIR = '../data/preprocessed/f3_interpretation'

# (file name, array) pairs, written in train -> val -> test order.
interpretation_outputs = [
    ('train_inline_vol.npy', train_dset),
    ('train_inline_labels.npy', train_labels),
    ('val_inline_vol.npy', val_dset),
    ('val_inline_labels.npy', val_labels),
    ('test_inline_vol.npy', test_dset),
    ('test_inline_labels.npy', test_labels),
]

for filename, array in interpretation_outputs:
    np.save(os.path.join(INTERPRETATION_DIR, filename), array)

## FACIESMARK

In [11]:
# Read the FaciesMark volume and its labels from disk (volume first, labels second).
vol_f, label_f = (
    np.load(path)
    for path in (
        FACIESMARK_VOL_PATH_PREPROCESSED,
        FACIESMARK_LABEL_PATH_PREPROCESSED,
    )
)

# (train, val, test) fractions for this dataset; rebinds the notebook-level name.
split_factor = [0.8, 0.1, 0.1]

summary_stats(vol_f)

(
    train_dset, train_labels,
    val_dset, val_labels,
    test_dset, test_labels,
) = split_dset(vol_f, label_f, split_factor)

# Directory that receives the FaciesMark train/val/test splits.
FACIESMARK_DIR = '../data/preprocessed/faciesmark'

# (file name, array) pairs, written in train -> val -> test order.
faciesmark_outputs = [
    ('train_inline_vol.npy', train_dset),
    ('train_inline_labels.npy', train_labels),
    ('val_inline_vol.npy', val_dset),
    ('val_inline_labels.npy', val_labels),
    ('test_inline_vol.npy', test_dset),
    ('test_inline_labels.npy', test_labels),
]

for filename, array in faciesmark_outputs:
    np.save(os.path.join(FACIESMARK_DIR, filename), array)


    Shape : (599, 901, 255)
    Mean : 0.0017854898587952805
    Stdev : 0.2110130278058498
    Min : -1.0
    Median : 0.006975908663622733
    99th Percentile of Abs : 0.759497063116162
    Max : 1.0
    Null values : 0
    
    

    Dataset splits : 
    Train Vol : (479, 901, 255)
    Train_Labels : (479, 901, 255)

    Val Vol : (59, 901, 255)
    Val Labels : (59, 901, 255)

    Test Vol : (61, 901, 255)
    Test Labels : (61, 901, 255)
    


## STDATA-12 


In [9]:
# Read the STData-12 volume and its labels from disk (volume first, labels second).
vol_ST, label_ST = (
    np.load(path)
    for path in (
        STDATA_VOL_PATH_PREPROCESSED,
        STDATA_LABEL_PATH_PREPROCESSED,
    )
)

summary_stats(vol_ST)

# This dataset uses a 50/25/25 split instead of the 80/10/10 used above.
# NOTE(review): this rebinds the notebook-level `split_factor`, so any split
# cell re-run after this one will pick up these fractions — consider a
# dataset-specific name.
split_factor = [0.5, 0.25, 0.25]

(
    train_dset, train_labels,
    val_dset, val_labels,
    test_dset, test_labels,
) = split_dset(vol_ST, label_ST, split_factor)

# Directory that receives the STData-12 train/val/test splits.
STDATA_DIR = '../data/preprocessed/stdata12'

# (file name, array) pairs, written in train -> val -> test order.
stdata_outputs = [
    ('train_inline_vol.npy', train_dset),
    ('train_inline_labels.npy', train_labels),
    ('val_inline_vol.npy', val_dset),
    ('val_inline_labels.npy', val_labels),
    ('test_inline_vol.npy', test_dset),
    ('test_inline_labels.npy', test_labels),
]

for filename, array in stdata_outputs:
    np.save(os.path.join(STDATA_DIR, filename), array)


    Shape : (4, 951, 362)
    Mean : 0.0023937253281474113
    Stdev : 0.23796683549880981
    Min : -0.9788158535957336
    Median : 0.0
    99th Percentile of Abs : 0.5281193411350251
    Max : 1.0
    Null values : 0
    
    

    Dataset splits : 
    Train Vol : (2, 951, 362)
    Train_Labels : (2, 951, 362)

    Val Vol : (1, 951, 362)
    Val Labels : (1, 951, 362)

    Test Vol : (1, 951, 362)
    Test Labels : (1, 951, 362)
    
