# Load, preprocess, and save train and test data

This notebook preprocesses and collates the training and testing data for model creation.

# John Brandt
# July 11, 2021

- Fuse Sentinel 1/2 data
- Reconstruct 2D-array from CEO output CSV by plot
- Match sentinel data to CEO labels
- Stack data_x, data_y, length
- Save arrays for data_x, data_y, length


# Package imports and source code

In [1]:
from tqdm import tqdm_notebook, tnrange
import pandas as pd
import numpy as np
from random import shuffle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import os
import random
import itertools
from scipy.ndimage import median_filter
import hickle as hkl

# NOTE(review): presumably set so that loading more than one copy of the
# Intel OpenMP runtime does not abort the kernel -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

%run ../src/preprocessing/slope.py

In [2]:
def reconstruct_images(plot_id):
    '''Rebuild the 2D label grid of one plot from the global label table.

       Reads the module-level pd.DataFrame ``df`` (one row per labelled
       sample), subsets it to ``plot_id`` and arranges the ``TREE`` values
       into rows: one row per unique latitude, highest latitude first, with
       the samples inside each row ordered by ascending longitude.

        Parameters:
          plot_id: value matched against the 'PLOT_ID' column of ``df``

         Returns:
          rows (list): list of lists of 'TREE' values, one inner list per
                       unique 'LAT'. A complete plot is expected to give a
                       (14, 14) array-like list; an unknown plot ID gives
                       an empty list.
    '''
    subs = df[df['PLOT_ID'] == plot_id]
    rows = []
    # Descending latitude so that row 0 is the top (northernmost) image row
    for lat in sorted(subs['LAT'].unique(), reverse = True):
        subs_lat = subs[subs['LAT'] == lat].sort_values('LON', axis = 0)
        rows.append(list(subs_lat['TREE']))
    return rows

In [7]:
# Which split to process; every directory below is derived from this name
source = 'test'
sentinel_1 = True

# Input directories (Sentinel-2, Sentinel-1, label CSVs, elevation) and output
s2_path = f"../data/{source}-s2/"
s1_path = f"../data/{source}-s1/"
csv_path = f"../data/{source}-csv/retest/"
output_path = f"../data/{source}-processed/"
dem_path = f"../data/{source}-dem/"

#s2_path = '../data/plantation/train-s2/'
#s1_path = '../data/plantation/train-s1/'
#dem_path = '../data/plantation/train-dem/'
#csv_path = "../data/plantation/train-csv/new/"

In [8]:
# Load and edit bad plot ids if needed
# The list is read from and written back to "bad_plot_ids.npy" in the working
# directory, so re-running this cell is expected to be idempotent.
verified_lu_change = np.load("bad_plot_ids.npy")

# IDs to add; anything already recorded is skipped so no duplicates are stored
to_add = [141238348]
to_add = [x for x in to_add if x not in verified_lu_change]
# Build the addition with the dtype of the stored IDs. A bare np.array([])
# is float64, so once every ID in to_add is already on file the concatenation
# would silently turn the stored integer IDs into floats.
verified_lu_change = np.concatenate([verified_lu_change,
                     np.array(to_add, dtype = verified_lu_change.dtype).flatten()])

# IDs to take back out of the list
to_remove = []

verified_lu_change = [x for x in verified_lu_change if x not in to_remove]
np.save("bad_plot_ids.npy", np.array(verified_lu_change))
print(len(verified_lu_change))

2328


In [9]:
# Plot IDs that are skipped when selecting which plots to load
bad_test_plots = [
    10048, 10052, 10084, 20026, 20047, 20079,
    20091, 100111, 100120, 100191, 100209, 100213,
    100216, 200101, 139190217, 139270445, 150027, 150051,
    150057, 200187, 1500180, 136776649, 136776650, 139190100,
    139190109, 139190113, 139190268, 139190330, 139190396, 139190452,
    139190506, 139190534, 139190803, 139190811, 139190892, 139190900,
    139190903, 139190954, 139191025, 139191125, 139191502, 139191557,
    139191574, 139252935, 139264527, 139264598, 139270017, 139270025,
    139270222, 139270102, 139270307, 139270436, 139270494, 139270542,
]

In [13]:
# For either train or test data, loop through each plot and determine whether there is
# labelled Y data for it -- returning one dataframe for the entire data set
#
# Reads every CSV under csv_path, normalises the column names, keeps only the
# columns in cols_to_keep, concatenates the result into the global `df`, and
# derives the sorted list of unique plot IDs (`plot_ids`).
import re 

# Columns retained from each CSV; every other column is dropped in the loop below
cols_to_keep = ['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'ANALYSES', 'USER_ID',
       'COLLECTION_TIME', 'ANALYSIS_DURATION', 'TREE', 'plotid', 'sampleid', 'PLOT_FNAME', 'PLANTATION']
# Any directory entry whose name contains ".csv", in sorted order
csvs = [x for x in sorted(os.listdir(csv_path)) if ".csv" in x]
#csvs = [x for x in csvs if 'uuid'in x]
# Same ".csv" filter as above; it has no further effect on the list
csvs = [x for x in csvs if ".csv" in x]
#csvs = [x for x in csvs if "chaco" in x]
#csvs = [x for x in csvs if "senegal" not in x]

print(csvs)

dfs = []
for i in csvs:
    df = pd.read_csv(csv_path + i, encoding = "ISO-8859-1")
    # 196 == 14 * 14; NOTE(review): presumably the number of samples per plot,
    # which would make this the plot count of the file -- confirm
    print(i, len(df) / 196)
    # Strip every non-word character from the header names
    df.columns = [re.sub(r'\W+', '', x) for x in df.columns]
    # NOTE(review): the leading 'ï' looks like the remains of a UTF-8 byte-order
    # mark decoded as ISO-8859-1 -- confirm against the raw files
    df.rename(columns={'ïplotid':'plotid'}, inplace=True)
    # Upper-case all names, then map PLOTID / SAMPLEID to the underscored forms
    df.columns = [x.upper() for x in df.columns]
    df.columns = ['PLOT_ID' if x == 'PLOTID' else x for x in df.columns]
    print(df.head(5))
    #df = df.drop('PLOT_ID', axis = 1)
    df.columns = ['SAMPLE_ID' if x == 'SAMPLEID' else x for x in df.columns]
    # Whatever the first column is called at this point, name it PLOT_ID
    df = df.rename(columns={df.columns[0]: 'PLOT_ID'})
    #df = df[df['LAT'] > -24]
    #df = df[df['LAT'] < 24]
    #df = df.reset_index()
    #print(i, len(df) / 196)
    # Files without any rows are skipped entirely
    if len(df) > 0:
        #print(df.columns)
    # If there are no unique IDs already, go ahead and assign them
        
       # if abs(df['PLOT_ID'][0]) == 1:
        #    print(df['PLOT_ID'][0])
        #    print(f"No unique ID for {i}")
        #    for index, row in df.iterrows():
        #        row['PLOT_ID'] = abs(row['PLOT_ID'])
        #        df['PLOT_ID'][index] = str(i[-6:-4]).zfill(2) + '00' + str(row['PLOT_ID'])
        
        # Drop every column that is not listed in cols_to_keep
        for column in df.columns:
            if column not in cols_to_keep:
                df = df.drop(column, axis = 1)
        #df.reset_index(inplace=True, drop=True)
        print(df.columns)
        #df['country'] = i.split(".")[0]
        #df.to_csv(csv_path + i, index = False)
        dfs.append(df)

# One table for all files; sort = True orders the columns alphabetically
df = pd.concat(dfs, ignore_index = True, sort = True)
print(len(df) // 196)
# Keep only the samples that actually carry a TREE label
df = df[~pd.isna(df['TREE'])]
print(len(df) // 196)

plot_ids = sorted(df['PLOT_ID'].unique())
# Second name for the same list object (not a copy)
plot_ids_loaded = plot_ids

print(f"There are {len(plot_ids)} plots")

['ceo-final-fix-sample-data-2021-08-06.csv', 'ceo-global-uuid-0.csv', 'ceo-global-uuid-1.csv', 'ceo-global-uuid-12.csv', 'ceo-global-uuid-2.csv', 'ceo-global-uuid-3.csv', 'ceo-global-uuid-4.csv', 'ceo-global-uuid-5.csv', 'ceo-global-uuid-6.csv', 'ceo-global-uuid-7.csv', 'ceo-global-uuid-8.csv', 'ceo-global-uuid-9.csv', 'ceo-kenya_shrubs-sample-data-2021-09-02.csv', 'ceo-kenya_shrubs2-sample-data-2021-09-14.csv', 'ceo-kenya_tree-sample-data-2021-09-02.csv', 'ceo-senegal-2-sample-data-2021-08-09.csv', 'ceo-senegal-train-sample-data-2021-08-09.csv', 'ceo-train-july-0.csv', 'ceo-train-july-1-sample-data-2021-07-22.csv', 'ceo-train-july-2-sample-data-2021-07-22.csv', 'ceo-train-july-3-sample-data-2021-07-22.csv']
ceo-final-fix-sample-data-2021-08-06.csv 63.0
     PLOT_ID  SAMPLE_ID        LON       LAT  FLAGGED   COLLECTION_TIME  \
0  141237861  571918843 -80.505604 -2.197160    False  2021-08-06 17:42   
1  141237861  571918844 -80.505604 -2.197071    False  2021-08-06 17:42   
2  14123786

ceo-global-uuid-9.csv 208.0
   PLOT_ID  SAMPLE_ID        LON        LAT FLAGGED   COLLECTION_TIME  \
0     9001          1  80.650456  35.858964   False  2022-01-13 18:06   
1     9001          2  80.650456  35.859036   False  2022-01-13 18:06   
2     9001          3  80.650456  35.859109   False  2022-01-13 18:06   
3     9001          4  80.650456  35.859182   False  2022-01-13 18:06   
4     9001          5  80.650456  35.859255   False  2022-01-13 18:06   

  ANALYSIS_DURATION  TREE            COUNTRY  
0         59.9 secs   0.0  ceo-global-uuid-9  
1         59.9 secs   0.0  ceo-global-uuid-9  
2         59.9 secs   0.0  ceo-global-uuid-9  
3         59.9 secs   0.0  ceo-global-uuid-9  
4         59.9 secs   0.0  ceo-global-uuid-9  
Index(['PLOT_ID', 'SAMPLE_ID', 'LON', 'LAT', 'FLAGGED', 'COLLECTION_TIME',
       'ANALYSIS_DURATION', 'TREE'],
      dtype='object')
ceo-kenya_shrubs-sample-data-2021-09-02.csv 177.0
     PLOT_ID  SAMPLE_ID        LON       LAT  FLAGGED   COLLECTION_

2825
There are 2826 plots


In [15]:
# Display the last 100 (i.e. largest, since plot_ids is sorted) plot IDs
plot_ids[-100:]

[141512959,
 141512960,
 141512961,
 141512962,
 141512963,
 141512964,
 141512965,
 141512966,
 141512968,
 141512971,
 141512972,
 141512973,
 141512974,
 141512975,
 141512976,
 141512977,
 141512978,
 141512979,
 141512980,
 141512981,
 141512982,
 141512983,
 141512984,
 141512985,
 141512986,
 141512987,
 141512988,
 141512989,
 141512990,
 141512991,
 141512992,
 141512993,
 141512994,
 141512995,
 141512996,
 141512997,
 141512998,
 141512999,
 141513000,
 141513001,
 141513002,
 141513003,
 141513004,
 141513005,
 141513006,
 141513007,
 141513008,
 141513009,
 141513010,
 141513011,
 141513013,
 141513014,
 141513015,
 141513016,
 141513017,
 141513018,
 141513019,
 141513021,
 141513022,
 141513023,
 141513024,
 141513025,
 141513026,
 141513027,
 141513028,
 141513029,
 141513030,
 141513031,
 141513033,
 141513034,
 141513035,
 141513036,
 141513037,
 141513038,
 141513039,
 141513040,
 141513041,
 141513042,
 141513043,
 141513044,
 141513045,
 141513046,
 141513047,
 141

In [21]:
#df.to_csv("tml-india-train-plots.csv")

In [16]:
def to_int16(array: np.ndarray) -> np.ndarray:
    '''Quantise values in [0, 1] to unsigned 16-bit integers.

    Despite the name, the result is np.uint16: each value is scaled by 65535
    and truncated towards zero. Compared with float32 input this halves the
    storage needed.

    Parameters:
      array (arr): numeric array whose values all lie in [0, 1]

    Returns:
      (arr): np.uint16 array of the same shape, in the range [0, 65535]

    Raises:
      AssertionError: if any input value lies outside [0, 1] (this includes
                      NaN, which fails both range comparisons)
    '''
    lowest, highest = np.min(array), np.max(array)
    assert lowest >= 0, lowest
    assert highest <= 1, highest

    scaled = np.trunc(np.clip(array, 0, 1) * 65535)
    # Check the extreme values themselves. The previous form compared first and
    # reduced afterwards, so the upper bound passed if any one element was in range.
    assert np.min(scaled) >= 0
    assert np.max(scaled) <= 65535

    return scaled.astype(np.uint16)

def process_dem(dem):
    '''Turn a padded elevation tile into a smoothed, rescaled slope layer.

    Steps, in order: size-5 median filter, slope via ``calcSlope`` (brought in
    by the %run of ../src/preprocessing/slope.py), division by 90, removal of
    the one-pixel border, a second size-5 median filter and a two-pixel crop.

    Parameters:
      dem (arr): elevation values holding exactly 34 * 34 elements
                 (the filtered tile is reshaped to (1, 34, 34))

    Returns:
      (arr): array of shape (28, 28, 1)
    '''
    side = 32 + 2
    smoothed = median_filter(dem, size = 5)
    # The two constant grids of 10 are passed as separate arrays, as before.
    # NOTE(review): presumably the x / y cell size in metres -- confirm in slope.py
    slope = calcSlope(smoothed.reshape((1, side, side)),
                      np.full((side, side), 10),
                      np.full((side, side), 10),
                      zScale = 1, minSlope = 0.02)
    # NOTE(review): dividing by 90 presumably maps degrees onto [0, 1] -- confirm
    slope = (slope / 90).reshape((side, side, 1))
    inner = slope[1:-1, 1:-1]
    return median_filter(inner, 5)[2:-2, 2:-2]

def grndvi(array):
    '''Compute (nir - (green + red)) / (nir + green + red + 1e-5) per pixel.

    Channels 1, 2 and 3 of the last axis are used as green, red and nir; each
    is clipped to [0, 1] first. The 1e-5 term keeps the denominator non-zero.

    Parameters:
      array (arr): array with at least four channels on its last axis

    Returns:
      (arr): index values with the last axis of ``array`` removed
    '''
    bands = np.clip(array[..., 1:4], 0, 1)
    green, red, nir = bands[..., 0], bands[..., 1], bands[..., 2]
    visible = green + red
    return (nir - visible) / ((nir + visible) + 1e-5)


In [17]:
from skimage.transform import resize

%run ../src/preprocessing/indices.py

def to_float32(array: np.ndarray) -> np.ndarray:
    """Return ``array`` as float32 values in [0, 1].

    Non-floating input is treated as 16-bit quantised data and divided by
    65535. Floating input is taken to be scaled already and is only cast to
    float32; a float32 array is returned as-is, without a copy.

    Parameters:
      array (arr): numeric numpy array

    Returns:
      (arr): np.float32 array of the same shape

    Raises:
      AssertionError: if non-floating input has no value above 1 (it does not
                      look quantised), or if the result exceeds 1
    """
    # Decide on the dtype rather than on the type of the first element
    if np.issubdtype(array.dtype, np.floating):
        # float64 / float16 used to trip the dtype assertion below; cast instead
        array = array.astype(np.float32, copy = False)
    else:
        assert np.max(array) > 1
        array = np.float32(array) / 65535.
    assert np.max(array) <= 1
    assert array.dtype == np.float32
    return array

# Select the plots that have imagery on disk, load and fuse Sentinel-2,
# Sentinel-1 and slope data for each of them into data_x, rebuild the label
# grid into data_y, and collect per-plot metadata in `dataframe`.
count = 0
# Placeholder first row that fixes the column layout. It is left in place at
# index 0 because the save cell drops row 0 before writing the CSV.
dataframe = pd.DataFrame({'plot_id': [''], 'lat': [0.325], 'long': [0.325],
                          'y': [0]})

# Identify shape of data to load
#plot_ids = [str(x).zfill(5) for x in plot_ids]
plot_ids_to_load = []
for plot_id in plot_ids:
    s1_i = f'{s1_path}{str(plot_id)}.hkl'
    s2_i = f'{s2_path}{str(plot_id)}.hkl'
    s1_exists = os.path.exists(s1_i)
    print(s1_exists, os.path.isfile(s2_i), s2_i)

    # A plot is usable only if both sensors have a file and it is not blacklisted
    if os.path.isfile(s2_i) and s1_exists:
        if plot_id not in bad_test_plots:#verified_lu_change:
            plot_ids_to_load.append(plot_id)

print(f"There are {len(plot_ids_to_load)} plots")
# Further plot IDs that are excluded by hand
excluded_plots = {139077414, 139187051, 139187043, 139187133, 139187134}
plot_ids_to_load = [x for x in plot_ids_to_load if x not in excluded_plots]
data_x = np.zeros((len(plot_ids_to_load), 12, 28, 28, 14)).astype(np.uint16) # 14
data_y = np.zeros((len(plot_ids_to_load), 14, 14))

# Iterate over each plot
to_remove = []
# Metadata is gathered in a plain list and turned into a frame once at the
# end; DataFrame.append no longer exists in pandas >= 2.0.
metadata_rows = []

for i, plot_id in enumerate(plot_ids_to_load):
    #print(plot_id)
    s1_i = f'{s1_path}{str(plot_id)}.hkl'
    s2_i = f'{s2_path}{str(plot_id)}.hkl'
    dem_i = f'{dem_path}{str(plot_id)}.npy'

    x = to_float32(hkl.load(s2_i))
    s1 = hkl.load(s1_i)
    # Average each 2 x 2 pixel block (32 x 32 -> 16 x 16), interpolate back to
    # 32 x 32 (order = 1), then trim a 2-pixel border to get 28 x 28
    s1 = np.reshape(s1, (12, 16, 2, 16, 2, 2))
    s1 = np.mean(s1, axis = (2, 4))
    s1 = resize(s1, (12, 32, 32, 2), order = 1)
    s1 = s1[:, 2:-2, 2:-2, :]

    dem = np.load(dem_i)
    dem = process_dem(dem)
    # The same slope layer is repeated for every step along the first axis of x
    dem = np.tile(dem.reshape((1, 28, 28)), (x.shape[0], 1, 1))
    x[..., 10] = dem
    # Sentinel-1 bands go after the Sentinel-2 bands on the channel axis
    x = np.concatenate([x, s1], axis = -1)

    count += 1
    y = reconstruct_images(plot_id)
    plot_rows = df[df['PLOT_ID'] == plot_id]
    long = np.mean(plot_rows['LON'])
    lat = np.mean(plot_rows['LAT'])
    # One metadata row per plot, in the same order as data_x / data_y
    metadata_rows.append({'plot_id': str(plot_id),
                          'lat': lat, 'long': long,
                          'y': np.sum(np.array(y))})

    if np.isnan(x).any():
        to_remove.append(i)
    else:
        x = np.clip(x, 0, 1)
        x = to_int16(x)
        data_x[i] = x
        print("X worked")
        try:
            data_y[i] = np.array(y)
        except ValueError:
            # Label grid is ragged or not 14 x 14. Drop the sample rather than
            # keep an all-zero label for it.
            print("Y didn't work")
            to_remove.append(i)

if metadata_rows:
    dataframe = pd.concat([dataframe, pd.DataFrame(metadata_rows)],
                          ignore_index = True)

# Remove any data samples that had missing values
# The save cell drops the same indices from `dataframe`, so the arrays have to
# lose them too or the rows of data_x / data_y and the plot-ID CSV drift apart.
if len(to_remove) > 0:
    print(f"Removing {to_remove}")
    data_x = np.delete(data_x, to_remove, 0)
    data_y = np.delete(data_y, to_remove, 0)

print(f"Finished loading: {data_x.shape} of {data_x.dtype} type")

True True ../data/train-s2/1.hkl
True True ../data/train-s2/2.hkl
True True ../data/train-s2/3.hkl
True True ../data/train-s2/4.hkl
True True ../data/train-s2/5.hkl
True True ../data/train-s2/6.hkl
True True ../data/train-s2/7.hkl
True True ../data/train-s2/8.hkl
True True ../data/train-s2/10.hkl
True True ../data/train-s2/12.hkl
True True ../data/train-s2/13.hkl
True True ../data/train-s2/14.hkl
True True ../data/train-s2/15.hkl
True True ../data/train-s2/16.hkl
True True ../data/train-s2/17.hkl
True True ../data/train-s2/18.hkl
True True ../data/train-s2/19.hkl
True True ../data/train-s2/20.hkl
True True ../data/train-s2/21.hkl
True True ../data/train-s2/22.hkl
True True ../data/train-s2/23.hkl
True True ../data/train-s2/24.hkl
True True ../data/train-s2/25.hkl
True True ../data/train-s2/26.hkl
True True ../data/train-s2/27.hkl
True True ../data/train-s2/28.hkl
True True ../data/train-s2/29.hkl
True True ../data/train-s2/30.hkl
True True ../data/train-s2/31.hkl
True True ../data/trai

True True ../data/train-s2/60016.hkl
True True ../data/train-s2/60017.hkl
True True ../data/train-s2/60018.hkl
True True ../data/train-s2/60019.hkl
True True ../data/train-s2/60020.hkl
True True ../data/train-s2/60021.hkl
True True ../data/train-s2/60022.hkl
True True ../data/train-s2/60023.hkl
True True ../data/train-s2/60024.hkl
True True ../data/train-s2/60026.hkl
True True ../data/train-s2/60027.hkl
True True ../data/train-s2/60028.hkl
True True ../data/train-s2/60029.hkl
True True ../data/train-s2/60031.hkl
True True ../data/train-s2/60032.hkl
True True ../data/train-s2/60034.hkl
True True ../data/train-s2/60035.hkl
True True ../data/train-s2/60036.hkl
True True ../data/train-s2/60037.hkl
True True ../data/train-s2/60038.hkl
True True ../data/train-s2/60039.hkl
True True ../data/train-s2/60040.hkl
True True ../data/train-s2/60041.hkl
True True ../data/train-s2/60042.hkl
True True ../data/train-s2/60043.hkl
True True ../data/train-s2/60044.hkl
True True ../data/train-s2/60045.hkl
T

True True ../data/train-s2/100124.hkl
True True ../data/train-s2/100125.hkl
True True ../data/train-s2/100126.hkl
True True ../data/train-s2/100127.hkl
True True ../data/train-s2/100128.hkl
True True ../data/train-s2/100129.hkl
True True ../data/train-s2/100130.hkl
True True ../data/train-s2/100131.hkl
True True ../data/train-s2/100132.hkl
True True ../data/train-s2/100133.hkl
True True ../data/train-s2/100134.hkl
True True ../data/train-s2/100135.hkl
True True ../data/train-s2/100136.hkl
True True ../data/train-s2/100137.hkl
True True ../data/train-s2/100138.hkl
True True ../data/train-s2/100139.hkl
True True ../data/train-s2/100140.hkl
True True ../data/train-s2/100141.hkl
True True ../data/train-s2/100142.hkl
True True ../data/train-s2/100143.hkl
True True ../data/train-s2/100144.hkl
True True ../data/train-s2/100145.hkl
True True ../data/train-s2/100146.hkl
True True ../data/train-s2/100147.hkl
True True ../data/train-s2/100148.hkl
True True ../data/train-s2/100149.hkl
True True ..

True True ../data/train-s2/200229.hkl
True True ../data/train-s2/200230.hkl
True True ../data/train-s2/200231.hkl
True True ../data/train-s2/200232.hkl
True True ../data/train-s2/200234.hkl
True True ../data/train-s2/200235.hkl
True True ../data/train-s2/200236.hkl
True True ../data/train-s2/200237.hkl
True True ../data/train-s2/200238.hkl
True True ../data/train-s2/200241.hkl
True True ../data/train-s2/200242.hkl
True True ../data/train-s2/200243.hkl
True True ../data/train-s2/200244.hkl
True True ../data/train-s2/200245.hkl
True True ../data/train-s2/300100.hkl
True True ../data/train-s2/300101.hkl
True True ../data/train-s2/300102.hkl
True True ../data/train-s2/300104.hkl
True True ../data/train-s2/300105.hkl
True True ../data/train-s2/300106.hkl
True True ../data/train-s2/300108.hkl
True True ../data/train-s2/300109.hkl
True True ../data/train-s2/300110.hkl
True True ../data/train-s2/300114.hkl
True True ../data/train-s2/300116.hkl
True True ../data/train-s2/300117.hkl
True True ..

True True ../data/train-s2/700113.hkl
True True ../data/train-s2/700114.hkl
True True ../data/train-s2/700115.hkl
True False ../data/train-s2/700116.hkl
True True ../data/train-s2/700117.hkl
True True ../data/train-s2/700119.hkl
True True ../data/train-s2/700120.hkl
True True ../data/train-s2/700121.hkl
True False ../data/train-s2/700122.hkl
True False ../data/train-s2/700123.hkl
True True ../data/train-s2/700124.hkl
True False ../data/train-s2/700125.hkl
True True ../data/train-s2/700126.hkl
True True ../data/train-s2/700127.hkl
True False ../data/train-s2/700128.hkl
True True ../data/train-s2/700129.hkl
True True ../data/train-s2/700130.hkl
True True ../data/train-s2/700131.hkl
True True ../data/train-s2/700132.hkl
True True ../data/train-s2/700133.hkl
True True ../data/train-s2/700135.hkl
True True ../data/train-s2/700137.hkl
True True ../data/train-s2/700139.hkl
True True ../data/train-s2/700141.hkl
True True ../data/train-s2/700142.hkl
True True ../data/train-s2/700144.hkl
True Tr

True True ../data/train-s2/900136.hkl
True True ../data/train-s2/900137.hkl
True True ../data/train-s2/900139.hkl
True True ../data/train-s2/900141.hkl
True True ../data/train-s2/900144.hkl
True True ../data/train-s2/900146.hkl
True True ../data/train-s2/900147.hkl
True True ../data/train-s2/900148.hkl
True True ../data/train-s2/900149.hkl
True True ../data/train-s2/900150.hkl
True True ../data/train-s2/900152.hkl
True True ../data/train-s2/900154.hkl
True True ../data/train-s2/900156.hkl
True True ../data/train-s2/900157.hkl
True True ../data/train-s2/900161.hkl
True True ../data/train-s2/900167.hkl
True True ../data/train-s2/900168.hkl
True True ../data/train-s2/900169.hkl
True True ../data/train-s2/900170.hkl
True True ../data/train-s2/900171.hkl
True True ../data/train-s2/900173.hkl
True False ../data/train-s2/900175.hkl
True True ../data/train-s2/900177.hkl
True False ../data/train-s2/900178.hkl
True False ../data/train-s2/900180.hkl
True True ../data/train-s2/900182.hkl
True Fals

True True ../data/train-s2/141018526.hkl
True True ../data/train-s2/141018527.hkl
True True ../data/train-s2/141018529.hkl
True True ../data/train-s2/141018531.hkl
True True ../data/train-s2/141018532.hkl
True True ../data/train-s2/141018535.hkl
True True ../data/train-s2/141018536.hkl
True True ../data/train-s2/141018537.hkl
True True ../data/train-s2/141018538.hkl
True True ../data/train-s2/141018539.hkl
True True ../data/train-s2/141018545.hkl
True True ../data/train-s2/141018546.hkl
True True ../data/train-s2/141018547.hkl
True True ../data/train-s2/141018548.hkl
True True ../data/train-s2/141018550.hkl
True True ../data/train-s2/141018553.hkl
True True ../data/train-s2/141018554.hkl
True True ../data/train-s2/141018555.hkl
True True ../data/train-s2/141018557.hkl
True True ../data/train-s2/141018558.hkl
True True ../data/train-s2/141018559.hkl
True True ../data/train-s2/141018562.hkl
True True ../data/train-s2/141018563.hkl
True True ../data/train-s2/141018564.hkl
True True ../dat

True True ../data/train-s2/141370633.hkl
True True ../data/train-s2/141370634.hkl
True True ../data/train-s2/141370635.hkl
True True ../data/train-s2/141370636.hkl
True True ../data/train-s2/141370637.hkl
True True ../data/train-s2/141370638.hkl
True True ../data/train-s2/141370639.hkl
True True ../data/train-s2/141370640.hkl
True True ../data/train-s2/141370641.hkl
True True ../data/train-s2/141370642.hkl
True True ../data/train-s2/141370643.hkl
True True ../data/train-s2/141370644.hkl
True True ../data/train-s2/141370645.hkl
True True ../data/train-s2/141370646.hkl
True True ../data/train-s2/141370647.hkl
True True ../data/train-s2/141370648.hkl
True True ../data/train-s2/141370649.hkl
True True ../data/train-s2/141370650.hkl
True True ../data/train-s2/141370651.hkl
True True ../data/train-s2/141370652.hkl
True True ../data/train-s2/141370653.hkl
True True ../data/train-s2/141370654.hkl
True True ../data/train-s2/141370655.hkl
True True ../data/train-s2/141370656.hkl
True True ../dat

X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X

X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X

X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X worked
X

In [18]:
import hickle as hkl
# Drop the placeholder first row of the metadata frame and renumber, so that
# row i lines up with the i-th loaded plot.
# `axis` is passed by keyword: pandas >= 2.0 rejects it as a positional argument.
dataframe = dataframe.drop(0, axis = 0)
dataframe.reset_index(inplace = True, drop = True)
# Discard the metadata of the samples that were flagged for removal
if len(to_remove) > 0:
    dataframe = dataframe.drop(to_remove, axis = 0)
    dataframe.reset_index(inplace = True, drop = True)

print(f"Writing {source} data")
# Make sure the target directory exists before anything is written into it
os.makedirs(f"../data/{source}/", exist_ok = True)
hkl.dump(data_x, f"../data/{source}/{source}_x.hkl", mode='w', compression='gzip')
hkl.dump(data_y, f"../data/{source}/{source}_y.hkl", mode='w', compression='gzip')
dataframe.to_csv(f"../data/{source}/{source}_plot_ids.csv", index = False)
print("Finished!")