# Data to Pandas table

**Purpose of script:**

Merge microwave (mw) and optical (opt) data into one pandas DataFrame, one row per pixel.

- In: opt and mw data (all files)
- Out: one parquet file per date (`melt_<date>.parquet.gzip`), each holding a plain dataframe of combined mw and opt values.

In [1]:
import xarray
import pandas as pd

from os import listdir
from os.path import isfile, join

Relevant Paths

In [2]:
# Directory that main() lists for microwave (mw) input files.
# Keep the trailing slash on these directory paths.
mw_path = r"../Data/microwave-rs/mw_interpolated/"
# Directory that main() lists for optical (opt) input files.
opt_path = r"../Data/optical-rs/"

# Directory that receives the 'melt_<date>.parquet.gzip' files written by main().
out_path = r"../Data/combined/dataframe_plain/"

Functions

In [3]:
def raster_to_pandas(data, mw_or_opt: str) -> pd.DataFrame:
    """Flatten a raster into a three-column dataframe, one row per pixel.

    Parameters
    ----------
    data
        Any object exposing ``to_dataframe()`` whose result, once its index
        is reset, contains the columns ``x``, ``y`` and ``band_data``
        (main() passes the result of ``xarray.open_dataarray``).
    mw_or_opt
        ``'mw'``/``'microwave'`` names the value column ``mw_value``;
        ``'opt'``/``'optical'`` names it ``opt_value`` and replaces missing
        values with ``-1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` and the renamed value column. For an
        unrecognised ``mw_or_opt`` the message ``'mw or opt?'`` is printed
        and the value column keeps the name ``band_data``.
    """
    # Move the index levels into regular columns, then keep only the
    # coordinates and the value (drops e.g. spatial_ref and band).
    frame = data.to_dataframe().reset_index()[['x', 'y', 'band_data']]

    if mw_or_opt in ('mw', 'microwave'):
        return frame.rename(columns={'band_data': 'mw_value'})

    if mw_or_opt in ('opt', 'optical'):
        frame = frame.rename(columns={'band_data': 'opt_value'})
        # Plain assignment instead of `frame[col].fillna(..., inplace=True)`:
        # the chained in-place form is deprecated and silently does nothing
        # under pandas Copy-on-Write.
        frame['opt_value'] = frame['opt_value'].fillna(-1)
        return frame

    # Unknown mode: keep the original lenient behaviour (warn, don't raise).
    print('mw or opt?')
    return frame

In [4]:
def main(mw_path, opt_path, out_path):
    """Merge same-date microwave and optical rasters and write them as parquet.

    For every file in ``opt_path``, the first 10 characters of its name are
    taken as the date. Each file in ``mw_path`` whose name starts with that
    date is paired with it; both rasters are flattened with
    ``raster_to_pandas`` and left-merged on ``(y, x)`` with the microwave
    frame on the left. The result is written to
    ``<out_path>/melt_<date>.parquet.gzip``. Each processed date is printed.

    Parameters
    ----------
    mw_path : str
        Directory containing the microwave raster files.
    opt_path : str
        Directory containing the optical raster files.
    out_path : str
        Directory for the parquet output; created if it does not exist.

    Returns
    -------
    None
    """
    # Local import so this cell does not depend on editing the import cell.
    from os import makedirs

    mw_files = [f for f in listdir(mw_path) if isfile(join(mw_path, f))]
    opt_files = [f for f in listdir(opt_path) if isfile(join(opt_path, f))]

    # Fail early / avoid a late to_parquet error when the target is missing.
    if out_path:
        makedirs(out_path, exist_ok=True)

    # The optical files drive the loop; per the original author, opt files
    # have missing data so iterating the other way round would give errors.
    for opt_file in opt_files:
        melt_date = opt_file[:10]
        for mw_file in mw_files:
            if not mw_file.startswith(melt_date):
                continue
            print(melt_date)

            # join() instead of '+' so paths work with or without a trailing
            # slash; the context managers close the file handles, which
            # otherwise stay open for every date processed.
            with xarray.open_dataarray(join(opt_path, opt_file)) as data_opt, \
                    xarray.open_dataarray(join(mw_path, mw_file)) as data_mw:
                df_mw = raster_to_pandas(data_mw, 'mw')
                df_opt = raster_to_pandas(data_opt, 'opt')

            # Left merge: keep every microwave pixel, attach the optical
            # value where the (y, x) coordinates match.
            df_combined = pd.merge(df_mw, df_opt, how='left', on=['y', 'x'])

            # NOTE(review): the '.gzip' suffix is only part of the file name;
            # no `compression` argument is passed, so pandas' default codec
            # is used - confirm whether gzip was intended.
            out_file = join(out_path, 'melt_' + melt_date + '.parquet.gzip')
            df_combined.to_parquet(out_file, index=False)
                

Main

In [5]:
# Process every optical file that has a microwave file starting with the same
# date; each processed date is printed below.
main(mw_path, opt_path, out_path)

2019-06-08
2019-06-10
2019-06-11
2019-06-12
2019-06-13
2019-06-14
2019-06-15
2019-06-16
2019-06-17
2019-06-18
2019-06-19
2019-06-20
2019-06-21
2019-06-22
2019-06-23
2019-06-24
2019-06-25
2019-06-26
2019-06-27
2019-06-28
2019-06-29
2019-06-30
2019-07-01
2019-07-02
2019-07-03
2019-07-04
2019-07-05
2019-07-06
2019-07-07
2019-07-08
2019-07-09
2019-07-10
2019-07-11
2019-07-12
2019-07-13
2019-07-14
2019-07-15
2019-07-16
2019-07-17
2019-07-18
2019-07-19
2019-07-20
2019-07-21
2019-07-22
2019-07-23
2019-07-24
2019-07-25
2019-07-26
2019-07-27
2019-07-28
2019-07-29
2019-07-30
2019-07-31
