# Data to Pandas table

**Purpose of script:**

Merge microwave (mw) and optical (opt) data into one pandas dataframe per date, one row per pixel.

- In: opt and mw data (all files)
- Out: one parquet file per date (`melt_<date>.parquet.gzip`), each holding a plain dataframe of combined mw and opt values.

In [1]:
import xarray
import pandas as pd

from os import listdir
from os.path import isfile, join

Relevant Paths

In [2]:
# Input folders. Each string keeps its trailing slash.
mw_path = "../Data/microwave-rs/mw_interpolated/"
opt_path = "../Data/optical-rs/unzipped/"

# Folder handed to main() as the destination for the combined parquet files.
out_path = "../Data/combined/dataframe_plain/"

Functions

In [3]:
def raster_to_pandas(data, mw_or_opt):
    """Flatten a raster into a long dataframe with one row per pixel.

    Args:
        data: Object exposing ``to_dataframe()`` (in this notebook the
            result of ``xarray.open_dataarray``). After the index is reset
            the frame must provide the columns ``x``, ``y`` and
            ``band_data``.
        mw_or_opt: Source selector. ``'mw'`` / ``'microwave'`` renames the
            value column to ``mw_value``; ``'opt'`` / ``'optical'`` renames
            it to ``opt_value`` and replaces missing values with ``-1``.

    Returns:
        A dataframe with the columns ``x``, ``y`` and the renamed value
        column. For an unrecognised selector the message ``'mw or opt?'``
        is printed and the value column keeps the name ``band_data``.
    """
    # Index levels become ordinary columns; every column other than the
    # coordinates and the value (e.g. spatial_ref, band) is dropped.
    frame = data.to_dataframe().reset_index()[['x', 'y', 'band_data']]

    if mw_or_opt in ('mw', 'microwave'):
        frame = frame.rename(columns={'band_data': 'mw_value'})
    elif mw_or_opt in ('opt', 'optical'):
        frame = frame.rename(columns={'band_data': 'opt_value'})
        # Masked optical pixels arrive as NaN and are encoded as -1.
        # Reassigning the column (instead of an in-place fillna on a column
        # selection) keeps the fill effective under pandas Copy-on-Write.
        # NOTE(review): presumably -1 lets masked pixels be told apart from
        # NaNs introduced by a later merge -- confirm with the author.
        frame['opt_value'] = frame['opt_value'].fillna(-1)
    else:
        # Kept as a soft warning so existing callers are not broken.
        print('mw or opt?')
    return frame

In [4]:
def main(mw_path, opt_path, out_path):
    """Combine microwave and optical rasters into one parquet file per date.

    Every optical file is paired with each microwave file whose name starts
    with the same 10-character prefix (the date taken from the optical file
    name). Each pair is flattened with ``raster_to_pandas``, merged on the
    pixel coordinates and written to ``melt_<date>.parquet.gzip`` inside
    ``out_path``.

    Args:
        mw_path: Folder containing the microwave files.
        opt_path: Folder containing the optical files.
        out_path: Existing folder that receives the parquet files.

    Returns:
        None. The date prefix of every processed pair is printed as progress.
    """
    # Sorted so the processing order does not depend on the filesystem.
    mw_files = sorted(f for f in listdir(mw_path) if isfile(join(mw_path, f)))
    opt_files = sorted(f for f in listdir(opt_path) if isfile(join(opt_path, f)))

    # Original author's note: opt files have missing data, so driving the
    # loop from the mw files instead would give errors.
    for opt_file in opt_files:
        melt_date = opt_file[:10]
        for mw_file in mw_files:
            if not mw_file.startswith(melt_date):
                continue
            print(melt_date)
            # join() works with or without a trailing separator on the
            # folder arguments; the with-block closes both files once their
            # values have been copied into dataframes.
            with xarray.open_dataarray(join(opt_path, opt_file)) as data_opt, \
                    xarray.open_dataarray(join(mw_path, mw_file)) as data_mw:
                df_mw = raster_to_pandas(data_mw, 'mw')
                df_opt = raster_to_pandas(data_opt, 'opt')
            # Left merge: every mw pixel is kept; opt values are attached
            # where the (y, x) coordinates match.
            df_combined = pd.merge(df_mw, df_opt, how='left', on=['y', 'x'])
            # The file name advertises gzip, so request it explicitly
            # (pandas would otherwise use its default codec, snappy).
            df_combined.to_parquet(
                join(out_path, 'melt_' + melt_date + '.parquet.gzip'),
                index=False,
                compression='gzip',
            )
                

In [5]:
# Module-level listings of the regular files in both input folders.
# main() builds the same listings itself and does not read these.
mw_files = list(filter(lambda name: isfile(join(mw_path, name)), listdir(mw_path)))
opt_files = list(filter(lambda name: isfile(join(opt_path, name)), listdir(opt_path)))

Main

In [7]:
# Run the conversion for the configured folders; one date is printed per processed pair.
main(mw_path=mw_path, opt_path=opt_path, out_path=out_path)

2017-05-01
2017-05-02
2017-05-03
2017-05-04
2017-05-05
2017-05-06
2017-05-07
2017-05-08
2017-05-09
2017-05-10
2017-05-11
2017-05-12
2017-05-13
2017-05-14
2017-05-15
2017-05-16
2017-05-17
2017-05-18
2017-05-19
2017-05-20
2017-05-21
2017-05-22
2017-05-23
2017-05-24
2017-05-25
2017-05-26
2017-05-27
2017-05-28
2017-05-29
2017-05-30
2017-05-31
2017-06-01
2017-06-02
2017-06-03
2017-06-04
2017-06-05
2017-06-06
2017-06-08
2017-06-09
2017-06-10
2017-06-11
2017-06-12
2017-06-13
2017-06-14
2017-06-15
2017-06-16
2017-06-17
2017-06-18
2017-06-19
2017-06-20
2017-06-21
2017-06-22
2017-06-23
2017-06-24
2017-06-25
2017-06-26
2017-06-27
2017-06-28
2017-06-29
2017-06-30
2017-07-01
2017-07-02
2017-07-03
2017-07-04
2017-07-05
2017-07-06
2017-07-07
2017-07-08
2017-07-09
2017-07-10
2017-07-11
2017-07-12
2017-07-13
2017-07-14
2017-07-15
2017-07-16
2017-07-17
2017-07-18
2017-07-19
2017-07-20
2017-07-21
2017-07-22
2017-07-23
2017-07-24
2017-07-25
2017-07-26
2017-07-27
2017-07-28
2017-07-29
2017-07-30
2017-07-31