# Process WFABBA, Merge Files

<b>Summary:</b><br>
Reads in raw WFABBA GOES-16 & WFABBA GOES-17 data from individual text files and outputs as combined csv files<br>

- Parse WFABBA data from individual text files
- Merge WFABBA data into unified csv files (based off of time and satellite name)

<b>Output:</b><br>
../..<br>
└── data<br>
&emsp;&emsp;&emsp;└── processed<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-16-2019.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-17-2019.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-16-2020.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-17-2020.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-16-2021.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-17-2021.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-16-Jan-2021.csv<br>
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;└── GOES-17-Jan-2021.csv<br>

<b>Instructions:</b><br>
- See README.md in directory

<b>Areas for Improvement:</b><br>
Current processing time is about 3160 seconds. Need to consider faster processing methods.

In [6]:
import numpy as np
import pandas as pd
import datetime as dt
import pathlib
import os
import time
from tqdm import tqdm
# from multiprocessing import Pool

In [7]:
metadataColNames = ["Algorithm", "Version", "Timestamp", "Satellite", "Instrument", "FlightModel", "ScanMode", "ProductType", "FileName", "DataSource",
               "DataCreationTimestamp", "NavProjSubPtLong", "ActualSatSubPtLong", "NumFire", "MissingValueCode"]

dataColNames = ['Latitude', 'Longitude', 'Code', 'FRP', 'Fire Size', 'Fire Temp', 'Line', 'Element', 'Pixel Size',  'Obs BT4', 'Obs BT11', 'Bkg BT4', 'Bkg BT11', 'SolZen',  'SatZen', 'RelAzi', 'Eco']

allColNames = metadataColNames + dataColNames
rawDir = "../../data/raw/wfabba/"
processedDir = "../../data/processed/wfabba/"

In [8]:
starttime = time.time()
wfabba_dates = ["GOES-16-2019", "GOES-17-2019", "GOES-16-2020", "GOES-17-2020", 
    "GOES-16-2021", "GOES-17-2021", "GOES-16-Jan-2021", "GOES-17-Jan-2021", "GOES-16-2022", "GOES-17-2022"]

for wfabba_date in wfabba_dates:
    masterArr = []
    print(wfabba_date)

    targetDir = "../../data/raw/wfabba/" + wfabba_date

    # recursively visit every directory of raw data
    for root, subdirs, files in tqdm(sorted(os.walk(targetDir))):
        # print(root)
        # print(len(masterArr))

        #if there are any files in the directory to parse
        if len(files) > 0:
            # process files with .GOES-16, .GOES-17, .GOES-16.txt, or .GOES-17.txt
            if files[0][-8:] == ".GOES-16" or files[0][-8:] == ".GOES-17" or files[0][-12:] == ".GOES-16.txt" or files[0][-12:] == ".GOES-17.txt" :
                #loop through each file
                for file in files:
                    filepath = root+"/"+file
                    
                    fo = open(filepath ,'r')
                    metadataDict = {}
                    dataArr = []
                    lineCount = 0
                    
                    try:
                        # read every line in the file
                        for line in fo:
                            
                            # read in metadata
                            if line[0:4] == "### ":
                                # logic for first 11 lines
                                if lineCount < 11:
                                    parseString = line[4:]
                                    parseString = parseString.strip()
                                    
                                    # timestamp parsing logic
                                    if parseString[0:4] == 'Date':
                                        if parseString == "Date: , Time:  UTC":
                                            break
                                        year = parseString[6:10]
                                        day_num = parseString[10:13]
                                        hr = parseString[21:23]
                                        min = parseString[24:26]
                                        sec = parseString[27:29]
                                        
                                        #break out of loop if the timestamp data is all 0's
                                        if year == "0000" and day_num == "000" and hr == "00" and min == "00" and sec == "00":
                                            break
                                        
                                        # create timestamp from time data
                                        res = dt.datetime.strptime(year + "-" + day_num + " " + hr + ":" + min + ":" + sec, "%Y-%j %H:%M:%S")
                                        metadataDict["Timestamp"] = res
                                    else:
                                        # parse column headers for data
                                        splitData = parseString.split(", ")
                                        for item in splitData:
                                            if item[-9:] == 'Algorithm':
                                                metadataDict["Algorithm"] = item
                                            else:
                                                sections = item.split(":")

                                                if sections[0] == 'Flight Model':
                                                    keyname = 'FlightModel'
                                                elif sections[0] == 'Scan Mode':
                                                    keyname = 'ScanMode'
                                                elif sections[0] == 'Product type':
                                                    keyname = 'ProductType'
                                                elif sections[0] == 'Product/L2 filename':
                                                    keyname = 'FileName'
                                                elif sections[0] == 'Data source':
                                                    keyname = 'DataSource'
                                                elif sections[0] == 'Data creation time stamp':
                                                    keyname = 'DataCreationTimestamp'
                                                elif sections[0] == 'Navigation projection subpoint longitude':
                                                    keyname = 'NavProjSubPtLong'
                                                elif sections[0] == 'Actual satellite subpoint longitude':
                                                    keyname = 'ActualSatSubPtLong'
                                                elif sections[0] == 'Number of detected fires':
                                                    keyname = 'NumFire'
                                                elif sections[0] == 'Missing value code':
                                                    keyname = 'MissingValueCode'
                                                else:
                                                    keyname = sections[0]

                                                if len(sections) > 1:
                                                    if sections[1] == '':
                                                        value = None
                                                    else:
                                                        value = sections[1].strip()
                                                    metadataDict[keyname] = value
                                    lineCount+=1
                            # parse the actual data
                            elif line[0:3] != '###':
                                line_list = line.replace(' ', '').replace("\n","").split(',')
                                app_dict = {dataColNames[i]: line_list[i] for i in range(len(dataColNames))}

                                #keep 32 <= latitudes <= 35 and -123 <= longitude <= -113
                                if (float(app_dict["Latitude"]) >= 32) and (float(app_dict["Latitude"]) <= 35) and (float(app_dict["Longitude"]) >= -123) and (float(app_dict["Longitude"]) <= -113):
                                    dataArr.append(app_dict)
                                    
                    # error handling: print out the name of the file that is causing issues and break out of loop
                    except:
                        print("Error at " + filepath)
                        break
                        
                    # combine metadata and data into master array
                    for dataDict in dataArr:
                        allDataDict = {}
                        allDataDict.update(metadataDict)
                        allDataDict.update(dataDict)
                        masterArr.append(allDataDict)
                        
                    fo.close()
    
    #create dataframe out of master array
    df = pd.DataFrame(masterArr, columns=allColNames)
    endtime = time.time()
    print(endtime-starttime)
    df.to_csv(processedDir + wfabba_date + ".csv")
    print("==========================")

GOES-16-2019


100%|██████████| 214/214 [41:30<00:00, 11.64s/it]


2496.2598598003387
GOES-17-2019


100%|██████████| 214/214 [41:15<00:00, 11.57s/it]


4981.038531780243
GOES-16-2020


100%|██████████| 364/364 [1:11:55<00:00, 11.86s/it]


9311.159327507019
GOES-17-2020


  1%|          | 2/364 [00:05<17:36,  2.92s/it]

Error at ../../data/raw/wfabba/GOES-17-2020\2020_01_01_001/GOES-17_SSEC2020run_6.5.012g.tar.gz


100%|██████████| 364/364 [1:08:41<00:00, 11.32s/it]


13447.257733106613
GOES-16-2021


100%|██████████| 343/343 [02:13<00:00,  2.57it/s]


13588.878423213959
GOES-17-2021


100%|██████████| 339/339 [11:05<00:00,  1.96s/it]


14259.664109230042
GOES-16-Jan-2021


100%|██████████| 32/32 [05:37<00:00, 10.55s/it]


14602.111042022705
GOES-17-Jan-2021


100%|██████████| 32/32 [05:09<00:00,  9.66s/it]


14912.220679283142


  0%|          | 0/165 [00:00<?, ?it/s]

GOES-16-2022


100%|██████████| 165/165 [00:42<00:00,  3.90it/s]


14955.231473445892
GOES-17-2022


100%|██████████| 125/125 [01:46<00:00,  1.17it/s]


15062.290337562561
