# Create csv of file list

In [1]:
from os import listdir, path, makedirs
import re
from datetime import datetime, timedelta, time
from glob import glob
import pandas as pd

### Image directory

In [2]:
# Root directory of the original data, followed by the per-product
# sub-directories that sit directly underneath it.
original_dir = '/n/mickley/lab/HMS_vision/original/'
band1_dir, band3_dir, hms_dir, daynight_dir = (
    path.join(original_dir, subdir)
    for subdir in ('band1', 'band3', 'HMS', 'DayNight')
)

In [3]:
# Meteorological variable names and a lookup from each name to its directory.
meteo_vars = ['CLDTOT', 'U10M', 'V10M', 'QV2M']
meteo_dirs = {}
for meteo_var in meteo_vars:
    # Join the loop variable itself, not the literal string 'meteo_var';
    # otherwise every variable would map to the same directory.
    # NOTE(review): assumes files live under original_dir/<VAR> — confirm
    # against the on-disk layout.
    meteo_dirs[meteo_var] = path.join(original_dir, meteo_var)

## Define patterns to extract timestamp from images
### GOES pattern

In [4]:
# Take the basename of the first PNG found below band1_dir as an example
# GOES filename, and display it.
band1_png_files = glob(path.join(band1_dir, "**", "*.png"), recursive=True)
sample = path.basename(band1_png_files[0])
sample

'RadF-M3C01-s20180012000.png'

In [5]:
# GOES filename layout: RadF-M3C0<band>-s<year:4><day:3><hour:2><minute:2>.png
goes_regex = r"RadF-M3C0(?P<band>\d?)-s(?P<year>\d{4}?)(?P<day>\d{3}?)(?P<hour>\d{2}?)(?P<minute>\d{2}?)\.png"
goes_pattern = re.compile(goes_regex, flags=re.VERBOSE)

In [6]:
def extract_GOES(s):
    """Parse a GOES image filename into a timestamp floored to the half hour.

    Args:
        s: File name (basename) to match against ``goes_pattern``.

    Returns:
        A ``datetime`` built from the captured year, day-of-year, hour and
        minute fields (minute snapped down to 0 or 30), or ``None`` when the
        name does not match the pattern.
    """
    parsed = goes_pattern.match(s)
    if not parsed:
        return None

    year = int(parsed.group('year'))
    day_of_year = int(parsed.group('day'))
    hour = int(parsed.group('hour'))
    raw_minute = int(parsed.group('minute'))

    # Snap the minute down to the start of its half-hour slot.
    half_hour = 30 if raw_minute >= 30 else 0

    # Day-of-year is 1-based, so day 1 adds no offset to January 1st.
    offset = timedelta(days=day_of_year - 1, hours=hour, minutes=half_hour)
    return datetime(year, 1, 1) + offset

In [7]:
# Sanity check: parse the sample GOES filename with extract_GOES.
extract_GOES(sample)

datetime.datetime(2018, 1, 1, 20, 0)

### HMS pattern

In [8]:
# Take the basename of the first GeoTIFF found below hms_dir as an example
# HMS filename, and display it.
hms_tif_files = glob(path.join(hms_dir, "**", "*.tif"), recursive=True)
hms_sample = path.basename(hms_tif_files[0])
hms_sample

'HMS_Density_20180427_1900.tif'

In [9]:
# HMS filename layout: HMS_Density_<year:4><month:2><day:2>_<hour:2><minute:2>.tif
hms_regex = r"HMS_Density_(?P<year>\d{4}?)(?P<month>\d{2}?)(?P<day>\d{2}?)_(?P<hour>\d{2}?)(?P<minute>\d{2}?)\.tif"
hms_pattern = re.compile(hms_regex, flags=re.VERBOSE)

In [10]:
def extract_HMS(s):
    """Parse an HMS density filename into a timestamp floored to the half hour.

    Args:
        s: File name (basename) to match against ``hms_pattern``.

    Returns:
        A ``datetime`` built from the captured year, month, day, hour and
        minute fields, with the minute snapped down to 0 or 30.

    Raises:
        ValueError: If ``s`` does not match ``hms_pattern``.
    """
    match = hms_pattern.match(s)
    if not match:
        # Raise instead of returning an exception instance, which callers
        # would otherwise silently store as if it were a timestamp.
        raise ValueError(f"not an HMS density filename: {s!r}")

    year = int(match.group('year'))
    month = int(match.group('month'))
    day = int(match.group('day'))
    hour = int(match.group('hour'))
    # Snap the minute down to the start of its half-hour slot.
    minute = 0 if int(match.group('minute')) < 30 else 30
    return datetime(year=year, month=month, day=day, hour=hour, minute=minute)

In [11]:
# Sanity check: parse the sample HMS filename with extract_HMS.
extract_HMS(hms_sample)

datetime.datetime(2018, 4, 27, 19, 0)

### Meteo pattern

In [None]:
# NOTE(review): this still globs hms_dir for *.tif, which looks copied from the
# HMS cell — point it at the meteorological files once their directory and
# extension are confirmed.
met_sample = path.basename(glob(path.join(hms_dir, "**", "*.tif"), recursive=True)[0])
# Display the variable assigned in this cell (it previously echoed hms_sample).
met_sample

### Daynight pattern

In [13]:
# Use the alphabetically first entry of daynight_dir as an example filename,
# and display it.
daynight_entries = sorted(listdir(daynight_dir))
daynight_sample = daynight_entries[0]
daynight_sample

'DayNight_001_0000.png'

In [14]:
# DayNight filename layout: DayNight_<yday:3>_<hour:2><minute:2>.png
daynight_regex = r"DayNight_(?P<yday>\d{3}?)_(?P<hour>\d{2}?)(?P<minute>\d{2}?)\.png"
daynight_pattern = re.compile(daynight_regex, flags=re.VERBOSE)

In [15]:
def extract_daynight(s):
    """Parse a DayNight mask filename into its (yday, hour, minute) fields.

    Args:
        s: File name (basename) to match against ``daynight_pattern``.

    Returns:
        A tuple ``(yday, hour, minute)`` of ints, with the minute snapped
        down to 0 or 30.

    Raises:
        ValueError: If ``s`` does not match ``daynight_pattern``.
    """
    match = daynight_pattern.match(s)
    if not match:
        # Raise instead of returning an exception instance, which callers
        # would then fail to unpack with a confusing TypeError.
        raise ValueError(f"not a DayNight filename: {s!r}")

    yday = int(match.group('yday'))
    hour = int(match.group('hour'))
    # Snap the minute down to the start of its half-hour slot.
    minute = 0 if int(match.group('minute')) < 30 else 30
    return yday, hour, minute

In [16]:
# Sanity check: parse the sample DayNight filename with extract_daynight.
extract_daynight(daynight_sample)

(1, 0, 0)

## Create csv for file list
### GOES

In [17]:
# Glob patterns ("**" wildcard plus "*.png") for the PNG files below each
# band directory.
png_glob_tail = ("**", "*.png")
band1_path = path.join(band1_dir, *png_glob_tail)
band3_path = path.join(band3_dir, *png_glob_tail)

In [18]:
# Keep only the files whose basename matches the GOES naming pattern.
# Built as lists rather than filter() iterators: an iterator is exhausted after
# one pass, so re-running a later cell that consumes it would see no files.
band1_path_list = [
    file_path
    for file_path in glob(band1_path, recursive=True)
    if goes_pattern.match(path.basename(file_path))
]
band3_path_list = [
    file_path
    for file_path in glob(band3_path, recursive=True)
    if goes_pattern.match(path.basename(file_path))
]

In [19]:
# One-column tables of file paths, one row per matching file.
# Building each frame in a single call avoids creating and concatenating a
# one-row DataFrame per file, and yields an empty frame (instead of pd.concat
# raising) when no file matched.
band1_df = pd.DataFrame(list(band1_path_list), columns=['path_band1'])
band3_df = pd.DataFrame(list(band3_path_list), columns=['path_band3'])

In [20]:
# Derive each band-1 file's timestamp from its basename, then index the table
# by that timestamp in chronological order.
band1_timestamps = band1_df['path_band1'].apply(
    lambda filepath: extract_GOES(path.basename(filepath))
)
band1_df = (
    band1_df
    .assign(timestamp=band1_timestamps)
    .set_index('timestamp')
    .sort_index()
)

In [21]:
# Preview the first rows of the timestamp-indexed band-1 file table.
band1_df.head()

Unnamed: 0_level_0,path_band1
timestamp,Unnamed: 1_level_1
2018-01-01 20:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 20:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 21:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 21:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 22:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...


In [22]:
# Derive each band-3 file's timestamp from its basename, then index the table
# by that timestamp in chronological order.
band3_timestamps = band3_df['path_band3'].apply(
    lambda filepath: extract_GOES(path.basename(filepath))
)
band3_df = (
    band3_df
    .assign(timestamp=band3_timestamps)
    .set_index('timestamp')
    .sort_index()
)

In [23]:
# Preview the first rows of the timestamp-indexed band-3 file table.
band3_df.head()

Unnamed: 0_level_0,path_band3
timestamp,Unnamed: 1_level_1
2018-01-01 20:00:00,/n/mickley/lab/HMS_vision/original/band3/RadF-...
2018-01-01 20:30:00,/n/mickley/lab/HMS_vision/original/band3/RadF-...
2018-01-01 21:00:00,/n/mickley/lab/HMS_vision/original/band3/RadF-...
2018-01-01 21:30:00,/n/mickley/lab/HMS_vision/original/band3/RadF-...
2018-01-01 22:00:00,/n/mickley/lab/HMS_vision/original/band3/RadF-...


### HMS

In [24]:
# Glob pattern ("**" wildcard plus "*.tif") for the GeoTIFF files below hms_dir.
hms_subtree = path.join(hms_dir, "**")
hms_path = path.join(hms_subtree, "*.tif")

In [25]:
# Keep only the files whose basename matches the HMS naming pattern.
# Built as a list rather than a filter() iterator: an iterator is exhausted
# after one pass, so re-running a later cell that consumes it would see no files.
hms_path_list = [
    file_path
    for file_path in glob(hms_path, recursive=True)
    if hms_pattern.match(path.basename(file_path))
]

In [26]:
# One-column table of HMS file paths, one row per matching file.
# Building the frame in a single call avoids creating and concatenating a
# one-row DataFrame per file, and yields an empty frame (instead of pd.concat
# raising) when no file matched.
hms_df = pd.DataFrame(list(hms_path_list), columns=['path_hms'])

In [27]:
# Derive each HMS file's timestamp from its basename, then index the table by
# that timestamp in chronological order.
hms_timestamps = hms_df['path_hms'].apply(
    lambda filepath: extract_HMS(path.basename(filepath))
)
hms_df = (
    hms_df
    .assign(timestamp=hms_timestamps)
    .set_index('timestamp')
    .sort_index()
)

In [28]:
# Preview the first rows of the timestamp-indexed HMS file table.
hms_df.head()

Unnamed: 0_level_0,path_hms
timestamp,Unnamed: 1_level_1
2018-01-01 20:30:00,/n/mickley/lab/HMS_vision/original/HMS/HMS_Den...
2018-01-01 21:00:00,/n/mickley/lab/HMS_vision/original/HMS/HMS_Den...
2018-01-01 21:30:00,/n/mickley/lab/HMS_vision/original/HMS/HMS_Den...
2018-01-01 22:00:00,/n/mickley/lab/HMS_vision/original/HMS/HMS_Den...
2018-01-01 22:30:00,/n/mickley/lab/HMS_vision/original/HMS/HMS_Den...


### Daynight

In [29]:
# Glob pattern ("**" wildcard plus "*.png") for the PNG files below daynight_dir.
daynight_subtree = path.join(daynight_dir, "**")
daynight_path = path.join(daynight_subtree, "*.png")

In [30]:
# Keep only the files whose basename matches the DayNight naming pattern.
# Built as a list rather than a filter() iterator: an iterator is exhausted
# after one pass, so re-running a later cell that consumes it would see no files.
daynight_path_list = [
    file_path
    for file_path in glob(daynight_path, recursive=True)
    if daynight_pattern.match(path.basename(file_path))
]

In [31]:
# One-column table of DayNight file paths, one row per matching file.
# Building the frame in a single call avoids creating and concatenating a
# one-row DataFrame per file, and yields an empty frame (instead of pd.concat
# raising) when no file matched.
daynight_df = pd.DataFrame(list(daynight_path_list), columns=['path_daynight'])

In [32]:
def extract_daynight_timestamp(daynight_filename):
    """Return the yday/hour/minute fields of a DayNight file path as a Series.

    The path is reduced to its basename and parsed with ``extract_daynight``;
    the three resulting values are labelled 'yday', 'hour' and 'minute'.
    """
    yday, hour, minute = extract_daynight(path.basename(daynight_filename))
    return pd.Series([yday, hour, minute], index=['yday', 'hour', 'minute'])


# Parse every path into its three fields and attach them as columns, aligning
# the two tables on their shared row index.
daynight_fields = daynight_df['path_daynight'].apply(extract_daynight_timestamp)
daynight_df = daynight_df.merge(daynight_fields, left_index=True, right_index=True)

In [33]:
# Index the DayNight table by (yday, hour, minute) and sort it on that key.
daynight_keys = ['yday', 'hour', 'minute']
daynight_df = daynight_df.set_index(daynight_keys)
daynight_df = daynight_df.sort_index()

In [34]:
# Display the DayNight file table, indexed by (yday, hour, minute).
daynight_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,path_daynight
yday,hour,minute,Unnamed: 3_level_1
1,0,0,/n/mickley/lab/HMS_vision/original/DayNight/Da...
1,0,30,/n/mickley/lab/HMS_vision/original/DayNight/Da...
1,1,0,/n/mickley/lab/HMS_vision/original/DayNight/Da...
1,1,30,/n/mickley/lab/HMS_vision/original/DayNight/Da...
1,2,0,/n/mickley/lab/HMS_vision/original/DayNight/Da...
...,...,...,...
366,21,30,/n/mickley/lab/HMS_vision/original/DayNight/Da...
366,22,0,/n/mickley/lab/HMS_vision/original/DayNight/Da...
366,22,30,/n/mickley/lab/HMS_vision/original/DayNight/Da...
366,23,0,/n/mickley/lab/HMS_vision/original/DayNight/Da...


## Remove duplicates
### GOES

In [35]:
# Move the timestamp back into a column, flag rows whose timestamp repeats an
# earlier row, and print how many such rows there are.
band1_df_mod = band1_df.reset_index()
duplicated_index = band1_df_mod['timestamp'].duplicated()
n_band1_duplicates = duplicated_index.sum()
print(n_band1_duplicates)

1


In [36]:
# Show every band-1 row whose timestamp occurs more than once.
# keep=False flags all members of a duplicate group, so this covers every
# duplicated timestamp and yields an empty frame (rather than an IndexError
# from .iat[0]) when there are no duplicates at all.
duplicated_elements = band1_df_mod[band1_df_mod.timestamp.duplicated(keep=False)]
print(duplicated_elements)
print(duplicated_elements.values)

               timestamp                                         path_band1
5734 2018-09-17 15:00:00  /n/mickley/lab/HMS_vision/original/band1/RadF-...
5735 2018-09-17 15:00:00  /n/mickley/lab/HMS_vision/original/band1/RadF-...
[[Timestamp('2018-09-17 15:00:00')
  '/n/mickley/lab/HMS_vision/original/band1/RadF-M3C01-s20182601500.png']
 [Timestamp('2018-09-17 15:00:00')
  '/n/mickley/lab/HMS_vision/original/band1/RadF-M3C01-s20182601515.png']]


In [37]:
# Drop every row that is followed by another row with the same timestamp, i.e.
# keep the last row per timestamp. For a single duplicated pair this removes
# the same row as dropping the first duplicated element, but it also copes
# with zero or several duplicated timestamps.
remove_idx = band1_df_mod.index[band1_df_mod.timestamp.duplicated(keep='last')]
band1_df_mod = band1_df_mod.drop(index=remove_idx)
band1_df = band1_df_mod.set_index('timestamp')

In [38]:
# Display the band-1 table after the duplicated-timestamp row was removed.
band1_df

Unnamed: 0_level_0,path_band1
timestamp,Unnamed: 1_level_1
2018-01-01 20:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 20:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 21:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 21:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-01-01 22:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
...,...
2018-12-29 23:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-12-30 00:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-12-30 00:30:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...
2018-12-30 01:00:00,/n/mickley/lab/HMS_vision/original/band1/RadF-...


In [39]:
# Move the timestamp back into a column, flag band-3 rows whose timestamp
# repeats an earlier row, and print how many such rows there are.
band3_df_mod = band3_df.reset_index()
duplicated_index = band3_df_mod['timestamp'].duplicated()
n_band3_duplicates = duplicated_index.sum()
print(n_band3_duplicates)

0


### HMS

In [40]:
# Move the timestamp back into a column, flag HMS rows whose timestamp repeats
# an earlier row, and print how many such rows there are.
hms_df_mod = hms_df.reset_index()
duplicated_index = hms_df_mod['timestamp'].duplicated()
n_hms_duplicates = duplicated_index.sum()
print(n_hms_duplicates)

0


### Save csv

In [41]:
# Write each file table to its own CSV under data_csv/.
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
makedirs('data_csv', exist_ok=True)
band1_df.to_csv('data_csv/band1_all.csv')
band3_df.to_csv('data_csv/band3_all.csv')
hms_df.to_csv('data_csv/hms_all.csv')
daynight_df.to_csv('data_csv/daynight_all.csv')