In [1]:
import pandas as pd
import xarray as xr
import os

# QA Check

In [4]:
# CRYSTAL_FACE_NASA
# QA: count how many PNG files have a row in the merged environmental table,
# and how many of those also appear in the WRF file list.

# inputs: merged environmental CSV, PNG directory, WRF-period file list
env_file = '/home/vanessa/hulk/cocpit/final_databases/vgg16/v1.4.0/merged_env/CRYSTAL_FACE_NASA.csv'
png_dir = '/home/vanessa/hulk/cocpit/cpi_data/campaigns/CRYSTAL_FACE_NASA/single_imgs_v1.4.0'
wrf_filelist = '/home/jko/ssl-cpi-analysis/CRYSTAL_FACE_NASA_filelist.txt'
df_env_nasa = pd.read_csv(env_file)

# List the regular files in png_dir. os.scandir yields the file type together
# with each directory entry, so no separate os.path.isfile() lookup per name
# is needed (which matters for directories holding many thousands of images).
with os.scandir(png_dir) as entries:
    filenames_nasa = [entry.name for entry in entries if entry.is_file()]

# environmental rows whose filename exists in png_dir
in_png_dir_nasa = df_env_nasa['filename'].isin(filenames_nasa)
matches_nasa = df_env_nasa.loc[in_png_dir_nasa, 'filename'].tolist()

# narrow down to those matching WRF simulation day (2002-07-28);
# the file list is read one name per line, skipping blank lines
with open(wrf_filelist, 'r') as f:
    filenames_wrf = [line.strip() for line in f if line.strip()]
matches_wrf_nasa = list(set(matches_nasa) & set(filenames_wrf))

# Summary of files
print('File summary: CRYSTAL_FACE_NASA campaign')
print(f'# files in png dir: {len(filenames_nasa)}')
print(f'# files in environmental dataframe: {len(df_env_nasa)}')
print(f'# files that match between the two: {len(matches_nasa)}')
print(f'# matching files within WRF period: {len(matches_wrf_nasa)}')

File summary: CRYSTAL_FACE_NASA campaign
# files in png dir: 78152
# files in environmental dataframe: 61858
# files that match between the two: 61858
# matching files within WRF period: 13886


In [5]:
# CRYSTAL_FACE_UND
# QA: count how many PNG files have a row in the merged environmental table,
# and how many of those also appear in the WRF file list.

# inputs: merged environmental CSV, PNG directory, WRF-period file list
env_file = '/home/vanessa/hulk/cocpit/final_databases/vgg16/v1.4.0/merged_env/CRYSTAL_FACE_UND.csv'
png_dir = '/home/vanessa/hulk/cocpit/cpi_data/campaigns/CRYSTAL_FACE_UND/single_imgs_v1.4.0'
wrf_filelist = '/home/jko/ssl-cpi-analysis/CRYSTAL_FACE_UND_filelist.txt'
df_env_und = pd.read_csv(env_file)

# List the regular files in png_dir. os.scandir yields the file type together
# with each directory entry, so no separate os.path.isfile() lookup per name
# is needed (which matters for directories holding many thousands of images).
with os.scandir(png_dir) as entries:
    filenames_und = [entry.name for entry in entries if entry.is_file()]

# environmental rows whose filename exists in png_dir
in_png_dir_und = df_env_und['filename'].isin(filenames_und)
matches_und = df_env_und.loc[in_png_dir_und, 'filename'].tolist()

# narrow down to those matching WRF simulation day (2002-07-28);
# the file list is read one name per line, skipping blank lines
with open(wrf_filelist, 'r') as f:
    filenames_wrf = [line.strip() for line in f if line.strip()]
matches_wrf_und = list(set(matches_und) & set(filenames_wrf))

# Summary of files
print('File summary: CRYSTAL_FACE_UND campaign')
print(f'# files in png dir: {len(filenames_und)}')
print(f'# files in environmental dataframe: {len(df_env_und)}')
print(f'# files that match between the two: {len(matches_und)}')
print(f'# matching files within WRF period: {len(matches_wrf_und)}')

File summary: CRYSTAL_FACE_UND campaign
# files in png dir: 1617826
# files in environmental dataframe: 396138
# files that match between the two: 396138
# matching files within WRF period: 10907


# Check Habit Distributions

In [2]:
# Combine filelist and environmental data from both campaigns.
# For each campaign, keep only the environmental rows whose filename exists
# in that campaign's PNG directory, then stack the per-campaign frames.
campaign_list = ['CRYSTAL_FACE_NASA', 'CRYSTAL_FACE_UND']
df_list = []
for campaign in campaign_list:
    env_file = f'/glade/derecho/scratch/joko/cpi/env/{campaign}_env.csv'
    png_dir = f'/glade/derecho/scratch/joko/cpi/{campaign}'
    df_env = pd.read_csv(env_file)

    # List the regular files in png_dir; os.scandir provides the file type
    # with each entry, avoiding a separate os.path.isfile() call per name.
    with os.scandir(png_dir) as entries:
        filenames = [entry.name for entry in entries if entry.is_file()]

    # Membership mask computed once and reused for both the count and the
    # row selection below.
    in_png_dir = df_env['filename'].isin(filenames)
    matches = df_env.loc[in_png_dir, 'filename'].tolist()

    # print summary
    print(f'File summary: {campaign} campaign')
    print(f'# files in png dir: {len(filenames)}')
    print(f'# files in environmental dataframe: {len(df_env)}')
    print(f'# files that match between the two: {len(matches)}')
    if not matches:
        # Make an empty campaign visible instead of silently dropping it:
        # df_combined would otherwise look like it covers every campaign.
        print(f'WARNING: no environmental rows match a PNG for {campaign}; '
              'this campaign contributes no rows to df_combined')

    # add to df list
    df_list.append(df_env[in_png_dir])

# Combine all dataframes into one
df_combined = pd.concat(df_list, ignore_index=True)

File summary: CRYSTAL_FACE_NASA campaign
# files in png dir: 17838
# files in environmental dataframe: 61858
# files that match between the two: 0
File summary: CRYSTAL_FACE_UND campaign
# files in png dir: 21884
# files in environmental dataframe: 396138
# files that match between the two: 7572


In [3]:
# Sanity check on the combined table: print its (rows, columns) shape, then
# leave the first rows as the cell's displayed value.
combined_shape = df_combined.shape
print(combined_shape)
df_combined.head()

(7572, 8)


Unnamed: 0,filename,date,latitude,longitude,altitude,pressure,temp,iwc
0,2002_0728_195434_835_3.png,2002-07-28 19:54:34,25.233821,-81.374428,8276.779297,364.442395,-22.947568,0.473688
1,2002_0728_195436_99_31.png,2002-07-28 19:54:36,25.235585,-81.375548,8283.89668,364.079346,-22.978956,0.652124
2,2002_0728_195436_99_33.png,2002-07-28 19:54:36,25.235585,-81.375548,8283.89668,364.079346,-22.978956,0.652124
3,2002_0728_195436_287_4.png,2002-07-28 19:54:36,25.235585,-81.375548,8283.89668,364.079346,-22.978956,0.652124
4,2002_0728_195436_99_36.png,2002-07-28 19:54:36,25.235585,-81.375548,8283.89668,364.079346,-22.978956,0.652124
