In [1]:
import pandas as pd
import xarray as xr
import os

# QA Check

In [4]:
# CRYSTAL_FACE_NASA
# QA: count how many image files in png_dir have a row in the merged
# environmental table, and how many of those appear in the WRF file list.

env_file = '/home/vanessa/hulk/cocpit/final_databases/vgg16/v1.4.0/merged_env/CRYSTAL_FACE_NASA.csv'
png_dir = '/home/vanessa/hulk/cocpit/cpi_data/campaigns/CRYSTAL_FACE_NASA/single_imgs_v1.4.0'
wrf_filelist = '/home/jko/ssl-cpi-analysis/CRYSTAL_FACE_NASA_filelist.txt'
df_env_nasa = pd.read_csv(env_file)

# Names of all regular files directly inside png_dir.
# os.scandir replaces os.listdir + os.path.isfile: DirEntry.is_file() can
# typically answer from what the directory scan already returned, instead of
# issuing a separate stat() call for every entry.
with os.scandir(png_dir) as entries:
    filenames_nasa = [entry.name for entry in entries if entry.is_file()]

# Filenames from the environmental table that also exist in png_dir
# (kept in table order, duplicates preserved).
on_disk_nasa = df_env_nasa['filename'].isin(filenames_nasa)
matches_nasa = df_env_nasa.loc[on_disk_nasa, 'filename'].tolist()

# narrow down to those matching WRF simulation day (2002-07-28)
# The file list is read one name per line; blank lines are skipped.
with open(wrf_filelist, 'r') as f:
    filenames_wrf = [line.strip() for line in f if line.strip()]
# Set intersection: the result is de-duplicated and its order is arbitrary.
matches_wrf_nasa = list(set(matches_nasa) & set(filenames_wrf))

# Summary of files
print('File summary: CRYSTAL_FACE_NASA campaign')
print(f'# files in png dir: {len(filenames_nasa)}')
print(f'# files in environmental dataframe: {len(df_env_nasa)}')
print(f'# files that match between the two: {len(matches_nasa)}')
print(f'# matching files within WRF period: {len(matches_wrf_nasa)}')

File summary: CRYSTAL_FACE_NASA campaign
# files in png dir: 78152
# files in environmental dataframe: 61858
# files that match between the two: 61858
# matching files within WRF period: 13886


In [5]:
# CRYSTAL_FACE_UND
# QA: count how many image files in png_dir have a row in the merged
# environmental table, and how many of those appear in the WRF file list.

env_file = '/home/vanessa/hulk/cocpit/final_databases/vgg16/v1.4.0/merged_env/CRYSTAL_FACE_UND.csv'
png_dir = '/home/vanessa/hulk/cocpit/cpi_data/campaigns/CRYSTAL_FACE_UND/single_imgs_v1.4.0'
wrf_filelist = '/home/jko/ssl-cpi-analysis/CRYSTAL_FACE_UND_filelist.txt'
df_env_und = pd.read_csv(env_file)

# Names of all regular files directly inside png_dir.
# os.scandir replaces os.listdir + os.path.isfile: DirEntry.is_file() can
# typically answer from what the directory scan already returned, instead of
# issuing a separate stat() call for every entry.
with os.scandir(png_dir) as entries:
    filenames_und = [entry.name for entry in entries if entry.is_file()]

# Filenames from the environmental table that also exist in png_dir
# (kept in table order, duplicates preserved).
on_disk_und = df_env_und['filename'].isin(filenames_und)
matches_und = df_env_und.loc[on_disk_und, 'filename'].tolist()

# narrow down to those matching WRF simulation day (2002-07-28)
# The file list is read one name per line; blank lines are skipped.
with open(wrf_filelist, 'r') as f:
    filenames_wrf = [line.strip() for line in f if line.strip()]
# Set intersection: the result is de-duplicated and its order is arbitrary.
matches_wrf_und = list(set(matches_und) & set(filenames_wrf))

# Summary of files
print('File summary: CRYSTAL_FACE_UND campaign')
print(f'# files in png dir: {len(filenames_und)}')
print(f'# files in environmental dataframe: {len(df_env_und)}')
print(f'# files that match between the two: {len(matches_und)}')
print(f'# matching files within WRF period: {len(matches_wrf_und)}')

File summary: CRYSTAL_FACE_UND campaign
# files in png dir: 1617826
# files in environmental dataframe: 396138
# files that match between the two: 396138
# matching files within WRF period: 10907


# Check Habit Distributions

In [None]:
# Paths for the CRYSTAL_FACE_NASA campaign: image directory and merged
# environmental CSV.
# NOTE(review): the campaign loop in the following cell reassigns both names
# on every iteration, so this cell looks like a leftover -- confirm before
# relying on these values.
png_dir = '/home/vanessa/hulk/cocpit/cpi_data/campaigns/CRYSTAL_FACE_NASA/single_imgs_v1.4.0'
env_file = '/home/vanessa/hulk/cocpit/final_databases/vgg16/v1.4.0/merged_env/CRYSTAL_FACE_NASA.csv'

In [4]:
# Combine filelist and environmental data from both campaigns.
# For each campaign: load its merged environmental CSV, keep only the rows
# whose 'filename' exists as a file in the campaign's image directory, and
# stack the per-campaign results into df_combined.
campaign_list = ['CRYSTAL_FACE_NASA', 'CRYSTAL_FACE_UND']
df_list = []
for campaign in campaign_list:
    env_file = f'/home/vanessa/hulk/cocpit/final_databases/vgg16/v1.4.0/merged_env/{campaign}.csv'
    png_dir = f'/home/vanessa/hulk/cocpit/cpi_data/campaigns/{campaign}/single_imgs_v1.4.0'
    df_env = pd.read_csv(env_file)

    # Names of all regular files directly inside png_dir.
    # os.scandir replaces os.listdir + os.path.isfile: DirEntry.is_file() can
    # typically answer from what the directory scan already returned, instead
    # of issuing a separate stat() call for every entry.
    with os.scandir(png_dir) as entries:
        filenames = [entry.name for entry in entries if entry.is_file()]

    # Boolean mask of table rows whose image exists on disk. It is computed
    # once and reused for both the filename list and the row selection.
    on_disk = df_env['filename'].isin(filenames)
    matches = df_env.loc[on_disk, 'filename'].tolist()

    # print summary
    print(f'File summary: {campaign} campaign')
    print(f'# files in png dir: {len(filenames)}')
    print(f'# files in environmental dataframe: {len(df_env)}')
    print(f'# files that match between the two: {len(matches)}')

    # add to df list (same rows as filtering on isin(matches), without
    # running a second membership test)
    df_list.append(df_env[on_disk])

# Combine all dataframes into one, with a fresh 0..n-1 index.
df_combined = pd.concat(df_list, ignore_index=True)

File summary: CRYSTAL_FACE_NASA campaign
# files in png dir: 78152
# files in environmental dataframe: 61858
# files that match between the two: 61858
File summary: CRYSTAL_FACE_UND campaign
# files in png dir: 1617826
# files in environmental dataframe: 396138
# files that match between the two: 396138


In [5]:
# Report the (rows, columns) size of the combined table, then display its
# first five rows as the cell output.
n_rows, n_cols = df_combined.shape
print((n_rows, n_cols))
df_combined.head(n=5)

(457996, 46)


Unnamed: 0.1,Unnamed: 0,filename,date,Frame Width [pixels],Frame Height [pixels],Particle Width [micrometers],Particle Height [micrometers],Cutoff [%],Aggregate [%],Budding [%],...,Latitude [degrees],Longitude [degrees],Altitude [m],Pressure [hPa],Temperature [C],Ice Water Content [g/m3],PSD IWC [g/m3],concentration ratio,area ratio,mass ratio
0,0,2002_0709_153418_95_10.png,2002-07-09 15:34:18,58.0,74.0,108.134,141.827,7.2,0.0,0.0,...,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,0.0,0.0
1,1,2002_0709_153418_95_12.png,2002-07-09 15:34:18,46.0,50.0,70.603,89.162,6.77,0.0,0.0,...,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,0.0,0.0
2,2,2002_0709_153418_95_18.png,2002-07-09 15:34:18,162.0,224.0,422.582,335.481,1.94,69.511,0.0,...,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,0.0,0.0
3,3,2002_0709_153423_137_39.png,2002-07-09 15:34:23,192.0,138.0,279.681,417.883,3.79,69.227,0.0,...,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,0.0,0.0
4,4,2002_0709_153423_137_5.png,2002-07-09 15:34:23,122.0,130.0,247.511,233.688,0.0,0.0,0.0,...,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,-999.99,0.0,0.0,0.0


In [6]:
# using the combined df, get the counts of each unique value of Classification
classification_counts = df_combined['Classification'].value_counts()
print(classification_counts)

Classification
compact_irreg         300731
agg                    78077
planar_polycrystal     34240
rimed                  29847
column                 11314
budding                 3099
bullet                   688
Name: count, dtype: int64
