# CSIS Overhead photo and cover data prep

In [1]:
import glob
import os
import pandas as pd

# Source and destination directories used throughout the notebook.
# The defaults are the absolute paths this notebook was written against;
# set the matching environment variable to run on another machine
# without editing the notebook.
# Root folder that is searched recursively for overhead photos and that
# holds the cover csv files in its "all" subfolder
oh_path = os.environ.get(
    "CSIS_OH_PATH",
    "/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/Overheads/")
# Destination for the photo inventory, duplicate/missing reports and archive directory
dest_path1 = os.environ.get(
    "CSIS_DEST_PHOTOS",
    "/media/greg/jrn-DataProducts/JORNADA_IM/WIP_packages/210413004_CSIS_overhead_photos/")
# Destination for the reshaped cover data csv
dest_path2 = os.environ.get(
    "CSIS_DEST_COVER",
    "/media/greg/jrn-DataProducts/JORNADA_IM/WIP_packages/210413005_CSIS_overhead_cover/")

## Inventory the overhead photos

In [2]:
# Recursively collect every JPEG under CSIS/Overheads. Each spelling of
# the extension is globbed in turn and the matches are flattened into a
# single list of paths.
jpeg_exts = ['JPG', 'jpg', 'JPEG', 'jpeg']
fns = [
    path
    for ext in jpeg_exts
    for path in glob.glob(os.path.join(oh_path, '**/*.{0}'.format(ext)), recursive=True)
]
print('number of overhead photos in folder: ' + str(len(fns)))

number of overhead photos in folder: 13101


In [3]:
# Build the inventory of all overhead photos: one row per file with its
# full path, its bare filename, and a constant photo-type tag
fnames = list(map(os.path.basename, fns))
ohphotos_df = pd.DataFrame({'pathname': fns, 'fname': fnames, 'ptype': 'oh'})

In [4]:
# Repeated photo filenames are a potential problem: later cells look up
# the source file of a cover estimate by filename alone
repeat_mask = ohphotos_df['fname'].duplicated()
ndups = ohphotos_df[repeat_mask]
print('number of overhead duplicates: ' + str(len(ndups)))
# Keep every member of each duplicated group, not just the repeats,
# and save them for review (see https://stackoverflow.com/a/14657511)
dups = ohphotos_df[ohphotos_df.duplicated('fname', keep=False)]
dups.to_csv(os.path.join(dest_path1, 'oh_dups.csv'))

number of overhead duplicates: 663


## Assemble the cover files

From SamplePoint? (TODO: confirm the source of these cover files.)

In [5]:
# Load every SamplePoint cover csv in Overheads/all into a single table
# Note that data.dat is a weird ragged file
# glob returns matches in arbitrary (filesystem) order, so sort them to keep
# the row order of the combined table, and of everything exported from it,
# reproducible between runs
coverfiles = sorted(glob.glob(os.path.join(oh_path, 'all', '*.csv')))
if not coverfiles:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError('no cover csv files found in ' + os.path.join(oh_path, 'all'))

frames = []

for f in coverfiles:
    # Read csv (note these csvs have extra spaces); ERROR, '.', and blank
    # cells are read as missing values
    df = pd.read_csv(f, na_values=['ERROR','.', ' ', ''], skipinitialspace=True)
    # Normalize the header: lowercase and strip stray whitespace
    df.columns = df.columns.str.lower().str.strip()

    frames.append(df)

# Concatenate dfs into one, renumbering the rows
ohcover_df = pd.concat(frames, axis=0, ignore_index=True)

# Rename the key/image columns and strip whitespace from the text columns
ohcover_df = ohcover_df.rename(columns={'key':'image_num_orig', 'image':'image_fname'})
ohcover_df['image_fname'] = ohcover_df['image_fname'].str.strip()
ohcover_df['comment'] = ohcover_df['comment'].str.strip()
print(ohcover_df.shape)


(5982, 77)


In [6]:
# Drop unusable rows and empty columns, then retire discontinued cover classes
# Drop rows whose image filename contains ZERO (ERROR values are already
# converted to NaN on read). na=False keeps the mask strictly boolean, so a
# missing filename cannot break the ~ inversion.
ohcover_df = ohcover_df[~ohcover_df.image_fname.str.contains('ZERO', na=False)]
# Trailing commas in the csvs produce empty "unnamed: N" columns; drop
# whichever ones are present instead of hard-coding N
unnamed_cols = [c for c in ohcover_df.columns if c.startswith('unnamed:')]
ohcover_df = ohcover_df.drop(columns=unnamed_cols)
print(ohcover_df.shape)

# Extract the percent columns and the count columns (should be 35 of each)
# Columns 0-4 are the key, filename, comment, gridsize and actual columns
count_cols = [c for c in ohcover_df.columns[5:] if '%' not in c]
pct_cols = [c for c in ohcover_df.columns[5:] if '%' in c]
print(len(count_cols), len(pct_cols))

# We're going to drop some because they were discontinued by the techs
# and sum to zero (inspect with ohcover_df[count_cols].sum() if in doubt).
# One column (grsleaf) has one observation - lump it into the unkgrs column.
# Add to, rather than overwrite, whatever unkgrs already holds on that row;
# fill_value=0 treats a missing value on either side as zero.
has_grsleaf = ohcover_df['grsleaf'] > 0
for src, dst in (('grsleaf', 'unkgrs'), ('%grsleaf', '%unkgrs')):
    ohcover_df.loc[has_grsleaf, dst] = ohcover_df.loc[has_grsleaf, dst].add(
        ohcover_df.loc[has_grsleaf, src], fill_value=0)

# Now remove the empty columns
dropcols = ['%biocrus','%termite','%seedlin','%antmnd','%grsanl','%grsunk','%grsleaf','%grsbse',
    'biocrus','termite','seedlin','antmnd','grsanl','grsunk','grsleaf','grsbse']
ohcover_df = ohcover_df.drop(columns=dropcols)
# Also remove from our count and pct lists
count_cols = [c for c in count_cols if c not in dropcols]
pct_cols = [c for c in pct_cols if c not in dropcols]

print(ohcover_df.shape)

(5953, 75)
35 35
(5953, 59)


In [7]:
# Extract some categorical and date columns from filenames
# Split the strings in "image_fname" by - and _

splits1 = ohcover_df['image_fname'].str.split('[-_]', expand=True)
# Extract the block, plot, and microplots. The first token holds the block
# number and plot letters (e.g. "10A"); the second token starts with the
# microplot number. Raw strings keep \d from being read as a string escape;
# expand=False returns a Series instead of a one-column DataFrame.
ohcover_df['block'] = splits1[0].str.extract(r'(\d+)', expand=False)
ohcover_df['plot']  = splits1[0].str.extract(r'([a-zA-Z]+)', expand=False)
ohcover_df['microplot'] = splits1[1].str.extract(r'(\d+)', expand=False)
# There are two photo naming formats with dates (YYYY-MM-DD and YYYYMMDD)
# Extract either form, standardize to YYYYMMDD, convert to datetimes
ohcover_df['image_date'] = ohcover_df.image_fname.str.extract(
    r'(20\d{2}-\d{2}-\d{2}|20\d{6})', expand=False)
ohcover_df['image_date'] = ohcover_df.image_date.str.replace('-', '', regex=False)
ohcover_df['image_date'] = pd.to_datetime(ohcover_df.image_date, format='%Y%m%d')
ohcover_df['photo_type'] = 'oh'

# Fix a data-entry error: normalize a lowercase plot letter 'a' to 'A'
ohcover_df.loc[ohcover_df['plot']=='a','plot'] = 'A'

# Reorder columns: identifiers first, then the new filename-derived columns,
# then everything else in its existing order. Selecting by name (instead of
# by position from the end) gives the same result if this cell is re-run.
lead_cols = ['image_num_orig', 'image_fname',
             'block', 'plot', 'microplot', 'image_date', 'photo_type']
ohcover_df = ohcover_df[lead_cols + [c for c in ohcover_df.columns if c not in lead_cols]]
print(ohcover_df.columns)

Index(['image_num_orig', 'image_fname', 'block', 'plot', 'microplot',
       'image_date', 'photo_type', 'comment', 'gridsize', 'actual', 'shrub',
       '%shrub', 's-shrub', '%s-shrub', 'forb', '%forb', 'litter', '%litter',
       'soil', '%soil', 'conmod', '%conmod', 'rock', '%rock', 'mupo', '%mupo',
       'aris', '%aris', 'dapu', '%dapu', 'boer', '%boer', 'p-grass',
       '%p-grass', 'spor', '%spor', 'a-grass', '%a-grass', 'unkgrs', '%unkgrs',
       'shrubd', '%shrubd', 's-shrd', '%s-shrd', 'mupod', '%mupod', 'arisd',
       '%arisd', 'dapud', '%dapud', 'boerd', '%boerd', 'pgrasd', '%pgrasd',
       'spord', '%spord', 'forbd', '%forbd', 'agrasd', '%agrasd', 'ungrsd',
       '%ungrsd', 'outside', '%outside'],
      dtype='object')


In [8]:
# Reshape from wide (one count and one percent column per cover type)
# to long (one row per image and cover type)

print(ohcover_df.shape)
# The first ten columns identify the image and are carried onto every long row
id_cols = ohcover_df.columns[0:10]
# Melt counts and percents separately; the two results differ only in
# their value column
counts = pd.melt(ohcover_df, id_vars=id_cols, value_vars=count_cols,
                 var_name='cover_type', value_name='cover_count')
pcts = pd.melt(ohcover_df, id_vars=id_cols, value_vars=pct_cols,
               var_name='cover_type', value_name='cover_percent')
# Strip the % so percent cover types line up with the count cover types
pcts = pcts.assign(cover_type=pcts['cover_type'].str.replace('%', ''))
# Join percents onto counts using every column the two frames share
ohcover_rshp = pd.merge(counts, pcts, how='left')

# Re-append the comment column so it ends up last (easier to read in xcel)
ohcover_rshp['comment'] = ohcover_rshp.pop('comment')

print(ohcover_rshp.shape)
ohcover_rshp.head()

(5953, 64)
(160731, 13)


Unnamed: 0,image_num_orig,image_fname,block,plot,microplot,image_date,photo_type,gridsize,actual,cover_type,cover_count,cover_percent,comment
0,1,10A-10O_20130221_IMG_1123404r.jpg,10,A,10,2013-02-21,oh,100.0,100.0,shrub,0.0,0.0,
1,2,10A-1O_20130221_IMG_1123394r.jpg,10,A,1,2013-02-21,oh,100.0,100.0,shrub,0.0,0.0,
2,3,10A-2O_20130221_IMG_1123395r.jpg,10,A,2,2013-02-21,oh,100.0,100.0,shrub,0.0,0.0,
3,4,10A-3O_20130221_IMG_1123396r.jpg,10,A,3,2013-02-21,oh,100.0,100.0,shrub,0.0,0.0,
4,5,10A-4O_20130221_IMG_1123397r.jpg,10,A,4,2013-02-21,oh,100.0,100.0,shrub,0.0,0.0,


In [9]:
# Write the long-format cover table to the cover package folder,
# without the index and with missing values written as NA
cover_csv_path = os.path.join(dest_path2, 'jrn413005_oh_cover_data.csv')
ohcover_rshp.to_csv(cover_csv_path, index=False, na_rep='NA')


## Compare photos analyzed with inventory

In [10]:
# Size of the photo inventory (one row per photo file found on disk),
# for comparison with the number of analyzed images below
ohphotos_df.shape

(13101, 3)

In [11]:
# Flag each analyzed image whose filename appears in the photo inventory
in_inventory = ohcover_df['image_fname'].isin(ohphotos_df['fname'])
print(in_inventory.sum())
# Number of analyzed images with no matching photo file
len(ohcover_df) - in_inventory.sum()

5849


104

In [12]:
# List the ohcover images (~6000) whose filename has no match in the
# overhead photo inventory (of ~13000), keeping only identifying columns
ident_cols = ['image_num_orig', 'image_fname', 'block', 'plot',
              'microplot', 'image_date', 'photo_type']
not_in_inventory = ~ohcover_df['image_fname'].isin(ohphotos_df['fname'])
missing = ohcover_df.loc[not_in_inventory, ident_cols]

missing.to_csv(os.path.join(dest_path1, 'missing_ohcover_photos.csv'))

## Zip up the photos

In [13]:
# Add the calendar year of each image date as a new last column
# (used below to group photos into per-year archives)
ohcover_df = ohcover_df.assign(imageyear=ohcover_df['image_date'].dt.year)

In [14]:
import zipfile

# Set to True to actually write the per-year zip archives. With False
# (the default) only the archive directory table and the duplicate report
# are produced, and no zip file is created.
write_zips = False

# Create a directory table to populate: one row per analyzed photo, to be
# filled with the archive it belongs to and its path inside that archive.
# .copy() makes it independent of ohcover_df.
photo_archive_dir = ohcover_df[['image_fname', 'block', 'plot', 'microplot', 'image_date', 'photo_type']].copy()
photo_archive_dir['archive_fname'] = 'None'
photo_archive_dir['archive_relpath'] = 'None'
dup_frames = [] # Per-year duplicate listings, concatenated once after the loop

# Subset by year, (optionally) zip up files, fill directory, and write to JORNADA_IM directory
for y in ohcover_df.imageyear.dropna().unique():
    print(y)
    # Subset cover dataset by year and get matching rows from photo inventory
    subset = ohcover_df.loc[ohcover_df.imageyear==y, 'image_fname']
    matched = ohphotos_df.loc[ohphotos_df.fname.isin(subset), :]
    paths = matched['pathname']
    print('Cover data lists {0} photos, {1} photo paths found in archive'.format(
        len(subset), len(paths)))
    # Record filenames that resolve to more than one file. This is checked
    # directly rather than by comparing the two counts, because missing
    # photos can offset duplicates in the totals and hide them.
    year_dups = matched[matched.duplicated('fname', keep=False)]
    if not year_dups.empty:
        dup_frames.append(year_dups)
    # Get the parent directory of all the paths in paths (common prefix, then parent dir of that)
    parentdir = os.path.dirname(os.path.commonprefix(paths.to_list()))
    zfile_name = 'oh_photos_{0}.zip'.format(int(y))
    zfile_path = os.path.join(dest_path1, zfile_name)
    # Map filename -> path relative to parentdir. When a filename has several
    # source files the last one wins, as in a file-by-file assignment loop.
    relpaths = {os.path.basename(p): os.path.relpath(p, start=parentdir) for p in paths}
    # Fill the directory rows for every photo found this year in one pass
    found = photo_archive_dir['image_fname'].isin(list(relpaths))
    photo_archive_dir.loc[found, 'archive_fname'] = zfile_name
    photo_archive_dir.loc[found, 'archive_relpath'] = (
        photo_archive_dir.loc[found, 'image_fname'].map(relpaths))
    if write_zips:
        # The with-block closes the archive even if a write fails
        with zipfile.ZipFile(zfile_path, "w") as zfile:
            for p in paths:
                # Write each file to zfile using a relative path starting at parentdir
                zfile.write(p, os.path.relpath(p, start=parentdir),
                            compress_type=zipfile.ZIP_DEFLATED)

# Duplicated source files actually referenced by the cover data
oh_dups_used = pd.concat(dup_frames) if dup_frames else pd.DataFrame()

print(photo_archive_dir.tail())
photo_archive_dir.to_csv(os.path.join(dest_path1, 'jrn413004_photo_archive_dir.csv'), index=False, na_rep='NA')
oh_dups_used.to_csv(os.path.join(dest_path1, 'oh_dups_used.csv'), na_rep='NA')

2013.0
Cover data lists 597 photos, 597 photo paths found in archive


2014.0
Cover data lists 597 photos, 597 photo paths found in archive
2015.0
Cover data lists 589 photos, 589 photo paths found in archive
2016.0
Cover data lists 589 photos, 589 photo paths found in archive
2017.0
Cover data lists 592 photos, 592 photo paths found in archive
2018.0
Cover data lists 597 photos, 598 photo paths found in archive
2019.0
Cover data lists 599 photos, 602 photo paths found in archive
2020.0
Cover data lists 597 photos, 596 photo paths found in archive
2021.0
Cover data lists 597 photos, 499 photo paths found in archive
2022.0
Cover data lists 597 photos, 595 photo paths found in archive
                          image_fname block plot microplot image_date  \
5977  15D_5O_H_20220808_IMG_7899r.jpg    15    D         5 2022-08-08   
5978  15D_6O_H_20220808_IMG_7900r.jpg    15    D         6 2022-08-08   
5979  15D_7O_H_20220808_IMG_7901r.jpg    15    D         7 2022-08-08   
5980  15D_8O_H_20220808_IMG_7902r.jpg    15    D         8 2022-08-08   
5981  15D_9O_H