# CSIS lateral photo and cover data prep

In [1]:
import glob
import os
import pandas as pd

# Root folder that is searched recursively for lateral photos and area-analysis workbooks
lat_path = "/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/Laterals/"
# Output folder for the photo package: duplicate/missing listings and the photo archive directory CSV
dest_path1 = "/media/greg/jrn-DataProducts/JORNADA_IM/WIP_packages/210413006_CSIS_lateral_photos/"
# Output folder for the area package: the assembled lateral area data CSV
dest_path2 = "/media/greg/jrn-DataProducts/JORNADA_IM/WIP_packages/210413007_CSIS_lateral_area/"

## Inventory the lateral photos

In [2]:
# Collect every jpeg under CSIS/Laterals: one recursive glob per extension
# spelling, appended to a single flat list in that order
jpeg_exts = ['JPG', 'jpg', 'JPEG', 'jpeg']
fns = []
for ext in jpeg_exts:
    pattern = os.path.join(lat_path, '**/*.{0}'.format(ext))
    fns.extend(glob.glob(pattern, recursive=True))
print('number of lateral photos in folder: ' + str(len(fns)))

number of lateral photos in folder: 6154


In [3]:
# Photo inventory: one row per file found, holding the full path, the bare
# filename, and a constant photo-type tag
fnames = list(map(os.path.basename, fns))
latphotos_df = pd.DataFrame({'pathname': fns, 'fname': fnames})
latphotos_df['ptype'] = 'lat'

In [4]:
# Repeated filenames are a problem: an area estimate only records the bare
# filename, so a repeated name cannot be traced to a single source file.
# Second-and-later occurrences of each filename (first occurrence excluded)
repeat_mask = latphotos_df['fname'].duplicated()
ndups = latphotos_df[repeat_mask]
print('number of lateral duplicates: ' + str(len(ndups)))
# keep=False flags every member of a duplicated group, including the first
# (see https://stackoverflow.com/a/14657511)
all_members_mask = latphotos_df.duplicated('fname', keep=False)
dups = latphotos_df[all_members_mask]
dups.to_csv(os.path.join(dest_path1, 'lat_dups.csv'))

number of lateral duplicates: 168


## Assemble the litter area/accum files

These area-analysis spreadsheets appear to be SigmaScan exports (source still to be confirmed).

In [5]:
# Get the area files from SigmaScan, loop through, and load.
# The glob result is sorted so the concatenated row order (and therefore the
# exported CSV) is the same on every run instead of following directory order.
areafiles = sorted(glob.glob(
    os.path.join(lat_path, '_CSIS_lateral-photos_Area-analyses', '**/area-analysis*.xlsx'),
    recursive=True))
if not areafiles:
    # Fail with a clear message rather than pd.concat's "No objects to concatenate"
    raise FileNotFoundError('no area-analysis*.xlsx files found under ' + lat_path)

# Cell contents that should be read as missing values
area_na_values = ['.', ' ', '', '. ', '.  ', '<there is no #9 microplot>', '<there is no #10 microplot>']
# Names applied to spreadsheet columns A:H; the drop* columns are discarded at export
area_colnames = ['image_num_orig', 'drop1', 'image_fname', 'area_cm2', 'cal_dist_cm',
                 'drop2', 'drop3', 'comment']

frames = []
for f in areafiles:
    # First sheet; 4 rows are skipped before the header row
    df = pd.read_excel(f, sheet_name=0, skiprows=4, header=0, usecols='A:H',
                       dtype={'Comment': str}, na_values=area_na_values)
    # Rename columns (every frame gets identical names, so no later cleanup is needed)
    df.columns = area_colnames
    frames.append(df)

# Concatenate dfs into one
latarea_df = pd.concat(frames, axis=0, ignore_index=True)
# Strip stray whitespace from the text columns
latarea_df['image_fname'] = latarea_df['image_fname'].str.strip()
latarea_df['comment'] = latarea_df['comment'].str.strip()
# In 2017 the block 7 photos have a weird 'M: ' prepended - remove
# (regex=False: literal match, independent of the pandas version's default)
latarea_df['image_fname'] = latarea_df['image_fname'].str.replace('M: ', '', regex=False)

# Drop rows with errors (~4)
# Basically if filename reads as NA (see read_excel params above) we can drop
print(len(latarea_df))
# .copy() so later cells can add columns without SettingWithCopyWarning
latarea_df = latarea_df[~latarea_df.image_fname.isna()].copy()
print(len(latarea_df))


6000
5996


In [6]:
# Split the strings in "image_fname" by - and _
# e.g. '15D-7N_20130301_IMG_1123764.JPG' -> '15D', '7N', '20130301', ...
splits1 = latarea_df['image_fname'].str.split('[-_]', expand=True)
plot_token = splits1[0]   # block number + plot letter, e.g. '15D'
micro_token = splits1[1]  # microplot number + facing direction, e.g. '7N'

# Extract the block, plot, and microplots.
# Raw strings avoid the invalid '\d' escape; expand=False returns a Series
# (one capture group) instead of a one-column DataFrame.
latarea_df['block'] = plot_token.str.extract(r'(\d+)', expand=False)
latarea_df['plot'] = plot_token.str.extract(r'([a-zA-Z]+)', expand=False)
latarea_df['microplot'] = micro_token.str.extract(r'(\d+)', expand=False)

# There are two photo naming formats with dates (YYYY-MM-DD and YYYYMMDD).
# Extract either form, strip the dashes to standardize, convert to datetimes;
# filenames without a recognizable date become NaT.
date_token = latarea_df['image_fname'].str.extract(
    r'(20\d{2}-\d{2}-\d{2}|20\d{6})', expand=False)
date_token = date_token.str.replace('-', '', regex=False)
latarea_df['image_date'] = pd.to_datetime(date_token, format='%Y%m%d')

# Type/direction
latarea_df['photo_type'] = 'lat'
latarea_df['dir_facing'] = micro_token.str.extract(r'([a-zA-Z]+)', expand=False)

In [7]:
# Preview the first rows to spot-check the fields parsed from image_fname
latarea_df.head()

Unnamed: 0,image_num_orig,drop1,image_fname,area_cm2,cal_dist_cm,drop2,drop3,comment,block,plot,microplot,image_date,photo_type,dir_facing
0,70,0.548837,15D-7N_20130301_IMG_1123764.JPG,1059.447926,50.017536,,2018-11-16 08:52:00,,15,D,7,2013-03-01,lat,N
1,71,0.333317,15D-7S_20130301_IMG_1123766.JPG,1043.894584,50.037418,,2018-11-16 08:59:00,,15,D,7,2013-03-01,lat,S
2,52,0.223188,15D-2W_20130301_IMG_1123747.JPG,1051.042591,49.960578,,2018-11-16 09:06:00,,15,D,2,2013-03-01,lat,W
3,14,0.258966,15B-3N_20130301_IMG_1123683.JPG,1167.01481,49.98131,,2018-11-16 09:13:00,,15,B,3,2013-03-01,lat,N
4,36,0.341442,15B-8W_20130301_IMG_1123706.JPG,1173.284807,50.039988,,2018-11-16 09:20:00,,15,B,8,2013-03-01,lat,W


In [8]:
# Reorder columns and drop the unused drop1/drop2/drop3 spreadsheet columns.
# Columns are selected by name rather than by position so this cell gives the
# same result when re-run, even after later cells add columns to latarea_df.
export_cols = ['image_num_orig', 'image_fname', 'block', 'plot', 'microplot',
               'image_date', 'photo_type', 'dir_facing', 'area_cm2', 'cal_dist_cm',
               'comment']
# .copy() so later cells can add columns without SettingWithCopyWarning
latarea_df = latarea_df[export_cols].copy()
print(latarea_df.columns)
# Export to csv (missing values written as 'NA')
latarea_df.to_csv(os.path.join(dest_path2, 'jrn413007_lat_area_data.csv'), index=False, na_rep='NA')
latarea_df.shape


Index(['image_num_orig', 'image_fname', 'block', 'plot', 'microplot',
       'image_date', 'photo_type', 'dir_facing', 'area_cm2', 'cal_dist_cm',
       'comment'],
      dtype='object')


(5996, 11)

## Compare photos analyzed with inventory

In [9]:
# Size of the photo inventory (one row per photo file found), for comparison with the area data
latphotos_df.shape

(6154, 3)

In [10]:
# Area rows whose image filename appears verbatim in the photo inventory
found_mask = latarea_df['image_fname'].isin(latphotos_df['fname'])
print(found_mask.sum())
# Currently there seem to be quite a few photos missing
latarea_df.shape[0] - found_mask.sum()

3500


2496

In [11]:
# List the latarea images (~6000 rows) whose filename is not present in the
# lateral photo inventory (~6154 files) and save them for follow-up.
# NOTE(review): this is an exact, case-sensitive filename comparison, so a
# '.JPG' vs '.jpg' extension difference counts as missing - confirm intended.
report_cols = ['image_num_orig', 'image_fname', 'block', 'plot', 'microplot',
               'image_date', 'photo_type', 'dir_facing']
absent_mask = ~latarea_df['image_fname'].isin(latphotos_df['fname'])
missing = latarea_df.loc[absent_mask, report_cols]

missing.to_csv(os.path.join(dest_path1, 'missing_latarea_photos.csv'))

## Zip up the photos

In [12]:
# Insert a year column (the zip step below groups photos by this value)
# NOTE(review): the years print as floats (e.g. 2013.0) in the loop output,
# which suggests some image_date values are missing (NaT) - confirm
latarea_df['imageyear'] = latarea_df['image_date'].dt.year

In [13]:
import zipfile

# Set to True to actually write the per-year zip archives. With False (the
# state this notebook was last run in) only the directory CSVs are produced.
WRITE_ZIPS = False


def _lower_jpg(names):
    """Return a string Series with the literal '.JPG' spelling changed to '.jpg'.

    Area files and the photo archive disagree on extension case, so both sides
    are normalized before comparing. regex=False keeps the '.' literal
    regardless of the pandas version's default.
    """
    return names.str.replace('.JPG', '.jpg', regex=False)


# Create a directory file to populate ('None' marks photos not yet located)
photo_archive_dir = latarea_df[['image_fname', 'block', 'plot', 'microplot',
                                'image_date', 'photo_type', 'dir_facing']].copy()
photo_archive_dir['archive_fname'] = 'None'
photo_archive_dir['archive_relpath'] = 'None'
dup_frames = []  # per-year frames of archive photos matched more than once

# Normalized filename keys, computed once instead of once per photo path
archive_keys = _lower_jpg(photo_archive_dir['image_fname'])
inventory_keys = _lower_jpg(latphotos_df['fname'])

# Subset by year, (optionally) zip up files, fill directory, and write to JORNADA_IM directory
for y in latarea_df.imageyear.dropna().unique():
    print(y)
    zip_name = 'lat_photos_{0}.zip'.format(str(int(y)))
    # Subset area dataset by year and get paths from photo inventory
    subset = latarea_df.loc[latarea_df.imageyear == y, 'image_fname']
    # file extension case insensitive search (lots of discrepancies here)
    lc_jpg = inventory_keys.isin(_lower_jpg(subset))
    paths = latphotos_df.loc[lc_jpg, 'pathname']
    print('Area data lists {0} photos, {1} photo paths found in archive'.format(
        len(subset), len(paths)))

    # Record archive photos that match the same area filename more than once.
    # Uses the same normalized keys as the path lookup, and always runs, so
    # duplicates are not hidden when missing photos offset the counts.
    matched = latphotos_df.loc[lc_jpg, :]
    year_dups = matched[inventory_keys[lc_jpg].duplicated(keep=False)]
    if not year_dups.empty:
        dup_frames.append(year_dups)

    # Get the parent directory of all the paths in paths (common prefix, then parent dir of that)
    parentdir = os.path.dirname(os.path.commonprefix(paths.to_list()))
    # Normalized filename -> path relative to parentdir; when a filename occurs
    # more than once the last path wins
    relpaths = {os.path.basename(p).replace('.JPG', '.jpg'): os.path.relpath(p, start=parentdir)
                for p in paths}

    # Fill the directory rows for every photo located this year
    located = archive_keys.isin(list(relpaths))
    photo_archive_dir.loc[located, 'archive_fname'] = zip_name
    photo_archive_dir.loc[located, 'archive_relpath'] = archive_keys[located].map(relpaths).values

    if WRITE_ZIPS:
        zfile_path = os.path.join(dest_path1, zip_name)
        # Context manager guarantees the archive is closed even if a write fails
        with zipfile.ZipFile(zfile_path, "w") as zfile:
            for p in paths:
                # Write each file to zfile using a relative path starting at parentdir
                zfile.write(p, os.path.relpath(p, start=parentdir),
                            compress_type=zipfile.ZIP_DEFLATED)

# Build the duplicates table once rather than growing it inside the loop
lat_dups_used = pd.concat(dup_frames) if dup_frames else pd.DataFrame()

print(photo_archive_dir.tail())
photo_archive_dir.to_csv(os.path.join(dest_path1, 'jrn413006_photo_archive_dir.csv'), index=False, na_rep='NA')
lat_dups_used.to_csv(os.path.join(dest_path1, 'lat_dups_used.csv'), na_rep='NA')

2013.0
Area data lists 1196 photos, 1280 photo paths found in archive
2014.0
Area data lists 1200 photos, 1196 photo paths found in archive
2015.0
Area data lists 1196 photos, 1196 photo paths found in archive
2016.0
Area data lists 1196 photos, 1196 photo paths found in archive
2017.0
Area data lists 1200 photos, 1284 photo paths found in archive
                            image_fname block plot microplot image_date  \
5995   1C-1N_T_20170725_IMG_1106842.JPG     1    C         1 2017-07-25   
5996  1A-8W_TH_20170725_IMG_1106797.JPG     1    A         8 2017-07-25   
5997  1A-5N_TH_20170725_IMG_1106781.JPG     1    A         5 2017-07-25   
5998  1A-9W_TH_20170725_IMG_1106801.JPG     1    A         9 2017-07-25   
5999   1C-7E_T_20170725_IMG_1106867.JPG     1    C         7 2017-07-25   

     photo_type dir_facing        archive_fname  \
5995        lat          N  lat_photos_2017.zip   
5996        lat          W  lat_photos_2017.zip   
5997        lat          N  lat_photos_2017.zi