# Prepare CSIS Gap data for publication

In [19]:
import glob
import os
import pandas as pd
import re

# 2014-2018 data
gap_path1 = "/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/"
# 2019-present data
gap_path2 = "/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/"
dest_path1 = "/media/greg/jrn-DataProducts/JORNADA_IM/WIP_packages/210413002_CSIS_lpi_gap/"
#dest_path2 = "/media/greg/jrn-DataProducts/JORNADA_IM/WIP_packages/210413005_CSIS_overhead_cover/"

## Load the earlier DAT and CSV files

In [20]:
fns = [glob.glob(os.path.join(gap_path1,'CSIS_Gap*.{0}'.format(e)), recursive=False) for e in ['csv', 'dat', 'DAT']]
fns = sum(fns, []) # In case we end up with list of lists
print('number Gap files in folder: ' + str(len(fns)))
fns

number Gap files in folder: 9


['/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_14F.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_14S.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_15F.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_15S.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_16F.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_16S.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_17F.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_17S.dat',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_18S.DAT']

In [21]:
df_dat1 = pd.read_fwf(fns[0], skiprows=25)
df_dat2 = pd.read_fwf(fns[8], skiprows=25)
print(df_dat1.columns)
print(df_dat2.columns)

Index(['study', 'year', 's', 'date', 'bk', 'p', 't', 'begin', 'strt', 'end',
       'gap', 'q', 'comment', 'rIn', 'readerName', 'Unnamed: 15', 'dIn',
       'dataentryName', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20'],
      dtype='object')
Index(['study', 'year', 's', 'date', 'bk', 'p', 't', 'begin', 'strt', 'end',
       'gap', 'q', 'comment', 'rIn', 'readerName', 'Unnamed: 15', 'dIn',
       'dataentryName', 'Unnamed: 18', 'Unnamed: 19'],
      dtype='object')


In [22]:
colmap_1 = {'s':'season','bk':'block','p':'plot','t':'transect','strt':'gap_start',
            'end':'gap_end','gap':'gap_dist','q':'qflag'}

In [23]:
# Read in the text files
frames = []

for f in fns:
    print(f)
    # Get number of lines to skip, use a windoze encoding for these old files
    with open(f, 'r', encoding='cp1252') as fo:
        lines = [line.rstrip() for line in fo]
    start_match = pd.Series(lines).str.contains('^CSIS-GAP')
    skip_no = start_match[start_match].index.values.min()
    print(skip_no)
    # There are not reliable missing value indicators so we need to get col widths
    # from the header
    cols_raw = re.findall('\S+\s+', lines[skip_no-1] + ' ') # parse columns, add space to capture end col
    wid = [len(col) for col in cols_raw]
    # Also get column names (split on spaces)
    col_names = re.split('\s+', lines[skip_no-1])
    #print(wid)
    #print(col_names)

    # Now read past the header row and use the corrected header
    df = pd.read_fwf(f, skiprows=(skip_no), widths=wid, na_values=['.', ' ', ''], names=col_names,
                     index_col=False, encoding='cp1252')
    print(df.head())
    # Rename some columns
    df = df.rename(columns=colmap_1)
    print(df.shape)
    print(df.columns)

    frames.append(df)

# Concatenate dfs into one
df1 = pd.concat(frames, axis=0, ignore_index=True)
# Set dates
df1.date = pd.to_datetime(df1.date, format='%m/%d/%Y')

/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_data/CSIS_Gap_14F.dat
26
      study  year  s        date  bk  p  t  begin    strt    end    gap   q  \
0  CSIS-GAP  2014  F  08/20/2014   1  A  1   2400  2467 2  415.0   52.0 NaN   
1  CSIS-GAP  2014  F  08/20/2014   1  A  1   2400  2365 2  150.0  215.0 NaN   
2  CSIS-GAP  2014  F  08/20/2014   1  A  1   2400  2149 2   95.0   54.0 NaN   
3  CSIS-GAP  2014  F  08/20/2014   1  A  1   2400  1885 1  757.0  128.0 NaN   
4  CSIS-GAP  2014  F  08/20/2014   1  A  1   2400  1738 1  713.0   25.0 NaN   

  comment  rIn      readerName  dIn  dataentryName  
0     NaN  KJG  Gename, Kyle J  BMS  Stover, Blake  
1     NaN  KJG  Gename, Kyle J  BMS  Stover, Blake  
2     NaN  KJG  Gename, Kyle J  BMS  Stover, Blake  
3     NaN  KJG  Gename, Kyle J  BMS  Stover, Blake  
4     NaN  KJG  Gename, Kyle J  BMS  Stover, Blake  
(2093, 17)
Index(['study', 'year', 'season', 'date', 'block', 'plot', 'transect', 'begin',
       'gap_start', 'ga

In [24]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16148 entries, 0 to 16147
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   study          16148 non-null  object        
 1   year           16148 non-null  int64         
 2   season         16148 non-null  object        
 3   date           16148 non-null  datetime64[ns]
 4   block          16148 non-null  int64         
 5   plot           16148 non-null  object        
 6   transect       16148 non-null  int64         
 7   begin          16148 non-null  object        
 8   gap_start      16143 non-null  object        
 9   gap_end        16131 non-null  object        
 10  gap_dist       16124 non-null  float64       
 11  qflag          13 non-null     object        
 12  comment        32 non-null     object        
 13  rIn            16147 non-null  object        
 14  readerName     16147 non-null  object        
 15  dIn            1614

In [25]:
df1.head()

Unnamed: 0,study,year,season,date,block,plot,transect,begin,gap_start,gap_end,gap_dist,qflag,comment,rIn,readerName,dIn,dataentryName
0,CSIS-GAP,2014,F,2014-08-20,1,A,1,2400,2467 2,415.0,52.0,,,KJG,"Gename, Kyle J",BMS,"Stover, Blake"
1,CSIS-GAP,2014,F,2014-08-20,1,A,1,2400,2365 2,150.0,215.0,,,KJG,"Gename, Kyle J",BMS,"Stover, Blake"
2,CSIS-GAP,2014,F,2014-08-20,1,A,1,2400,2149 2,95.0,54.0,,,KJG,"Gename, Kyle J",BMS,"Stover, Blake"
3,CSIS-GAP,2014,F,2014-08-20,1,A,1,2400,1885 1,757.0,128.0,,,KJG,"Gename, Kyle J",BMS,"Stover, Blake"
4,CSIS-GAP,2014,F,2014-08-20,1,A,1,2400,1738 1,713.0,25.0,,,KJG,"Gename, Kyle J",BMS,"Stover, Blake"


## Read in recent Excel files

In [26]:
# first list the files
fns = glob.glob(os.path.join(gap_path2,'Data_CSIS_LPI-Gap*.xlsx'), recursive=False)
#fns = sum(fns, []) # In case we end up with list of lists
print('number Gap files in folder: ' + str(len(fns)))
# Drop the 2023 data for now
fns = [f for f in fns if '2023' not in f]
fns

number Gap files in folder: 9


['/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2019S.xlsx',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2019F.xlsx',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2020F.xlsx',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2020S.xlsx',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2021F.xlsx',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2021S.xlsx',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2022S.xlsx',
 '/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2022F.xlsx']

In [27]:
colmap_2 = {'start':'gap_start','end':'gap_end','interval':'gap_dist'}

In [28]:
# Read in the excel files
frames = []

for f in fns:
    print(f)
    # Find the sheet we need (they vary year to year)
    sheet = [s for s in pd.ExcelFile(f).sheet_names if 'pq' in s and 'Gap' in s][0]
    print(sheet)
    # Header rows are variable. Read first 20 lines and find header row (contains 
    # 'year') to set skiprows parameter
    df_head = pd.read_excel(f, sheet, nrows=20)
    header_loc = df_head[df_head == 'year'].dropna(axis=1, how='all').dropna(how='all')
    if header_loc.empty:
        skip = None # Don't skip anything if we get an empty df
    else:
        skip = header_loc.index.item() + 1
    # Now read the file
    df = pd.read_excel(f, sheet_name=sheet, skiprows=skip)
    # Rename columns and replace values
    df = df.rename(columns=colmap_2)
    df.season = df.season.replace({'Spring':'S','Fall':'F'})
    df['study'] = 'CSIS-GAP'
    print(df.shape)
    print(df.columns)

    frames.append(df)

# Concatenate dfs into one
df2 = pd.concat(frames, axis=0, ignore_index=True)
    

/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2019S.xlsx


pq_2019S_Gap
(1801, 11)
Index(['year', 'season', 'date', 'block', 'plot', 'transect', 'gap_start',
       'gap_end', 'gap_dist', 'comment', 'study'],
      dtype='object')
/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2019F.xlsx
pq_2019F_Gap
(1965, 11)
Index(['year', 'season', 'date', 'block', 'plot', 'transect', 'gap_start',
       'gap_end', 'gap_dist', 'comment', 'study'],
      dtype='object')
/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2020F.xlsx
pq_2020F_Gap
(1753, 11)
Index(['year', 'season', 'date', 'block', 'plot', 'transect', 'gap_start',
       'gap_end', 'gap_dist', 'comment', 'study'],
      dtype='object')
/media/greg/jrn-DataArchive/Data_ENT/LTER/_Entry/CSIS/csis-VegTran/_compiled/Data_CSIS_LPI-Gap_2020S.xlsx
pq_2020S_Gap
(1841, 11)
Index(['year', 'season', 'date', 'block', 'plot', 'transect', 'gap_start',
       'gap_end', 'gap_dist', 'comment', 'study'],
      dtype='obj

In [29]:
df2.head()

Unnamed: 0,year,season,date,block,plot,transect,gap_start,gap_end,gap_dist,comment,study
0,2019,S,2019-04-12,1,A,1,0.0,1227.0,1227,,CSIS-GAP
1,2019,S,2019-04-12,1,A,1,1290.0,1392.0,102,,CSIS-GAP
2,2019,S,2019-04-12,1,A,1,1420.0,1835.0,415,,CSIS-GAP
3,2019,S,2019-04-12,1,A,1,2115.0,2150.0,35,,CSIS-GAP
4,2019,S,2019-04-12,1,A,1,2172.0,2500.0,328,end transect,CSIS-GAP


## Merge the 2 dataframes

In [30]:
df = pd.concat([df1, df2], axis=0, ignore_index=True)
df.columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27981 entries, 0 to 27980
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   study          27981 non-null  object        
 1   year           27981 non-null  int64         
 2   season         27981 non-null  object        
 3   date           27981 non-null  datetime64[ns]
 4   block          27981 non-null  int64         
 5   plot           27981 non-null  object        
 6   transect       27981 non-null  int64         
 7   begin          16148 non-null  object        
 8   gap_start      27968 non-null  object        
 9   gap_end        27959 non-null  object        
 10  gap_dist       27957 non-null  object        
 11  qflag          13 non-null     object        
 12  comment        131 non-null    object        
 13  rIn            16147 non-null  object        
 14  readerName     16147 non-null  object        
 15  dIn            1614

In [31]:
keep = ['study', 'year', 'season', 'date', 'block', 'plot', 'transect', 'begin',
       'gap_start', 'gap_end', 'gap_dist', 'qflag', 'comment']
df = df.loc[:,keep]

# Couple minor corrections
df.loc[df['begin']=='0 BO', 'begin'] = 0
df.loc[df['begin']=='2400', 'begin'] = 2500
df.loc[df['begin']=='2500', 'begin'] = 2500
df.loc[df['begin']==2400, 'begin'] = 2500

In [33]:
df.to_csv(os.path.join(dest_path1,'jrn413002_gap_data.csv'), index=False, na_rep='NA')


In [34]:
df['qflag'].unique()

array([nan, 'E', 'M'], dtype=object)

In [None]:
df.groupby(['block', 'plot', 'transect']).size().reset_index(name='Freq')

Unnamed: 0,block,plot,transect,Freq
0,1,A,1,134
1,1,A,2,149
2,1,A,3,175
3,1,B,1,110
4,1,B,2,99
...,...,...,...,...
175,15,C,2,180
176,15,C,3,183
177,15,D,1,177
178,15,D,2,186


In [None]:
df.isna().sum()


study            0
year             0
season           0
date             0
block            0
plot             0
transect         0
begin        11833
gap_start       13
gap_end         22
gap_dist        24
qflag        27968
comment      27850
dtype: int64