In [11]:
import pandas as pd

In [12]:
# Input directory: the raw bottle file is read from here.
# Output directory: the cleaned CSV is written here.
source_dir, save_dir = '../data_ctd_raw/', '../processed_data/'

In [13]:
# Full path of the bottle file that the rest of the notebook processes.
data_file = f'{source_dir}SKQ202310S_hy1.csv'

In [14]:
# The header is spread over two rows: row 1 holds the variable names and
# row 2 holds the matching units (row 0 is skipped). Read each as a one-row frame.
varnames = pd.read_csv(data_file, header=None, skiprows=1, nrows=1)
units = pd.read_csv(data_file, header=None, skiprows=2, nrows=1)

# Build one column label per variable: "<name>_<unit>" when a unit is present,
# otherwise just "<name>". The loop variables use their own names so that
# `varnames` and `units` are not overwritten while they are being iterated.
column_names = [
    str(name) + '_' + str(unit) if pd.notna(unit) else str(name)
    for name, unit in zip(varnames.values[0], units.values[0])
]

# Data rows start at row 3; label them with the combined names.
bottle_file = pd.read_csv(data_file, header=None, skiprows=3, names=column_names)

# The file ends with one useless row whose EXPOCODE is 'END_DATA' - cut it.
end_rows = bottle_file.index[bottle_file['EXPOCODE'] == 'END_DATA']
bottle_file = bottle_file.drop(index=end_rows)

# Note: the table is not saved here; it is written out at the end of the
# notebook, after the further modifications below.

In [15]:
## SAMPNO == -999.0 marks depths where a bottle was tripped but no sample was taken.
## Locate those rows first, then remove them from bottle_file in place.
unsampled_rows = bottle_file.index[bottle_file['SAMPNO'] == -999.0]
bottle_file.drop(index=unsampled_rows, inplace=True)

In [16]:
# STNNBR and CTDPRS_DBAR_BOTTLE were sorting lexically, so convert them first
# (STNNBR -> int, CTDPRS_DBAR_BOTTLE -> numeric). Then order the rows by
# station and, within each station, by pressure (depth), with a fresh 0..n-1 index.
bottle_file = (
    bottle_file
    .assign(
        STNNBR=lambda frame: frame['STNNBR'].astype(int),
        CTDPRS_DBAR_BOTTLE=lambda frame: pd.to_numeric(frame['CTDPRS_DBAR_BOTTLE']),
    )
    .sort_values(by=['STNNBR', 'CTDPRS_DBAR_BOTTLE'], ignore_index=True)
)

In [17]:
# bottle_file

In [18]:
## Derive the LINE and SECTION labels from SECT_ID

sect_id = bottle_file['SECT_ID']

# LINE: the first run of letters that is immediately followed by a digit
# (rows where the pattern does not match are left missing)
line_labels = sect_id.str.extract(r'([a-zA-Z]+)(?=\d)', expand=False)
bottle_file['LINE'] = line_labels

# SECTION: everything before the first dash, or the whole SECT_ID when it has no dash
section_labels = sect_id.str.split('-', expand=True)[0]
bottle_file['SECTION'] = section_labels

# Sanity check: print each distinct SECTION once, in order of first appearance.
# (The same check works for LINE via bottle_file['LINE'].drop_duplicates().)
for section_name in bottle_file['SECTION'].drop_duplicates():
    print(section_name)

BS1
BS3
BS5
BS7
CN1
CN2
HR1
T002 Trap Recovery
HRE1
CNS1
CNS2
HR2
T003 Trap Recovery
HRE2
T004 Trap Recovery
T005 Trap Recovery
T006 Trap Recovery
T007 Trap Recovery
HR3
HRE3
T010 Trap Recovery
T009 Trap Recovery
T012 Trap Recovery
T011 Trap Recovery
CN3
HR4
HRE4
T013 Trap Recovery
CN4
HR5
T014 Trap Recovery
T017 Trap Recovery
T018 Trap Recovery
HRE5
T016 Trap Recovery
T015 Trap Recovery
CN6
HR6
T019 Trap Recovery
T022 Trap Recovery
HRE6
T020 Trap Recovery
CN7
HR7
T023 Trap Recovery
T026 Recovery
HRE7
T025 Trap Recovery
T024 Trap Recovery
CN8
HR8
T027 Trap Recovery
T029 Trap Recovery
T028 Trap Recovery
LB1
DBO3
ZZ
BSR
DBO2


In [19]:
# GSW is used with float inputs, so cast pressure, position and station
# number to float, one column at a time.
for gsw_column in ('CTDPRS_DBAR_BOTTLE', 'LONGITUDE', 'LATITUDE', 'STNNBR'):
    bottle_file[gsw_column] = bottle_file[gsw_column].astype(float)

In [20]:
# Save the cleaned bottle table into save_dir, leaving the row index out of the file.
output_path = save_dir + 'final_qc_bottle_file.csv'
bottle_file.to_csv(output_path, index=False)