In [None]:
import datetime
from itertools import zip_longest
import os
import re
import sys

import pandas as pd

import muscope

In [None]:
# Show where the imported muscope package lives on disk.
print(sys.modules['muscope'].__file__)

# The spreadsheet is expected at <muscope package dir>/downloads/<workbook>.
muscope_loader_dp = os.path.split(sys.modules['muscope'].__file__)[0]
downloads_dp = os.path.join(muscope_loader_dp, 'downloads')
dyhrman__xls_fp = os.path.join(
    muscope_loader_dp,
    'downloads',
    'Dyhrman_HL2A_Tricho_seq_attrib_v2.xls',
)

# Echo the resolved path; the cell's displayed value reports whether it exists.
print(dyhrman__xls_fp)
os.path.exists(dyhrman__xls_fp)


In [None]:
def grouper(iterable, n, fillvalue=None):
    """Return an iterator of n-item tuples drawn consecutively from iterable.

    The final tuple is padded with fillvalue when the input runs out early,
    e.g. grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # Every slot of an output tuple reads from the same iterator, so
    # zip_longest advances through the input n items at a time.
    shared_iterator = iter(iterable)
    slots = (shared_iterator,) * n
    return zip_longest(*slots, fillvalue=fillvalue)


In [None]:
# Load the 'core attributes + data' sheet, skipping spreadsheet rows 0 and 2.
core_attr_plus_data_df = pd.read_excel(
    dyhrman__xls_fp,
    sheet_name='core attributes + data',
    skiprows=(0,2)
)

# Rows are processed in consecutive pairs below. With an odd row count the
# final pair would be padded with None and fail with an opaque unpacking
# error, so fail early with a clear message instead.
if len(core_attr_plus_data_df) % 2 != 0:
    raise ValueError(
        'expected an even number of rows but found {}'.format(
            len(core_attr_plus_data_df)))

for (r1, row1), (r2, row2) in grouper(core_attr_plus_data_df.iterrows(), n=2):
    for r, row in ((r1, row1), (r2, row2)):
        # set station and cast to 0
        core_attr_plus_data_df.loc[r, 'station'] = 0
        core_attr_plus_data_df.loc[r, 'cast_num'] = 0

        # append .gz to file names
        if str(row.seq_name).endswith('.fastq'):
            core_attr_plus_data_df.loc[r, 'seq_name'] = row.seq_name + '.gz'

        # normalize the data type; anything other than 'reads' is unexpected
        data_type = core_attr_plus_data_df.loc[r, 'data_type']
        if data_type != 'reads':
            raise ValueError(
                'row {}: expected data_type "reads" but found "{}"'.format(
                    r, data_type))
        core_attr_plus_data_df.loc[r, 'data_type'] = 'Reads'

    # copy attributes from previous sample ONLY IF THE ATTRIBUTE IS EMPTY
    # Only the first row of each pair is filled; r1 - 2 is the first row of
    # the preceding pair, which assumes the default 0..N-1 integer index.
    if r1 > 0:
        for attr_name in list(core_attr_plus_data_df.columns):
            # pd.isna recognizes NaN, NaT and None alike
            if pd.isna(core_attr_plus_data_df.loc[r1, attr_name]):
                core_attr_plus_data_df.loc[r1, attr_name] = core_attr_plus_data_df.loc[r1 - 2, attr_name]

# convert to decimal degrees
# Only the first row of each pair is converted. Each value is split on
# whitespace into two numbers: whole degrees, then minutes.
for (r1, _row1), _second in grouper(core_attr_plus_data_df.iterrows(), n=2):
    lat_degrees, lat_minutes = core_attr_plus_data_df.loc[r1, 'latitude'].split()
    core_attr_plus_data_df.loc[r1, 'latitude'] = float(lat_degrees) + (float(lat_minutes) / 60)

    lon_degrees, lon_minutes = core_attr_plus_data_df.loc[r1, 'longitude'].split()
    # Negate the whole magnitude (degrees AND minutes). Negating only the
    # degrees would add the minutes back in the positive direction.
    core_attr_plus_data_df.loc[r1, 'longitude'] = -1.0 * (float(lon_degrees) + (float(lon_minutes) / 60))

# these start out empty but get filled in by the attribute copy loop above
core_attr_plus_data_df.loc[12, 'latitude'] = None
core_attr_plus_data_df.loc[12, 'longitude'] = None
core_attr_plus_data_df.loc[14, 'latitude'] = None
core_attr_plus_data_df.loc[14, 'longitude'] = None

core_attr_plus_data_df