In [None]:
import datetime
from itertools import zip_longest
import os
import re
import sys

import pandas as pd

import muscope

In [None]:
# Report which muscope installation is loaded, then build the path of the
# Dyhrman spreadsheet inside that package's 'downloads' directory.
muscope_loader_dp = os.path.split(sys.modules['muscope'].__file__)[0]
print(sys.modules['muscope'].__file__)

downloads_dp = os.path.join(muscope_loader_dp, 'downloads')
dyhrman__xls_fp = os.path.join(
    downloads_dp,
    'Dyhrman_MS_incubation_assoc_data_v3.xls'
)
print(dyhrman__xls_fp)

# The cell's displayed value: True when the spreadsheet path exists.
os.path.exists(dyhrman__xls_fp)


In [None]:
def grouper(iterable, n, fillvalue=None):
    """Yield successive n-item tuples drawn from iterable.

    The final tuple is padded with fillvalue when the input runs out
    before the tuple is full, e.g.
    grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    # Every slot of the tuple pulls from the same iterator object, so each
    # output tuple consumes n consecutive items of the input.
    shared_iterator = iter(iterable)
    slots = (shared_iterator,) * n
    return zip_longest(*slots, fillvalue=fillvalue)


In [None]:
# Load the 'core attributes + data' sheet, skipping spreadsheet rows 0 and 2.
core_attr_plus_data_df = pd.read_excel(
    dyhrman__xls_fp,
    sheet_name='core attributes + data',
    skiprows=(0, 2)
)

# 2 related samples appear in groups of 4 rows
# the first sample is usually "mRNA"
# the second sample is usually "totalRNA"
#
# grouper() pads a short final group with None, which cannot be unpacked in
# the loop below, so reject a row count that is not a multiple of 4 before
# any row has been modified.
if len(core_attr_plus_data_df) % 4 != 0:
    raise ValueError(
        'expected the row count to be a multiple of 4 but found {} rows'.format(
            len(core_attr_plus_data_df)))

# parse time strings such as '12:45:00' and '1245'
# the first 3 collection times are datetime objects, the remaining collection times are just strings like "1205"
# The optional seconds digits sit inside the named group so that
# group('second') returns them (it is None when there is no ':ss' part).
# NOTE(review): a colon-less value with three digits such as '905' is read as
# hour '90', minute '5' and is then rejected by datetime.time -- confirm the
# sheet never contains one.
time_re = re.compile(r'^(?P<hour>\d{1,2}):?(?P<minute>\d{1,2})(:(?P<second>\d{1,2}))?$')

for (r1, row1), (r2, row2), (r3, row3), (r4, row4) in grouper(core_attr_plus_data_df.iterrows(), n=4):
    # change the cruise name to MESO-SCOPE
    # Only r1 is set here; r3 receives the value through the copy-if-empty
    # step at the end of this loop body when its own cruise_name is blank.
    core_attr_plus_data_df.loc[r1, 'cruise_name'] = 'MESO-SCOPE'

    # Normalize the data_type labels; any other label means the 4-row
    # grouping assumption does not hold for this group, so stop with a
    # message that identifies the row.
    if row1.data_type != 'mRNA reads':
        raise ValueError(
            'row {}: expected data_type "mRNA reads" but found "{}"'.format(
                r1, row1.data_type))
    core_attr_plus_data_df.loc[r1, 'data_type'] = 'mRNA Reads'

    if row3.data_type != 'total RNA reads':
        raise ValueError(
            'row {}: expected data_type "total RNA reads" but found "{}"'.format(
                r3, row3.data_type))
    core_attr_plus_data_df.loc[r3, 'data_type'] = 'Total RNA Reads'

    # convert the strings in collection_time to datetime.time objects
    collection_time_match = time_re.search(str(row1.collection_time))
    if collection_time_match is None:
        raise ValueError(
            'row {}: cannot parse collection_time "{}"'.format(
                r1, row1.collection_time))
    core_attr_plus_data_df.loc[r1, 'collection_time'] = datetime.time(
        hour=int(collection_time_match.group('hour')),
        minute=int(collection_time_match.group('minute')),
        second=int(collection_time_match.group('second') or 0))

    # copy attributes from previous sample ONLY IF THE ATTRIBUTE IS EMPTY
    column_names = list(core_attr_plus_data_df.columns)
    for attr_name in column_names:
        print('row {} attr "{}" is "{}"'.format(r3, attr_name, core_attr_plus_data_df.loc[r3, attr_name]))
        # pd.isna recognizes NaN and NaT directly instead of relying on
        # their string representations.
        if pd.isna(core_attr_plus_data_df.loc[r3, attr_name]):
            print('  copy "{}" from previous sample'.format(core_attr_plus_data_df.loc[r1, attr_name]))
            core_attr_plus_data_df.loc[r3, attr_name] = core_attr_plus_data_df.loc[r1, attr_name]

core_attr_plus_data_df