In [None]:
import datetime
from itertools import zip_longest
import os
import re
import sys

import pandas as pd

import muscope

In [None]:
# Locate the spreadsheet relative to the installed muscope package:
# <directory containing muscope/__init__.py>/downloads/<workbook>.xls
muscope_module_fp = sys.modules['muscope'].__file__
print(muscope_module_fp)

muscope_dp = os.path.dirname(muscope_module_fp)
downloads_dp = os.path.join(muscope_dp, 'downloads')
dyhrman_hl4_xls_fp = os.path.join(
    downloads_dp,
    'Dyhrman_HL4_incubation_seq_assoc_data_v3.xls'
)

# show the resolved path; the trailing expression displays whether the file exists
print(dyhrman_hl4_xls_fp)
os.path.exists(dyhrman_hl4_xls_fp)


In [None]:
def grouper(iterable, n, fillvalue=None):
    """Return an iterator over consecutive n-item tuples taken from iterable.

    The final tuple is padded with fillvalue when the input runs out early.

    Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
    """
    shared_iterator = iter(iterable)
    # Every slot refers to the SAME iterator, so zip_longest draws n
    # consecutive items from it to build each output tuple.
    slots = (shared_iterator,) * n
    return zip_longest(*slots, fillvalue=fillvalue)


In [None]:
# Load the 'core attributes + data' sheet; spreadsheet rows 0 and 2 are skipped.
core_attr_plus_data_df = pd.read_excel(
    dyhrman_hl4_xls_fp,
    sheet_name='core attributes + data',
    skiprows=(0,2)
)

# There are 4 seq_names (4 consecutive rows) for each sample name:
#   the first 2 seq_names are paired-end reads, e.g.
#     SM125_S42_L008_R1_001.fastq and SM125_S42_L008_R2_001.fastq
#   the second 2 seq_names are paired-end reads, e.g.
#     SM143_S33_L002_R1_001.fastq and SM143_S33_L002_R2_001.fastq
# The first row of the first pair must have data_type 'mRNA reads' and the
# first row of the second pair must have data_type 'total RNA reads'.
rows_per_sample = 4
if len(core_attr_plus_data_df) % rows_per_sample != 0:
    # Fail early with a clear message instead of an unpacking error on the
    # padding value that grouper() appends to an incomplete final group.
    raise ValueError(
        'expected a multiple of {} rows but found {}'.format(
            rows_per_sample, len(core_attr_plus_data_df)))

# Parse time strings such as '12:45:00' and '1245'.
# The first 3 collection times are datetime objects, the remaining collection
# times are just strings like "1205".
# The digits of the optional seconds field are captured INSIDE the named group.
time_re = re.compile(r'^(?P<hour>\d{2}):?(?P<minute>\d{2})(:(?P<second>\d{2}))?$')


def _parse_collection_time(raw_value):
    """Convert one collection_time cell to a datetime.time.

    Accepts datetime.time / datetime.datetime objects, strings such as
    '12:45:00' or '1245', and numeric cells such as 1205 (a numeric cell
    loses its leading zero, so 905 is read as 09:05).

    Raises ValueError if the cell is empty or cannot be parsed.
    """
    # datetime.datetime must be tested first: it also carries a time of day
    if isinstance(raw_value, datetime.datetime):
        return raw_value.time()
    if isinstance(raw_value, datetime.time):
        return raw_value
    if pd.isna(raw_value):
        raise ValueError('collection_time is missing')

    if isinstance(raw_value, str):
        time_text = raw_value.strip()
    else:
        # numeric cell, e.g. 1205 or 1205.0
        time_text = str(int(raw_value))
    if time_text.isdigit():
        # restore a leading zero dropped by numeric storage ('905' -> '0905')
        time_text = time_text.zfill(4)

    time_match = time_re.search(time_text)
    if time_match is None:
        raise ValueError('failed to parse collection_time "{}"'.format(raw_value))
    return datetime.time(
        hour=int(time_match.group('hour')),
        minute=int(time_match.group('minute')),
        second=int(time_match.group('second') or 0))


# datetime.time objects are written into this column below; make it an object
# column up front so the writes do not depend on pandas silently upcasting a
# numeric column (newer pandas versions warn about, and are slated to reject,
# such writes).
core_attr_plus_data_df['collection_time'] = core_attr_plus_data_df['collection_time'].astype(object)

# the set of columns does not change while iterating
column_names = list(core_attr_plus_data_df.columns)

# iterrows() yields (label, row-copy) pairs, so row1..row4 hold the values as
# they were read from the sheet while all writes go to the DataFrame via .loc
for (r1, row1), (r2, row2), (r3, row3), (r4, row4) in grouper(core_attr_plus_data_df.iterrows(), n=4):
    print(row1.seq_name)

    # normalize the data_type labels and insist on the expected row order
    if row1.data_type == 'mRNA reads':
        core_attr_plus_data_df.loc[r1, 'data_type'] = 'mRNA Reads'
    else:
        raise ValueError(
            'row {}: expected data_type "mRNA reads" but found "{}"'.format(
                r1, row1.data_type))
    if row3.data_type == 'total RNA reads':
        core_attr_plus_data_df.loc[r3, 'data_type'] = 'Total RNA Reads'
    else:
        raise ValueError(
            'row {}: expected data_type "total RNA reads" but found "{}"'.format(
                r3, row3.data_type))

    # append .gz to file names
    for row_label, row in ((r1, row1), (r2, row2), (r3, row3), (r4, row4)):
        if str(row.seq_name).endswith('.fastq'):
            core_attr_plus_data_df.loc[row_label, 'seq_name'] = row.seq_name + '.gz'

    # convert the first row's collection_time to a datetime.time object
    core_attr_plus_data_df.loc[r1, 'collection_time'] = _parse_collection_time(
        row1.collection_time)

    # copy attributes from previous sample ONLY IF THE ATTRIBUTE IS EMPTY
    # (row 3 inherits from row 1, including the collection_time parsed above)
    for attr_name in column_names:
        if pd.isna(core_attr_plus_data_df.loc[r3, attr_name]):
            core_attr_plus_data_df.loc[r3, attr_name] = core_attr_plus_data_df.loc[r1, attr_name]

core_attr_plus_data_df