In [None]:
import datetime
from itertools import zip_longest
import os
import re
import sys

import pandas as pd

import muscope

In [None]:
# Build the path to the Dyhrman HL2A spreadsheet, which is expected in a
# 'downloads' directory next to the imported muscope module file.
muscope_module_fp = sys.modules['muscope'].__file__
print(muscope_module_fp)

muscope_dp = os.path.dirname(muscope_module_fp)
downloads_dp = os.path.join(muscope_dp, 'downloads')
dyhrman_hl2a_xls_fp = os.path.join(
    downloads_dp,
    'Dyhrman_HL2A_incubation_seq_assoc_data_v3.xlsx'
)

# Show the resolved path; the final expression displays whether the file exists.
print(dyhrman_hl2a_xls_fp)
os.path.exists(dyhrman_hl2a_xls_fp)


In [None]:
def grouper(iterable, n, fillvalue=None):
    """Collect items from ``iterable`` into consecutive tuples of length ``n``.

    The final tuple is padded with ``fillvalue`` when the number of items is
    not a multiple of ``n``.

    Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    :param iterable: any iterable to be chunked
    :param n: number of items per chunk
    :param fillvalue: value used to pad the last chunk (default None)
    :return: an iterator of n-tuples
    """
    # every position of the tuple pulls from the SAME iterator, so each
    # tuple consumes the next n items in order
    shared_iterator = iter(iterable)
    positions = [shared_iterator for _ in range(n)]
    return zip_longest(*positions, fillvalue=fillvalue)


In [None]:
# Load the 'core attributes + data' sheet, skipping spreadsheet rows 0 and 2.
core_attr_plus_data_df = pd.read_excel(
    dyhrman_hl2a_xls_fp,
    sheet_name='core attributes + data',
    skiprows=(0,2)
)


# 2 related samples appear in groups of 4 rows
# the first sample is usually "mRNA"
# the second sample is usually "totalRNA"

# the "totalRNA" sample row is missing information that is duplicated
# by its corresponding "mRNA" sample so this loop will copy that
# information to the "totalRNA" row

# but there are a few cases of isolated "totalRNA" samples
# that do have all the information already

# the set of columns eligible for filling never changes, so build it once;
# the "gene_name" column is deliberately left alone
column_names = [
    column_name
    for column_name in core_attr_plus_data_df.columns
    if column_name != 'gene_name'
]

sample_name_suffix = '.fastq.tar'

# index label of the first row of the previous pair of rows;
# None until one pair has been processed, so the first pair never tries
# to copy from a row that does not exist
previous_r1 = None

for first_entry, second_entry in grouper(core_attr_plus_data_df.iterrows(), n=2):
    r1, row1 = first_entry

    # grouper pads an odd number of rows with None, so the last pair
    # may consist of a single row
    present_entries = [
        entry for entry in (first_entry, second_entry) if entry is not None
    ]

    # remove .fastq.tar from sample names (first row of the pair only)
    if str(row1.sample_name).endswith(sample_name_suffix):
        core_attr_plus_data_df.loc[r1, 'sample_name'] = row1.sample_name[:-len(sample_name_suffix)]

    # append .gz to file names
    for row_label, row_data in present_entries:
        if str(row_data.seq_name).endswith('.fastq'):
            core_attr_plus_data_df.loc[row_label, 'seq_name'] = row_data.seq_name + '.gz'

    # copy attributes from previous sample ONLY IF THE ATTRIBUTE IS EMPTY
    # also do not mess with the "gene_name" column
    if previous_r1 is not None:
        for attr_name in column_names:
            # read the live frame (not the iterrows snapshot) so values
            # filled for the previous sample carry forward
            if pd.isna(core_attr_plus_data_df.loc[r1, attr_name]):
                core_attr_plus_data_df.loc[r1, attr_name] = core_attr_plus_data_df.loc[previous_r1, attr_name]

    previous_r1 = r1

core_attr_plus_data_df