In [1]:
import os
import pandas as pd
# Notebook-wide display setting: show up to 500 characters per column so long S3 paths are not truncated.
pd.set_option('display.max_colwidth', 500)

In [7]:
prefix = 's3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/'
f = 'cellPrefixes.txt'
! aws s3 ls $cellPrefixes > $f
! cat $f

                           PRE A1_B001800/
                           PRE A21_1001000366/
                           PRE A21_B003049/
                           PRE A2_B001797/
                           PRE A2_B001798/
                           PRE A2_B001799/
                           PRE A2_B003785/
                           PRE A3_B001798/
                           PRE A4_1001000362/
                           PRE A4_B001798/
                           PRE A4_B003785/
                           PRE A5_1001000365/
                           PRE A5_B001798/
                           PRE A5_B001799/
                           PRE A5_B001800/
                           PRE A6_1001000377/
                           PRE B21_B003049/
                           PRE B2_B001797/
                           PRE B2_B001798/
                           PRE B2_B001799/
                           PRE B3_B001797/
                           PRE B3_B001798/
                 

In [8]:
# Parse the saved `aws s3 ls` listing into a dataframe: the first whitespace-separated
# field goes to `is_prefix`, the second to `cell_name`.
# `sep=r'\s+'` is the documented replacement for the deprecated `delim_whitespace=True`.
cells_df = pd.read_table(f, sep=r'\s+', header=None, names=['is_prefix', 'cell_name'])
cells_df

Unnamed: 0,is_prefix,cell_name
0,PRE,A1_B001800/
1,PRE,A21_1001000366/
2,PRE,A21_B003049/
3,PRE,A2_B001797/
4,PRE,A2_B001798/
5,PRE,A2_B001799/
6,PRE,A2_B003785/
7,PRE,A3_B001798/
8,PRE,A4_1001000362/
9,PRE,A4_B001798/


In [9]:
# Add a `full_path` column: the shared `prefix` followed by each cell name.
# Reusing `prefix` (instead of a second hard-coded copy of the same string)
# keeps this column in sync with the location that was actually listed.
cells_df['full_path'] = prefix + cells_df['cell_name']
cells_df

Unnamed: 0,is_prefix,cell_name,full_path
0,PRE,A1_B001800/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A1_B001800/
1,PRE,A21_1001000366/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A21_1001000366/
2,PRE,A21_B003049/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A21_B003049/
3,PRE,A2_B001797/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B001797/
4,PRE,A2_B001798/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B001798/
5,PRE,A2_B001799/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B001799/
6,PRE,A2_B003785/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B003785/
7,PRE,A3_B001798/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A3_B001798/
8,PRE,A4_1001000362/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A4_1001000362/
9,PRE,A4_B001798/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A4_B001798/


In [10]:
# get full s3 paths for fastq file (R1), then add them to a new col in cells_df

def get_fastqs_R1(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    fq_line = [x for x in lines if x.endswith('R1_001.fastq.gz')][0] # get the fastq files, specifically
    fq_basename = fq_line.split()[-1]
    return f'{s3_location}{fq_basename}'


# Resolve the R1 fastq path for every cell name, then store the result as the `fastq_1` column.
r1_paths = cells_df['cell_name'].map(get_fastqs_R1)
cells_df['fastq_1'] = r1_paths
cells_df.head()

Unnamed: 0,is_prefix,cell_name,full_path,fastq_1
0,PRE,A1_B001800/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A1_B001800/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A1_B001800/A1_B001800_A1_B001800_S277_R1_001.fastq.gz
1,PRE,A21_1001000366/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A21_1001000366/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A21_1001000366/A21_1001000366_A21_1001000366_S57_R1_001.fastq.gz
2,PRE,A21_B003049/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A21_B003049/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A21_B003049/A21_B003049_A21_B003049_S81_R1_001.fastq.gz
3,PRE,A2_B001797/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B001797/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B001797/A2_B001797_A2_B001797_S26_R1_001.fastq.gz
4,PRE,A2_B001798/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B001798/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180226/A2_B001798/A2_B001798_A2_B001798_S110_R1_001.fastq.gz


In [None]:
# get full s3 paths for fastq file (R2), then add them to a new col in cells_df

def get_fastqs_R2(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    fq_line = [x for x in lines if x.endswith('R2_001.fastq.gz')][0] # get the fastq files, specifically
    fq_basename = fq_line.split()[-1]
    return f'{s3_location}{fq_basename}'


# Resolve the R2 fastq path for every cell name, then store the result as the `fastq_2` column.
r2_paths = cells_df['cell_name'].map(get_fastqs_R2)
cells_df['fastq_2'] = r2_paths
cells_df.head()