In [None]:
### script: fixBrokenFilePaths.ipynb
### author: Lincoln
### date: 1.2.19
###
### Goal: fix the broken S3 file paths in the fastq bin so that the pipelines can run properly.

In [1]:
import os
import pandas as pd
pd.options.display.max_colwidth = 500 # module config

In [2]:
prefix = 's3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/'
f = 'cellPrefixes.txt'
! aws s3 ls $prefix > $f
! cat $f

                           PRE A10_1001000296/
                           PRE A10_1001000297/
                           PRE A11_1001000296/
                           PRE A11_1001000297/
                           PRE A13_1001000000/
                           PRE A14_1001000000/
                           PRE A15_1001000000/
                           PRE A16_1001000000/
                           PRE A17_1001000000/
                           PRE A18_1001000000/
                           PRE A19_1001000000/
                           PRE A1_1001000296/
                           PRE A1_1001000299/
                           PRE A20_1001000000/
                           PRE A21_1001000000/
                           PRE A22_1001000000/
                           PRE A23_1001000000/
                           PRE A24_1001000000/
                           PRE A25_1001000000/
                           PRE A26_1001000000/
                           PRE A27_1001000

In [3]:
# Read the `aws s3 ls` listing into a dataframe: the first field is the literal
# "PRE" marker, the second is the cell's sub-prefix (trailing '/' included).
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
cells_df = pd.read_table(f, sep=r'\s+', header=None, names=['is_prefix', 'cell_name'])
# Bounded preview instead of dumping the whole frame.
cells_df.head(10)

Unnamed: 0,is_prefix,cell_name
0,PRE,A10_1001000296/
1,PRE,A10_1001000297/
2,PRE,A11_1001000296/
3,PRE,A11_1001000297/
4,PRE,A13_1001000000/
5,PRE,A14_1001000000/
6,PRE,A15_1001000000/
7,PRE,A16_1001000000/
8,PRE,A17_1001000000/
9,PRE,A18_1001000000/


In [4]:
# add a full_path col: the S3 location of each cell's directory.
# Built from `prefix` (the location that was listed above) rather than a second
# hard-coded copy of the same URL, so the two cannot drift apart.
cells_df['full_path'] = prefix + cells_df['cell_name']
cells_df.head(10)

Unnamed: 0,is_prefix,cell_name,full_path
0,PRE,A10_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/
1,PRE,A10_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/
2,PRE,A11_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/
3,PRE,A11_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/
4,PRE,A13_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/
5,PRE,A14_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/
6,PRE,A15_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/
7,PRE,A16_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/
8,PRE,A17_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/
9,PRE,A18_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/


In [5]:
# get full s3 paths for fastq file (R1), then add them to a new col in cells_df

def get_fastqs_R1(cell):
    s3_location = f'{prefix}{cell}'
    lines = ! aws s3 ls $s3_location
    try:
        fq_line = [x for x in lines if x.endswith('R1_001.fastq.gz')][0] # get the fastq files, specifically
        fq_basename = fq_line.split()[-1]
    except IndexError:
        print(s3_location)
        fq_basename = 'dummy'
        #continue # in this case, maybe we dont want to return anything at all...
    return f'{s3_location}{fq_basename}'


# Look up the current R1 object for every cell (one `aws s3 ls` call per row)
# and keep the resulting S3 paths in a new fastq1_curr column.
cells_df = cells_df.assign(fastq1_curr=cells_df['cell_name'].map(get_fastqs_R1))
cells_df.head()

Unnamed: 0,is_prefix,cell_name,full_path,fastq1_curr
0,PRE,A10_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_S10_R1_001.fastq.gz
1,PRE,A10_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_S101_R1_001.fastq.gz
2,PRE,A11_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_S11_R1_001.fastq.gz
3,PRE,A11_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_S102_R1_001.fastq.gz
4,PRE,A13_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_S318_R1_001.fastq.gz


In [6]:
# get full s3 paths for fastq file (R2), then add them to a new col in cells_df

def get_fastqs_R2(cell):
    s3_location = f'{prefix}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    try:
        fq_line = [x for x in lines if x.endswith('R2_001.fastq.gz')][0] # get the fastq files, specifically
        fq_basename = fq_line.split()[-1]
    except IndexError:
        print(s3_location)
        fq_basename = 'dummy'
        #continue # in this case, maybe we dont want to return anything at all...
    return f'{s3_location}{fq_basename}'


# Same lookup as for R1, this time for the R2 read of every cell; the resulting
# S3 paths go into a new fastq2_curr column.
cells_df = cells_df.assign(fastq2_curr=cells_df['cell_name'].apply(get_fastqs_R2))
cells_df.head()

Unnamed: 0,is_prefix,cell_name,full_path,fastq1_curr,fastq2_curr
0,PRE,A10_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_S10_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_S10_R2_001.fastq.gz
1,PRE,A10_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_S101_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_S101_R2_001.fastq.gz
2,PRE,A11_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_S11_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_S11_R2_001.fastq.gz
3,PRE,A11_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_S102_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_S102_R2_001.fastq.gz
4,PRE,A13_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_S318_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_S318_R2_001.fastq.gz


In [7]:
# Remove the '/' that the S3 listing puts on every prefix, leaving the bare cell name.
cells_df = cells_df.assign(cell_name=cells_df['cell_name'].str.strip('/'))
cells_df

Unnamed: 0,is_prefix,cell_name,full_path,fastq1_curr,fastq2_curr
0,PRE,A10_1001000296,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_S10_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_S10_R2_001.fastq.gz
1,PRE,A10_1001000297,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_S101_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_S101_R2_001.fastq.gz
2,PRE,A11_1001000296,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_S11_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_S11_R2_001.fastq.gz
3,PRE,A11_1001000297,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_S102_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_S102_R2_001.fastq.gz
4,PRE,A13_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_S318_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_S318_R2_001.fastq.gz
5,PRE,A14_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/A14_1001000000_S319_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/A14_1001000000_S319_R2_001.fastq.gz
6,PRE,A15_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/A15_1001000000_S320_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/A15_1001000000_S320_R2_001.fastq.gz
7,PRE,A16_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/A16_1001000000_S321_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/A16_1001000000_S321_R2_001.fastq.gz
8,PRE,A17_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/A17_1001000000_S322_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/A17_1001000000_S322_R2_001.fastq.gz
9,PRE,A18_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/A18_1001000000_S323_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/A18_1001000000_S323_R2_001.fastq.gz


In [8]:
# Desired object name for each read: <full_path><cell_name>_R<n>_001.fastq.gz
cells_df = cells_df.assign(
    fastq1_dest=lambda df: df['full_path'] + df['cell_name'] + '_R1_001.fastq.gz',
    fastq2_dest=lambda df: df['full_path'] + df['cell_name'] + '_R2_001.fastq.gz',
)
cells_df

Unnamed: 0,is_prefix,cell_name,full_path,fastq1_curr,fastq2_curr,fastq1_dest,fastq2_dest
0,PRE,A10_1001000296,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_S10_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_S10_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000296/A10_1001000296_R2_001.fastq.gz
1,PRE,A10_1001000297,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_S101_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_S101_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A10_1001000297/A10_1001000297_R2_001.fastq.gz
2,PRE,A11_1001000296,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_S11_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_S11_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000296/A11_1001000296_R2_001.fastq.gz
3,PRE,A11_1001000297,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_S102_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_S102_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A11_1001000297/A11_1001000297_R2_001.fastq.gz
4,PRE,A13_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_S318_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_S318_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A13_1001000000/A13_1001000000_R2_001.fastq.gz
5,PRE,A14_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/A14_1001000000_S319_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/A14_1001000000_S319_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/A14_1001000000_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A14_1001000000/A14_1001000000_R2_001.fastq.gz
6,PRE,A15_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/A15_1001000000_S320_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/A15_1001000000_S320_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/A15_1001000000_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A15_1001000000/A15_1001000000_R2_001.fastq.gz
7,PRE,A16_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/A16_1001000000_S321_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/A16_1001000000_S321_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/A16_1001000000_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A16_1001000000/A16_1001000000_R2_001.fastq.gz
8,PRE,A17_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/A17_1001000000_S322_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/A17_1001000000_S322_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/A17_1001000000_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A17_1001000000/A17_1001000000_R2_001.fastq.gz
9,PRE,A18_1001000000,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/A18_1001000000_S323_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/A18_1001000000_S323_R2_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/A18_1001000000_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/170129/A18_1001000000/A18_1001000000_R2_001.fastq.gz


In [10]:
# One `aws s3 mv <current> <destination>` command string per cell and per read.
cells_df = cells_df.assign(
    cmd1=lambda df: 'aws s3 mv ' + df['fastq1_curr'] + ' ' + df['fastq1_dest'],
    cmd2=lambda df: 'aws s3 mv ' + df['fastq2_curr'] + ' ' + df['fastq2_dest'],
)
cells_df

Unnamed: 0,is_prefix,cell_name,full_path,fastq1_curr,fastq2_curr,fastq1_dest,fastq2_dest,cmd1,cmd2
0,PRE,A10_1001000407,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000407/A10_1001000407_R2_001.fastq.gz
1,PRE,A10_1001000408,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000408/A10_1001000408_R2_001.fastq.gz
2,PRE,A10_1001000409,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000409/A10_1001000409_R2_001.fastq.gz
3,PRE,A10_1001000410,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000410/A10_1001000410_R2_001.fastq.gz
4,PRE,A10_1001000412,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_1001000412/A10_1001000412_R2_001.fastq.gz
5,PRE,A10_B001608,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B001608/A10_B001608_R2_001.fastq.gz
6,PRE,A10_B003048,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A10_B003048/A10_B003048_R2_001.fastq.gz
7,PRE,A11_1001000407,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000407/A11_1001000407_R2_001.fastq.gz
8,PRE,A11_1001000408,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000408/A11_1001000408_R2_001.fastq.gz
9,PRE,A11_1001000409,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R1_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R2_001_merged.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R1_001.fastq.gz,s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R2_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R1_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R1_001.fastq.gz,aws s3 mv s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R2_001_merged.fastq.gz s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/171120_concat/A11_1001000409/A11_1001000409_R2_001.fastq.gz


In [11]:
# Write one `aws s3 mv` command per line, R1 and R2 in separate files.
# header=False: without it, current pandas writes the column name ('cmd1' / 'cmd2')
# as the first line of the file, and that line is not a command.
cells_df['cmd1'].to_csv('moveCmds_170129_1.csv', index=False, header=False)
cells_df['cmd2'].to_csv('moveCmds_170129_2.csv', index=False, header=False)

In [2]:
# get full s3 paths for fastq file (R1), then add them to a new col in cells_df
def get_fastqs_R1(cell):
    s3_location = f'{run}{cell}' #f? 
    lines = ! aws s3 ls $s3_location
    
    fq_line = [x for x in lines if x.endswith('R1_001.fastq.gz')][0] # get the fastq files, specifically
    fq_basename = fq_line.split()[-1]
        
    return f'{s3_location}{fq_basename}'

# get full s3 paths for fastq file (R2), then add them to a new col in cells_df
def get_fastqs_R2(cell):
    s3_location = f'{run}{cell}' #f? 
    lines = ! aws s3 ls $s3_location

    fq_line = [x for x in lines if x.endswith('R2_001.fastq.gz')][0] # get the fastq files, specifically
    fq_basename = fq_line.split()[-1]
        
    return f'{s3_location}{fq_basename}'

In [3]:
# main()
#   Let's try to do this for all of our problematic runs.

prefix = 's3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/'
runsList = ['180307', '180319', '180320', '180405', '180423', '180516', '180519', '180601', '180711', '180831', '180911', '181120']
big_df = pd.DataFrame()

for r in runsList:
    run = prefix + r + '/'
    print(run)
    
    f = 'currPrefixes.txt'
    ! aws s3 ls $run > $f

    # read run names into a dataframe
    cells_df = pd.read_table(f, delim_whitespace=True, header=None, names=['is_prefix', 'cell_name'])
    
    # add a full_path col
    cells_df['full_path'] = run + cells_df['cell_name']
    
    # applying function, and assigning output to new col in cells_df
    cells_df['fastq1_curr'] = cells_df['cell_name'].map(get_fastqs_R1) 

    cells_df['fastq2_curr'] = cells_df['cell_name'].map(get_fastqs_R2) # applying function, and assigning output to new col in cells_df

    # remove '/'
    cells_df['cell_name'] = cells_df['cell_name'].str.strip('/')
    
    # set desired fq paths
    cells_df['fastq1_dest'] = cells_df['full_path'] + cells_df['cell_name'] + '_R1_001.fastq.gz'
    cells_df['fastq2_dest'] = cells_df['full_path'] + cells_df['cell_name'] + '_R2_001.fastq.gz'
    
    # add a cmd col
    cells_df['cmd1'] = 'aws s3 mv ' + cells_df['fastq1_curr'] + ' ' + cells_df['fastq1_dest']
    cells_df['cmd2'] = 'aws s3 mv ' + cells_df['fastq2_curr'] + ' ' + cells_df['fastq2_dest']

    big_df = big_df.append(cells_df)
    
    big_df['cmd1'].to_csv('moveCmds1.csv', index = False) # lets do this every iteration, bc im a bit nervous
    big_df['cmd2'].to_csv('moveCmds2.csv', index = False)

s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180307/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180319/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180320/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180405/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180423/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180516/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180519/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180601/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180711/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180831/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/180911/
s3://darmanis-group/singlecell_lungadeno/non_immune/nonImmune_fastqs_9.27/181120/


IndexError: list index out of range

In [33]:
# Final write of the accumulated commands, one `aws s3 mv` per line.
# dropna() skips rows that have no command; header=False keeps the column name
# ('cmd1' / 'cmd2') from being written as the first line of the file.
big_df['cmd1'].dropna().to_csv('moveCmds1.csv', index=False, header=False)
big_df['cmd2'].dropna().to_csv('moveCmds2.csv', index=False, header=False)