# Salmon Mapping

## Prepare mapping commands


In [1]:
import pathlib
import shlex

import pandas as pd

In [2]:
# Load the per-file metadata of the trimmed FASTQs; the first CSV column becomes the row index
fastq_meta_csv = './metadata/trimmed_fastq_metadata.csv'
fastq_meta = pd.read_csv(fastq_meta_csv, index_col=0)
fastq_meta

Unnamed: 0_level_0,count_type,experiment_id,bio_sample_id,tissue,replicate,dev_time,file_name
File accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENCFF329ACL,reads,ENCSR160IIN,UBERON:0001890,forebrain,1,E11.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF251LNG,reads,ENCSR160IIN,UBERON:0001890,forebrain,2,E11.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF896COV,reads,ENCSR160IIN,UBERON:0001890,forebrain,2,E11.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF959PSX,reads,ENCSR970EWM,UBERON:0001890,forebrain,2,E13.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF235DNM,reads,ENCSR970EWM,UBERON:0001890,forebrain,1,E13.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF270GKY,reads,ENCSR185LWM,UBERON:0001890,forebrain,1,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF460TCF,reads,ENCSR185LWM,UBERON:0001890,forebrain,1,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF126IRS,reads,ENCSR185LWM,UBERON:0001890,forebrain,2,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF748SRJ,reads,ENCSR185LWM,UBERON:0001890,forebrain,2,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF447EXU,reads,ENCSR362AIZ,UBERON:0001890,forebrain,2,P0,/home/hanliu/project/genome_book/DevFB/fastq/t...


## Prepare salmon command for each sample

In [3]:
# Absolute path of the salmon index directory (anchored at the current working
# directory; the '..' component is kept as-is, not resolved)
index_dir = pathlib.Path.cwd() / '../ref/Salmon/salmon_index/'

# Absolute path of the directory that receives the salmon results; create it if missing
output_dir = pathlib.Path.cwd() / 'quant/'
output_dir.mkdir(exist_ok=True)

In [16]:
# Number of threads handed to each salmon command through --threads.
# I use my own server; change this number to 4 if using a laptop.
# Also, because salmon runs in parallel internally, we just run the salmon commands one by one.
threads = 45

In [20]:
# Make one `salmon quant` command per RNA-seq sample based on the metadata.
# A sample is one (tissue, dev_time, replicate) group; all FASTQ files of the group are
# passed together after -r, and the result goes to "<output_dir>/<sample>.quant".
# `commands` maps the sample name to its command string.
commands = {}
for (tissue, time, rep), sub_df in fastq_meta.groupby(['tissue', 'dev_time', 'replicate']):
    sample_name = f'{tissue}_{time}_{rep}'

    # The command string is later executed through a shell, so quote every path:
    # an unquoted space or shell metacharacter would split or alter the command.
    # (shlex.quote leaves plain paths unchanged.)
    fastq_paths_str = ' '.join(shlex.quote(path) for path in sub_df['file_name'])
    output_name = output_dir / f'{sample_name}.quant'

    # assemble the final command
    command = (
        f'salmon quant -i {shlex.quote(str(index_dir))} -l A -r {fastq_paths_str} '
        f'--threads {threads} --validateMappings -o {shlex.quote(str(output_name))}'
    )
    commands[sample_name] = command

In [21]:
# an example command: `command` still holds the last one assembled by the loop
command

'salmon quant -i /home/hanliu/project/genome_book/DevFB/../ref/Salmon/salmon_index -l A -r /home/hanliu/project/genome_book/DevFB/fastq/trimmed/forebrain_P0_2_ENCFF447EXU_trimmed.fq.gz /home/hanliu/project/genome_book/DevFB/fastq/trimmed/forebrain_P0_2_ENCFF458NWF_trimmed.fq.gz --threads 45 --validateMappings -o /home/hanliu/project/genome_book/DevFB/quant/forebrain_P0_2.quant'

## Run salmon

In [23]:
import subprocess

# Run the salmon commands one after another.
for name, command in commands.items():
    # Once a command has finished, a flag file named after the sample is written as a
    # physical record, so you know it is finished for sure. The flag also prevents
    # rerunning the command if the execution stopped somewhere and this cell is rerun.
    flag_path = output_dir / name
    if flag_path.exists():
        print('EXISTS', name)
        continue

    try:
        subprocess.run(command, shell=True, check=True,
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf8')
    except subprocess.CalledProcessError as error:
        # stdout/stderr are captured rather than displayed, so print the captured
        # stderr before re-raising; otherwise only the exit status is reported
        print('FAILED', name)
        print(error.stderr)
        raise

    print('FINISH', name)
    with open(flag_path, 'w') as f:
        f.write('Oh Yeah')
    

EXISTS forebrain_E10.5_1
EXISTS forebrain_E10.5_2
FINISH forebrain_E11.5_1
FINISH forebrain_E11.5_2
FINISH forebrain_E12.5_1
FINISH forebrain_E12.5_2
FINISH forebrain_E13.5_1
FINISH forebrain_E13.5_2
FINISH forebrain_E14.5_1
FINISH forebrain_E14.5_2
FINISH forebrain_E15.5_1
FINISH forebrain_E15.5_2
FINISH forebrain_E16.5_1
FINISH forebrain_E16.5_2
FINISH forebrain_P0_1
FINISH forebrain_P0_2


## Clean up the flag files

In [25]:
# Remove the per-sample flag files written into output_dir.
# Deleting through pathlib instead of a shell `rm` keeps paths that contain
# spaces or shell metacharacters safe and needs no external program.
for name in commands.keys():
    flag_path = output_dir / name
    # a flag may be missing (e.g. this cell was already run); skip it quietly
    if flag_path.is_file():
        flag_path.unlink()

## Make a metadata table for the salmon output

In [4]:
# find all the salmon quantification tables (quant.sf) below output_dir and keep them in a list;
# glob yields paths in arbitrary order, so sort them to make the list reproducible
# (note: despite its name, fastq_list holds quant.sf paths, not FASTQ paths)
fastq_list = sorted(output_dir.glob('**/quant.sf'))
fastq_list[:5]

[PosixPath('/home/hanliu/project/genome_book/DevFB/quant/forebrain_E10.5_1.quant/quant.sf'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/quant/forebrain_E10.5_2.quant/quant.sf'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/quant/forebrain_E11.5_1.quant/quant.sf'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/quant/forebrain_E11.5_2.quant/quant.sf'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/quant/forebrain_E12.5_1.quant/quant.sf')]

In [32]:
# Peek at the first 10 rows of the first quantification table
# (tab-separated; the first column is used as the index)
first_quant_path = fastq_list[0]
pd.read_csv(first_quant_path, sep='\t', index_col=0, nrows=10)

Unnamed: 0_level_0,Length,EffectiveLength,TPM,NumReads
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUST00000193812.1,1070,821.0,0.0,0.0
ENSMUST00000082908.1,110,4.749,0.0,0.0
ENSMUST00000162897.1,4153,3904.0,0.038115,5.689
ENSMUST00000159265.1,2989,2740.0,0.0316,3.311
ENSMUST00000070533.4,3634,3385.0,0.0,0.0
ENSMUST00000192857.1,480,231.0,0.0,0.0
ENSMUST00000195335.1,2819,2570.0,0.173002,17.0
ENSMUST00000192336.1,2233,1984.0,0.448201,34.0
ENSMUST00000194099.1,2309,2060.0,0.203137,16.0
ENSMUST00000161581.1,250,20.633,0.0,0.0


In [28]:
# Build one metadata row per quant.sf file and save the table as CSV.
# The sample annotation is recovered from the name of the directory that holds the
# file, which follows the pattern "<tissue>_<dev_time>_<replicate>.quant".
records = []
for path in sorted(fastq_list):
    sample_name = path.parent.name
    # drop the trailing ".quant" first, otherwise it ends up inside the replicate
    # field (e.g. "1.quant" instead of "1")
    if sample_name.endswith('.quant'):
        sample_name = sample_name[:-len('.quant')]
    # split from the right so a tissue name that itself contains "_" stays in one piece
    tissue, time, rep = sample_name.rsplit('_', 2)
    records.append([tissue, time, rep, str(path)])

salmon_metadata = pd.DataFrame(records, columns=['tissue', 'dev_time', 'replicate', 'salmon_count_path'])
salmon_metadata.to_csv('metadata/salmon_metadata.csv')