# FASTQ QC

In [1]:
import pandas as pd
import pathlib

## Read metadata from last step
- The metadata table produced in the previous step records the information for every sample; it tells this step which FASTQ files to process.

In [7]:
# Load the sample table; the first CSV column becomes the row index
fastq_meta = pd.read_csv(
    filepath_or_buffer='metadata/fastq_metadata.csv',
    index_col=0,
)
# a bare last expression makes the notebook render the table
fastq_meta

Unnamed: 0_level_0,count_type,experiment_id,bio_sample_id,tissue,replicate,dev_time,file_name
File accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENCFF329ACL,reads,ENCSR160IIN,UBERON:0001890,forebrain,1,E11.5,forebrain_E11.5_1_ENCFF329ACL.fastq.gz
ENCFF251LNG,reads,ENCSR160IIN,UBERON:0001890,forebrain,2,E11.5,forebrain_E11.5_2_ENCFF251LNG.fastq.gz
ENCFF896COV,reads,ENCSR160IIN,UBERON:0001890,forebrain,2,E11.5,forebrain_E11.5_2_ENCFF896COV.fastq.gz
ENCFF959PSX,reads,ENCSR970EWM,UBERON:0001890,forebrain,2,E13.5,forebrain_E13.5_2_ENCFF959PSX.fastq.gz
ENCFF235DNM,reads,ENCSR970EWM,UBERON:0001890,forebrain,1,E13.5,forebrain_E13.5_1_ENCFF235DNM.fastq.gz
ENCFF270GKY,reads,ENCSR185LWM,UBERON:0001890,forebrain,1,E14.5,forebrain_E14.5_1_ENCFF270GKY.fastq.gz
ENCFF460TCF,reads,ENCSR185LWM,UBERON:0001890,forebrain,1,E14.5,forebrain_E14.5_1_ENCFF460TCF.fastq.gz
ENCFF126IRS,reads,ENCSR185LWM,UBERON:0001890,forebrain,2,E14.5,forebrain_E14.5_2_ENCFF126IRS.fastq.gz
ENCFF748SRJ,reads,ENCSR185LWM,UBERON:0001890,forebrain,2,E14.5,forebrain_E14.5_2_ENCFF748SRJ.fastq.gz
ENCFF447EXU,reads,ENCSR362AIZ,UBERON:0001890,forebrain,2,P0,forebrain_P0_2_ENCFF447EXU.fastq.gz


In [14]:
# Directory that receives the trimming output, stored as an absolute path
output_dir = pathlib.Path('fastq/trimmed/').absolute()

# Create the directory from Python (pathlib) instead of a shell command.
# parents=True also creates a missing 'fastq/' parent directory;
# exist_ok=True makes re-running this cell harmless.
output_dir.mkdir(parents=True, exist_ok=True)

# Directory the input FASTQ files are read from, stored as an absolute path
fastq_dir = pathlib.Path('fastq/data/').absolute()

## Run commands in parallel

In [20]:
# Standard-library tools used to launch external commands in parallel
import subprocess
from concurrent.futures import as_completed, ProcessPoolExecutor

# Maximum number of jobs the process pool runs at the same time
process = 25

In [25]:
# Run up to `process` trim_galore jobs at the same time (25 here, on a server;
# on a laptop use something like 4 or 8, depending on your CPU).

# NOTE: machine-specific install location; adjust for your environment
trim_galore_path = '/home/hanliu/pkg/TrimGalore-0.6.5/trim_galore'
with ProcessPoolExecutor(process) as executor:
    # executor is a process pool; it controls how many jobs run in parallel

    futures = {}
    for file_name in fastq_meta['file_name']:
        file_path = fastq_dir / file_name

        # Pass the command as an argument list and do NOT use shell=True:
        # each element reaches trim_galore verbatim, so paths containing spaces
        # or shell metacharacters cannot break (or be interpreted by) a shell.
        command = [
            trim_galore_path,
            str(file_path),
            '--fastqc',
            '-o', str(output_dir),
        ]

        # executor.submit schedules one job (a function plus its arguments, here
        # subprocess.run) and returns a future object that refers to that job;
        # remember which file each future belongs to
        future = executor.submit(subprocess.run, command, check=True)
        futures[future] = file_name

    # as_completed yields each future as soon as its job has finished
    for future in as_completed(futures):
        # look up the file name associated with this future
        file_name = futures[future]

        # This call is important: it re-raises any error from the job
        # (check=True makes subprocess.run raise CalledProcessError on a
        # non-zero exit status). The returned CompletedProcess is not needed.
        future.result()

        print(file_name, 'trim finished.')

forebrain_E14.5_2_ENCFF748SRJ.fastq.gz trim finished.
forebrain_E14.5_2_ENCFF126IRS.fastq.gz trim finished.
forebrain_E12.5_1_ENCFF294JRP.fastq.gz trim finished.
forebrain_E11.5_2_ENCFF896COV.fastq.gz trim finished.
forebrain_E11.5_2_ENCFF251LNG.fastq.gz trim finished.
forebrain_E14.5_1_ENCFF270GKY.fastq.gz trim finished.
forebrain_E14.5_1_ENCFF460TCF.fastq.gz trim finished.
forebrain_E12.5_2_ENCFF700OLU.fastq.gz trim finished.
forebrain_P0_2_ENCFF458NWF.fastq.gz trim finished.
forebrain_P0_2_ENCFF447EXU.fastq.gz trim finished.
forebrain_P0_1_ENCFF358MFI.fastq.gz trim finished.
forebrain_P0_1_ENCFF037JQC.fastq.gz trim finished.
forebrain_E10.5_2_ENCFF528EVC.fastq.gz trim finished.
forebrain_E10.5_2_ENCFF663SNC.fastq.gz trim finished.
forebrain_E10.5_1_ENCFF320FJX.fastq.gz trim finished.
forebrain_E10.5_1_ENCFF920CNZ.fastq.gz trim finished.
forebrain_E11.5_1_ENCFF329ACL.fastq.gz trim finished.
forebrain_E16.5_2_ENCFF114DRT.fastq.gz trim finished.
forebrain_E16.5_1_ENCFF931IVO.fastq.gz t

In [None]:
# Alternatively, this is how to run the same command sequentially, one file at a time

# trim_galore_path = '/home/hanliu/pkg/TrimGalore-0.6.5/trim_galore'
# for file_name in fastq_meta['file_name']:
#     file_path = fastq_dir / file_name
#     command = f'{trim_galore_path} {file_path} --fastqc -o {output_dir}'
#     subprocess.run(command, shell=True, check=True)
#     print(file_name, 'trim finished.')

## Make metadata for trimmed fastq

In [27]:
# Collect every file in output_dir whose name ends with 'trimmed.fq.gz'.
# Path.glob matches file names against a wildcard pattern.
fastq_list = [trimmed_path for trimmed_path in output_dir.glob('*trimmed.fq.gz')]

# peek at the first five matches
fastq_list[:5]

[PosixPath('/home/hanliu/project/genome_book/DevFB/fastq/trimmed/forebrain_E11.5_2_ENCFF251LNG_trimmed.fq.gz'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/fastq/trimmed/forebrain_E12.5_2_ENCFF700OLU_trimmed.fq.gz'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/fastq/trimmed/forebrain_E12.5_1_ENCFF920QAY_trimmed.fq.gz'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/fastq/trimmed/forebrain_P0_2_ENCFF458NWF_trimmed.fq.gz'),
 PosixPath('/home/hanliu/project/genome_book/DevFB/fastq/trimmed/forebrain_P0_1_ENCFF037JQC_trimmed.fq.gz')]

In [30]:
# Replace the raw fastq file names with the paths of the trimmed files.
# Trimmed files are named '<tissue>_<dev_time>_<replicate>_<accession>_trimmed.fq.gz',
# so the accession is the second-to-last '_'-separated field. Counting from the
# end keeps this correct even if the tissue name itself contains an underscore.
fastq_series = pd.Series({i.name.split('_')[-2]: str(i) for i in fastq_list})

# Column assignment aligns on the index (the file accession), so an accession
# without a trimmed file would silently become NaN. Fail loudly instead of
# letting incomplete metadata be saved.
missing = fastq_meta.index.difference(fastq_series.index)
if len(missing) > 0:
    raise FileNotFoundError(f'No trimmed fastq found for: {list(missing)}')
fastq_meta['file_name'] = fastq_series

In [31]:
# Display the metadata table again to inspect the updated file_name column
fastq_meta

Unnamed: 0_level_0,count_type,experiment_id,bio_sample_id,tissue,replicate,dev_time,file_name
File accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENCFF329ACL,reads,ENCSR160IIN,UBERON:0001890,forebrain,1,E11.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF251LNG,reads,ENCSR160IIN,UBERON:0001890,forebrain,2,E11.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF896COV,reads,ENCSR160IIN,UBERON:0001890,forebrain,2,E11.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF959PSX,reads,ENCSR970EWM,UBERON:0001890,forebrain,2,E13.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF235DNM,reads,ENCSR970EWM,UBERON:0001890,forebrain,1,E13.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF270GKY,reads,ENCSR185LWM,UBERON:0001890,forebrain,1,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF460TCF,reads,ENCSR185LWM,UBERON:0001890,forebrain,1,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF126IRS,reads,ENCSR185LWM,UBERON:0001890,forebrain,2,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF748SRJ,reads,ENCSR185LWM,UBERON:0001890,forebrain,2,E14.5,/home/hanliu/project/genome_book/DevFB/fastq/t...
ENCFF447EXU,reads,ENCSR362AIZ,UBERON:0001890,forebrain,2,P0,/home/hanliu/project/genome_book/DevFB/fastq/t...


In [32]:
# Save the metadata table (file_name column updated above) as a new CSV file
fastq_meta.to_csv(path_or_buf='metadata/trimmed_fastq_metadata.csv')