In [30]:
import os
import pandas as pd

# Prefix used to build the "absolute_path" of every FASTQ file, i.e. the paths
# that end up in the samplesheet (presumably the dataset location on the
# "linstat" host that runs the pipeline -- confirm).
linstat_EGAD_fastq_folder = "/project/rnaseq/EGAD00001006441"

# Local folder that is scanned for FASTQ files; the notebook expects one
# sub-folder per entry, each holding *.fastq.gz files.
local_EGAD_fastq_folder = "../../data/public_data_sets/EGAD00001006441/RNAseq_data"

# Where the generated RNA-seq samplesheet CSV is written.
rnaseq_samplesheet_path = "../../data/public_data_sets/EGAD00001006441/rnaseq_samplesheet.csv"

## Create RNA-seq sample sheet

In [11]:
# Build a table with one row per FASTQ file found under local_EGAD_fastq_folder.
# Records are collected in a plain list and turned into a DataFrame once at the
# end; growing a DataFrame row by row re-allocates it on every insertion.
fastq_columns = ["sample_name", "lane_number", "read_number", "relative_path", "absolute_path"]
fastq_records = []

# sorted() makes the scan order (and therefore the row index) reproducible;
# os.listdir returns entries in arbitrary order.
for sub_folder in sorted(os.listdir(local_EGAD_fastq_folder)):
    sub_folder_path = os.path.join(local_EGAD_fastq_folder, sub_folder)
    # Only descend into directories; ignore loose files next to them
    if not os.path.isdir(sub_folder_path):
        continue

    for file in sorted(os.listdir(sub_folder_path)):
        # Only FASTQ files are of interest
        if not file.endswith(".fastq.gz"):
            continue

        # Parse sample info out of the filename.
        # Fields are "_"-separated: index 2 is the lane number and index 3 the
        # read number, e.g. samplename-Carey_<field>_lanenumber_readnumber_001.fastq.gz
        # (the field at index 1 is presumably the Illumina sample number, e.g. S1 -- confirm).
        filename = file.split(".")[0]
        name_fields = filename.split("_")
        if len(name_fields) < 4:
            # Fail with the offending file instead of a bare IndexError
            raise ValueError(
                f"Unexpected FASTQ filename format: {os.path.join(sub_folder, file)}"
            )

        fastq_records.append({
            # The sample name is everything before the first "-"
            "sample_name": filename.split("-")[0],
            "lane_number": name_fields[2],
            "read_number": name_fields[3],
            # Path of the file in the local copy of the dataset
            "relative_path": os.path.join(sub_folder_path, file),
            # Path of the same file below linstat_EGAD_fastq_folder
            "absolute_path": os.path.join(linstat_EGAD_fastq_folder, sub_folder, file),
        })

# An empty scan almost certainly means a wrong folder; stop here rather than
# failing later with a less obvious error.
assert fastq_records, f"No .fastq.gz files found under {local_EGAD_fastq_folder}"

fastq_file_info = (
    pd.DataFrame(fastq_records, columns=fastq_columns)
    .sort_values(by=["sample_name", "lane_number", "read_number"])
)
display(fastq_file_info)

Unnamed: 0,sample_name,lane_number,read_number,relative_path,absolute_path
33,CL1,L001,R1,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471980...
13,CL1,L001,R2,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471980...
20,CL1,L002,R1,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471980...
51,CL1,L002,R2,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471980...
17,CL1,L003,R1,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471980...
...,...,...,...,...,...
38,CL7,L002,R2,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471983...
58,CL7,L003,R1,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471984...
37,CL7,L003,R2,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471984...
8,CL7,L004,R1,../../data/public_data_sets/EGAD00001006441/RN...,/project/rnaseq/EGAD00001006441/EGAF0000471984...


In [31]:
# Validate that each unique (sample_name, lane_number) pair has exactly one R1
# and one R2 file. The reads are compared as a sorted list rather than a set so
# that duplicated reads (e.g. two R1 files for the same lane) are reported here
# instead of surfacing later as a duplicate-entry error inside pivot().
reads_per_lane = (
    fastq_file_info
    .groupby(["sample_name", "lane_number"])["read_number"]
    .apply(sorted)
)
invalid_groups = reads_per_lane[reads_per_lane.apply(lambda reads: reads != ["R1", "R2"])]
assert len(invalid_groups) == 0, f"The following (sample_name, lane_number) pairs do not have exactly one 'R1' and one 'R2': {invalid_groups}"

# Reshape to one row per (sample_name, lane_number) pair, with the R1 and R2
# absolute paths side by side in columns "R1" and "R2".
pivot_df = (
    fastq_file_info
    .pivot(index=["sample_name", "lane_number"], columns="read_number", values="absolute_path")
    .reset_index()
)

# Format rnaseq samplesheet: one row per lane; rows of the same sample share
# the same "sample" value. Strandedness is set to "auto" for every row.
rnaseq_samplesheet = pd.DataFrame({
    "sample": pivot_df["sample_name"],
    "fastq_1": pivot_df["R1"],
    "fastq_2": pivot_df["R2"],
    "strandedness": "auto"
})

display(rnaseq_samplesheet)

# Write rnaseq samplesheet to file (without the DataFrame index column)
rnaseq_samplesheet.to_csv(rnaseq_samplesheet_path, index=False)


Unnamed: 0,sample,fastq_1,fastq_2,strandedness
0,CL1,/project/rnaseq/EGAD00001006441/EGAF0000471980...,/project/rnaseq/EGAD00001006441/EGAF0000471980...,auto
1,CL1,/project/rnaseq/EGAD00001006441/EGAF0000471980...,/project/rnaseq/EGAD00001006441/EGAF0000471980...,auto
2,CL1,/project/rnaseq/EGAD00001006441/EGAF0000471980...,/project/rnaseq/EGAD00001006441/EGAF0000471980...,auto
3,CL1,/project/rnaseq/EGAD00001006441/EGAF0000471981...,/project/rnaseq/EGAD00001006441/EGAF0000471981...,auto
4,CL10,/project/rnaseq/EGAD00001006441/EGAF0000471978...,/project/rnaseq/EGAD00001006441/EGAF0000471978...,auto
5,CL10,/project/rnaseq/EGAD00001006441/EGAF0000471978...,/project/rnaseq/EGAD00001006441/EGAF0000471978...,auto
6,CL10,/project/rnaseq/EGAD00001006441/EGAF0000471978...,/project/rnaseq/EGAD00001006441/EGAF0000471978...,auto
7,CL10,/project/rnaseq/EGAD00001006441/EGAF0000471978...,/project/rnaseq/EGAD00001006441/EGAF0000471978...,auto
8,CL14,/project/rnaseq/EGAD00001006441/EGAF0000471978...,/project/rnaseq/EGAD00001006441/EGAF0000471978...,auto
9,CL14,/project/rnaseq/EGAD00001006441/EGAF0000471979...,/project/rnaseq/EGAD00001006441/EGAF0000471979...,auto


## Run rnaseq pipeline

```{sh}
# The samplesheet is expected in the same /project/rnaseq/EGAD00001006441
# folder that the FASTQ paths listed inside it point to.
./nextflow run nf-core/rnaseq \
    --input /project/rnaseq/EGAD00001006441/rnaseq_samplesheet.csv \
    --outdir EGAD00001006441 \
    -N -bg \
    --igenomes_base /project/rnaseq/ngi-igenomes/igenomes \
    --genome GRCh38
```