## Prepare input data

In [1]:
%%bash
# Download the raw sequencing runs with nf-core/fetchngs and stage the generated
# MultiQC config for the downstream pipeline.
# Stop at the first failing command so a failed download is not followed by a
# confusing error from `mv` further down.
set -euo pipefail
cd /workspace/shapeseq
mkdir -p data/fetchngs-input
# One run accession per line; the delimiter is quoted so the shell never
# expands anything inside the list.
cat << 'EOF' > data/fetchngs-input/fetchngs-samplesheet.csv
SRR6848182
SRR12235536
SRR12235529
EOF
nextflow run nf-core/fetchngs -profile docker,gitpod --outdir data/fetchngs-output --input data/fetchngs-input/fetchngs-samplesheet.csv
# Rename multiqc_config.yml to avoid conflict in next pipeline.
# The destination sub-directory has to exist before `mv` can write into it.
mkdir -p data/shapeseq-input/samplesheet
mv data/fetchngs-output/samplesheet/multiqc_config.yml data/shapeseq-input/samplesheet/custom_multiqc_config.yml

N E X T F L O W  ~  version 22.10.1
Pulling nf-core/fetchngs ...
 downloaded from https://github.com/nf-core/fetchngs.git
Launching `https://github.com/nf-core/fetchngs` [naughty_faggin] DSL2 - revision: 084e5ef303 [master]


------------------------------------------------------
                                        ,--./,-.
        ___     __   __   __   ___     /,-._.--~'
  |\ | |__  __ /  ` /  \ |__) |__         }  {
  | \| |       \__, \__/ |  \ |___     \`-._,-`-,
                                        `._,._,'
  nf-core/fetchngs v1.9-g084e5ef
------------------------------------------------------
Core Nextflow options
  revision       : master
  runName        : naughty_faggin
  containerEngine: docker
  launchDir      : /workspace/shapeseq/notebooks
  workDir        : /workspace/shapeseq/notebooks/work
  projectDir     : /home/gitpod/.nextflow/assets/nf-core/fetchngs
  userName       : gitpod
  profile        : docker
  configFiles    : /home/gitpod/.nextflow/assets/nf-core/

In [8]:
import csv

import pandas as pd

# Experiment accession of the DMSO sample that every treatment sample is paired with.
CONTROL_EXPERIMENT = "SRX8743875"

samplesheet_df = pd.read_csv("../data/fetchngs-output/samplesheet/samplesheet.csv")

# Samples whose title starts with "DMSO" are controls. `na=False` keeps the mask
# strictly boolean when a title is missing (such rows are treated as treatments),
# so the negation and the .loc assignments below cannot fail on NaN.
is_control = samplesheet_df["sample_title"].str.startswith("DMSO", na=False)
samplesheet_df.loc[is_control, "sample_type"] = "control"
# Set the sample_type to "treatment" for all other samples
samplesheet_df.loc[~is_control, "sample_type"] = "treatment"
# Only treatment rows get an associated control; control rows are left empty.
samplesheet_df.loc[samplesheet_df["sample_type"] == "treatment", "associated_control"] = CONTROL_EXPERIMENT

# Quote every field: sample titles contain commas.
samplesheet_df.to_csv("../data/shapeseq-input/samplesheet.csv", index=False, quoting=csv.QUOTE_ALL)
samplesheet_df[["sample", "sample_title", "sample_type", "associated_control"]]

Unnamed: 0,sample,sample_title,sample_type,associated_control
0,SRX3803456,DMS-MaPseq on E. coli (Total RNA),treatment,SRX8743875
1,SRX8743875,"DMSO, E. coli, in vivo (SSII)",control,
2,SRX8743882,"2A3, E. coli, in vivo (SSII)",treatment,SRX8743875


## Get Reference Sequences from RNAcentral
16S E. coli rRNA
https://rnacentral.org/api/v1/rna/URS00000ABFE9

23S E. coli rRNA
https://rnacentral.org/rna/URS00004B0F34


In [4]:
%%bash
# Download the reference FASTA records from RNAcentral and combine them into a
# single FASTA file with one header line and one sequence line per record.
# Fail fast: without this, a failed download leaves an empty file that would be
# concatenated silently into the reference.
set -euo pipefail
destination=../data/shapeseq-input/reference
mkdir -p "$destination"
# Empty the directory first so re-running the cell does not append duplicate
# records to reference_unsorted.fa.
rm -f "$destination"/*
for rnacentral_id in URS00000ABFE9 URS00004B0F34
do
    wget -O "$destination/${rnacentral_id}.fasta" "https://rnacentral.org/api/v1/rna/${rnacentral_id}.fasta"
    cat "$destination/${rnacentral_id}.fasta" >> "$destination/reference_unsorted.fa"
done
# 1. Split on ">" and join each record onto one line, with tabs in place of newlines.
# 2. Sort the records bytewise on the second space-separated field of the header.
# 3. Turn the first tab back into a newline (header / sequence boundary) and drop
#    the remaining tabs so each sequence ends up on a single line.
awk 'BEGIN{RS=">"} NR>1 {gsub("\n", "\t"); print ">"$0}' "$destination/reference_unsorted.fa" | \
    LC_ALL=C sort -t ' ' -k 2,2 | \
    awk '{sub("\t", "\n"); gsub("\t", ""); print $0}' > "$destination/reference_sorted.fa"

--2023-04-23 19:40:24--  https://rnacentral.org/api/v1/rna/URS00000ABFE9.fasta
Resolving rnacentral.org (rnacentral.org)... 193.62.193.83
Connecting to rnacentral.org (rnacentral.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1598 (1.6K) [text/fasta]
Saving to: ‘../data/shapeseq-input/reference/URS00000ABFE9.fasta’

     0K .                                                     100% 2.57M=0.001s

2023-04-23 19:40:24 (2.57 MB/s) - ‘../data/shapeseq-input/reference/URS00000ABFE9.fasta’ saved [1598/1598]

--2023-04-23 19:40:24--  https://rnacentral.org/api/v1/rna/URS00004B0F34.fasta
Resolving rnacentral.org (rnacentral.org)... 193.62.193.83
Connecting to rnacentral.org (rnacentral.org)|193.62.193.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2977 (2.9K) [text/fasta]
Saving to: ‘../data/shapeseq-input/reference/URS00004B0F34.fasta’

     0K ..                                                    100% 49.5M=0s

2023-0