Long-Read

In [1]:
import pandas as pd
import glob

# Define the paths for the folders
# Despite the "_folder" names these are glob patterns, not directories: each
# one matches every CSV whose file name contains "stats" inside the pb/ or
# ont/ directory. They are expanded with glob.glob() further down.
pb_folder = "/g/data/te53/t2t2024/analyses/rawdataeval/ncig/pb/*stats*.csv"
ont_folder = "/g/data/te53/t2t2024/analyses/rawdataeval/ncig/ont/*stats*.csv"

# Function to process files and add columns
def process_files(files, tech_label):
    """Load each CSV in ``files`` and tag it with technology and sample metadata.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read. Each file must have a ``Sample`` column.
    tech_label : str
        Value written to the ``tech`` column of every row.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file, in input order, with ``tech``, ``donor`` and
        ``flowcell`` columns added. ``donor`` and ``flowcell`` are the first
        and fourth ``_``-separated fields of ``Sample``.
    """
    def _load(path):
        frame = pd.read_csv(path)
        frame['tech'] = tech_label
        # Split once, then keep field 0 (donor) and field 3 (flowcell).
        sample_fields = frame['Sample'].str.split('_', expand=True)
        frame[['donor', 'flowcell']] = sample_fields[[0, 3]]
        return frame

    return [_load(path) for path in files]

# Read and process all CSV files from both folders
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the combined table reproducible between runs.
pb_files = sorted(glob.glob(pb_folder))
ont_files = sorted(glob.glob(ont_folder))

# Fail early with a readable message instead of letting pd.concat raise
# "No objects to concatenate" when neither pattern matches any file.
if not pb_files and not ont_files:
    raise FileNotFoundError(
        f"No stats CSV files matched {pb_folder!r} or {ont_folder!r}"
    )

pb_data = process_files(pb_files, 'pb')
ont_data = process_files(ont_files, 'ont')

# Combine the data from both technologies
df = pd.concat(pb_data + ont_data, ignore_index=True)
# .copy() turns the column subset into an independent frame, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df = df[['donor', 'tech', 'flowcell','Total_Bases', 'Total_Reads', 'Average_Read_Length', 'N50','N90']].copy()
df['Average_Read_Length'] = df['Average_Read_Length'].round(2)
# Independent copies: one is kept per flowcell (supplementary), the other is
# aggregated per donor/technology (main text) in later cells.
df_supplementary = df.copy()
df_maintext = df.copy()

In [2]:
# Collapse the per-flowcell rows to one row per donor and technology.
# NOTE(review): N50/N90 are unweighted means of the per-flowcell values, not
# the N50/N90 of the pooled reads (that cannot be derived from these summary
# columns) — confirm this is the intended definition for the table.
longread_per_donor = df_maintext.groupby(['donor','tech']).agg(
    Total_Bases=('Total_Bases', 'sum'),
    Total_Reads=('Total_Reads', 'sum'),
    N50=('N50', 'mean'),
    N90=('N90', 'mean')
).reset_index()

# Pooled average read length = total bases / total reads. Averaging the
# per-flowcell averages instead would weight a small flowcell the same as a
# large one and would disagree with the Total_Bases and Total_Reads columns
# reported in the same row. Must be computed before Total_Bases is rescaled.
# Assumes Total_Bases is in bases, as the later /1e9 rescaling implies — TODO confirm.
longread_per_donor['Average_Read_Length'] = (
    longread_per_donor['Total_Bases'] / longread_per_donor['Total_Reads']
)

longread_per_donor[['Average_Read_Length', 'N50', 'N90']] = longread_per_donor[['Average_Read_Length', 'N50', 'N90']].round(2)
# Report Total_Bases in units of 1e9 bases.
longread_per_donor['Total_Bases'] = (longread_per_donor['Total_Bases'] / 1e9).round(2)

# Restore the original column order of the exported table.
df_maintext = longread_per_donor[['donor', 'tech', 'Total_Bases', 'Total_Reads', 'Average_Read_Length', 'N50', 'N90']]

In [3]:
# Write the per-donor/per-technology long-read table to disk.
# index=True spells out the pandas default: the row index is written as the
# first, unnamed column of the CSV.
df_maintext.to_csv(
    "/g/data/te53/t2t2024/analyses/rawdataeval/finalcsv-manuscript/longread-maintext.csv",
    index=True,
)

In [4]:
# Order the per-flowcell rows by donor and technology, renumber the index,
# and express Total_Bases in units of 1e9 bases (two decimals).
# Note: the cell rescales df_supplementary in place of itself, so running it
# twice divides Total_Bases by 1e9 twice.
df_supplementary = (
    df_supplementary
    .sort_values(by=['donor', 'tech'])
    .reset_index(drop=True)
    .assign(Total_Bases=lambda frame: (frame['Total_Bases'] / 1e9).round(2))
)

In [5]:
# Write the per-flowcell long-read table to disk.
# index=True spells out the pandas default: the row index is written as the
# first, unnamed column of the CSV.
df_supplementary.to_csv(
    "/g/data/te53/t2t2024/analyses/rawdataeval/finalcsv-manuscript/longread-supplementary.csv",
    index=True,
)

Short-Read

In [6]:
import pandas as pd

# Read the TSV file into a DataFrame
# usecols=range(10) parses only the first ten columns of the table.
fastqc_table = pd.read_csv("/g/data/te53/t2t2024/analyses/rawdataeval/ncig/illumina/output/multiqc_data/multiqc_fastqc.txt",sep='\t',usecols=range(10))
# .copy() detaches the three-column subset from the parsed table, so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
df = fastqc_table[['Sample','Total Sequences','avg_sequence_length']].copy()
# Total_bases = sequence count x average sequence length, in units of 1e9.
df['Total_bases'] = ((df['avg_sequence_length'] * df['Total Sequences'])/1e9)
df[['Total_bases','avg_sequence_length']] = df[['Total_bases','avg_sequence_length']].round(2)

In [7]:
# 'Sample' is split on '.' into exactly four identifier columns; pandas raises
# if the split does not yield four fields.
id_fields = ['donor', 'uniqueID', 'tubeID', 'lane']
metric_fields = ['Total Sequences', 'avg_sequence_length', 'Total_bases']

df[id_fields] = df['Sample'].str.split('.', expand=True)
# Selecting identifiers followed by metrics also leaves out the original
# 'Sample' column, so no separate drop is needed.
df = df[id_fields + metric_fields]

# Independent copies for the supplementary and main-text tables built below.
df_supplementary = df.copy()
df_maintext = df.copy()

In [8]:
def _join_raw(values):
    """Join values that are already strings with ';'."""
    return ';'.join(values)


def _join_as_text(values):
    """Convert each value with str() and join the results with ';'."""
    return ';'.join(map(str, values))


# One row per (donor, uniqueID, tubeID); the values of that group's rows are
# packed into ';'-separated strings in row order.
shortread_groups = df_supplementary.groupby(['donor', 'uniqueID', 'tubeID'])
df_supplementary = shortread_groups.agg(
    {
        'lane': _join_raw,
        'Total Sequences': _join_as_text,
        'avg_sequence_length': _join_as_text,
        'Total_bases': _join_as_text,
    }
).reset_index()

In [9]:
# Columns written to the short-read supplementary CSV. 'tubeID' and 'lane'
# are present in df_supplementary but are not part of the export.
shortread_supplementary_columns = ['donor', 'uniqueID','Total Sequences','avg_sequence_length', 'Total_bases']
# index=True spells out the pandas default: the row index is written as the
# first, unnamed column of the CSV.
df_supplementary[shortread_supplementary_columns].to_csv(
    "/g/data/te53/t2t2024/analyses/rawdataeval/finalcsv-manuscript/shortread-supplementary.csv",
    index=True,
)

In [10]:
def _join_r1_r2(frame, r1_column, r2_column):
    """Render two columns as text and join them element-wise as 'R1;R2'."""
    return frame[r1_column].astype(str) + ';' + frame[r2_column].astype(str)


# Per donor and lane: sum the sequence counts and bases, average the lengths.
# NOTE(review): 'Total_bases' was already rounded to two decimals per row
# before this sum, and the length average is unweighted — confirm acceptable.
donor_lane_groups = df.groupby(['donor', 'lane'])
aggregated = donor_lane_groups.agg({
    'Total Sequences': 'sum',
    'avg_sequence_length': 'mean',
    'Total_bases': 'sum'
})
aggregated = aggregated.reset_index()

rounded_columns = ['Total_bases','avg_sequence_length']
aggregated[rounded_columns] = aggregated[rounded_columns].round(2)

# One row per donor, with a "<metric>_<lane>" column per metric and lane.
# The joins below require the lane values to be exactly 'R1' and 'R2'.
pivoted = aggregated.pivot(index='donor', columns='lane')
pivoted.columns = [f"{metric}_{lane}" for metric, lane in pivoted.columns]

# Pack each metric's R1 and R2 values into a single "R1;R2" string column.
pivoted['Total Sequences'] = _join_r1_r2(pivoted, 'Total Sequences_R1', 'Total Sequences_R2')
pivoted['avg_sequence_length'] = _join_r1_r2(pivoted, 'avg_sequence_length_R1', 'avg_sequence_length_R2')
pivoted['Total_bases'] = _join_r1_r2(pivoted, 'Total_bases_R1', 'Total_bases_R2')

# Keep only the packed columns and turn 'donor' back into a regular column.
packed_columns = ['Total Sequences', 'avg_sequence_length', 'Total_bases']
df_maintext = pivoted[packed_columns].reset_index()


In [11]:
# Write the per-donor short-read table to disk.
# index=True spells out the pandas default: the row index is written as the
# first, unnamed column of the CSV.
df_maintext.to_csv(
    "/g/data/te53/t2t2024/analyses/rawdataeval/finalcsv-manuscript/shortread-maintext.csv",
    index=True,
)

In [12]:
# Define the paths for the folders
# These rebind pb_folder/ont_folder (previously the "*stats*" patterns) to
# glob patterns matching every CSV whose file name contains "quality" in the
# pb/ and ont/ directories.
pb_folder = "/g/data/te53/t2t2024/analyses/rawdataeval/ncig/pb/*quality*.csv"
ont_folder = "/g/data/te53/t2t2024/analyses/rawdataeval/ncig/ont/*quality*.csv"

# Function to process files and add columns
def process_files(files, tech_label):
    """Read every CSV in ``files`` and annotate it with tech, donor and flowcell.

    This redefines (and therefore shadows) the ``process_files`` declared
    earlier in the notebook; the behaviour is the same.

    Args:
        files: Iterable of CSV paths. Each file must have a ``Sample`` column.
        tech_label: Value stored in the ``tech`` column of every row.

    Returns:
        A list with one DataFrame per input path, in input order. ``donor``
        is the first and ``flowcell`` the fourth ``_``-separated field of
        ``Sample``.
    """
    frames = []
    for csv_path in files:
        table = pd.read_csv(csv_path)
        sample_parts = table['Sample'].str.split('_', expand=True)
        table['tech'] = tech_label
        table['donor'] = sample_parts[0]
        table['flowcell'] = sample_parts[3]
        frames.append(table)
    return frames


In [13]:
# Expand each quality-file pattern and load the matching CSVs, one technology
# at a time. Relies on `glob` having been imported in the first cell.
pb_files = glob.glob(pb_folder)
pb_data = process_files(pb_files, 'pb')

ont_files = glob.glob(ont_folder)
ont_data = process_files(ont_files, 'ont')

In [15]:
# Stack the pb frames followed by the ont frames into one table with a fresh
# 0..n-1 index.
df = pd.concat([*pb_data, *ont_data], ignore_index=True)

In [18]:
# Keep only the columns used by the quality summaries below, in this order.
quality_columns = ['donor', 'flowcell', 'Read_Length', 'QV', 'Read_Numbers', 'tech']
df = df[quality_columns]

In [26]:
# Mean QV per donor and flowcell, with the group keys as ordinary columns.
# NOTE(review): this is an unweighted mean over rows. If each row is a
# (Read_Length, QV) bin holding Read_Numbers reads — TODO confirm — the
# per-read mean needs to be weighted by Read_Numbers.
df.groupby(['donor', 'flowcell'], as_index=False)['QV'].mean()

Unnamed: 0,donor,flowcell,QV
0,N002580,DA121306,53.112055
1,N002580,DA121307,53.030053
2,N002580,DA121328,52.947740
3,N002580,DA136084,53.325429
4,N002580,DA173513,53.713633
...,...,...,...
71,N008294,PTXXXX230364,25.679376
72,N008294,RGBX230026,32.775923
73,N008294,RGBX230033,33.073733
74,N008294,RGBX230091,33.956005


In [28]:
# Smallest and largest QV value seen for each technology.
qv_by_tech = df.groupby('tech')['QV']
overall_min_max_qv_by_tech = qv_by_tech.agg(['min', 'max']).reset_index()

In [29]:
# Display the per-technology minimum and maximum QV computed in the previous cell.
overall_min_max_qv_by_tech

Unnamed: 0,tech,min,max
0,ont,0,50
1,pb,21,93


In [30]:
# Mean QV per technology, with 'tech' as an ordinary column.
# NOTE(review): this is an unweighted mean over rows. If each row is a
# (Read_Length, QV) bin holding Read_Numbers reads — TODO confirm — the
# per-read mean needs to be weighted by Read_Numbers.
df.groupby('tech', as_index=False)['QV'].mean()

Unnamed: 0,tech,QV
0,ont,22.503957
1,pb,49.986322
