# Illumina InterOp Report for run {{ RUN_ID }}
* **Notebook version:** v0.0.3
* **Created by:** NIHR Imperial BRC Genomics Facility
* **Maintained by:** NIHR Imperial BRC Genomics Facility
* **Docker image path:** [Dockerfile](https://github.com/imperial-genomics-facility/igf-dockerfiles/tree/main/illumina-interop/Dockerfile_v1)
* **Notebook code path:** [Templates](https://github.com/imperial-genomics-facility/igf-dockerfiles/tree/main/illumina-interop/templates)
* **Created on:** {{ DATE_TAG }}
* **Contact us:** [NIHR Imperial BRC Genomics Facility - Contact us](https://www.imperial.ac.uk/medicine/research-and-impact/facilities/genomics-facility/contact-us/)
* **License:** [Apache License 2.0](https://github.com/imperial-genomics-facility/igf-dockerfiles/blob/main/LICENSE)

## Code source
This notebook was developed using code from the following sources:
  * [Illumina InterOp](http://illumina.github.io/interop/index.html)

Send us your suggestions (or PRs) about how to improve this notebook.

Please add the following statement in all publications if you use any part of this notebook for your analysis: _“The NIHR Imperial BRC Genomics Facility has provided resources and support that have contributed to the research results reported within this paper.”_

## Table of contents

  * [Run information](#Run-information)
    * [Run overview](#Run-overview)
    * [Run stats for each read groups](#Run-stats-for-each-read-groups)
    * [Run stats for each lanes](#Run-stats-for-each-lanes)
  * [Plots](#Plots)
    * [Tile plots](#Tile-plots)
    * [Intensity plots](#Intensity-plots)
    * [Box plots for % ClusterCountPF and % DensityPF](#Box-plots-for---ClusterCountPF-and---DensityPF)
    * [PCA plots](#PCA-plots)
    * [Scatter plot for % Occupied vs % PF](#Scatter-plot-for---Occupied-vs---PF)

In [1]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
import re, os
import numpy as np
import pandas as pd
from typing import (Tuple, Iterator)
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.types import (LongType, DoubleType)
from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
import pyspark.sql.functions as sf
from pyspark.sql.functions import col, lit
from IPython.display import HTML
from sklearn.decomposition import PCA
import altair as alt
alt.renderers.enable("html")

RendererRegistry.enable('html')

In [None]:
%%bash
##
## creating InterOp metrics
##
## Dumps each InterOp binary that exists in the run folder to a CSV file
## named ${RUN_ID}_<metric>.csv inside METRICS_DIR, using `dumptext`.
## The three assignments below are template placeholders filled at render time.
set -eo pipefail

METRICS_DIR={{ METRICS_DIR }}
RUN_DIR={{ RUN_DIR }}
RUN_ID={{ RUN_ID }}

mkdir -p ${METRICS_DIR}

## "<InterOp binary>:<dumptext metric name>" pairs, processed in this order.
## NOTE(review): QCollapsed is gated on the same binary as QByLane
## (QMetricsByLaneOut.bin) - confirm this is the intended file.
for pair in \
  CorrectedIntMetricsOut.bin:CorrectedInt \
  ExtractionMetricsOut.bin:Extraction \
  QMetricsOut.bin:Q \
  TileMetricsOut.bin:Tile \
  ImageMetricsOut.bin:Image \
  QMetricsByLaneOut.bin:QByLane \
  QMetricsByLaneOut.bin:QCollapsed \
  EmpiricalPhasingMetricsOut.bin:EmpiricalPhasing \
  ExtendedTileMetricsOut.bin:ExtendedTile \
  ErrorMetricsOut.bin:Error \
  SummaryRunMetricsOut.bin:SummaryRun
do
  bin_file=${pair%%:*}
  metric=${pair##*:}
  ## a missing binary is skipped silently; a failing dumptext aborts (set -e)
  if [[ -e ${RUN_DIR}/InterOp/${bin_file} ]];then
    dumptext ${RUN_DIR} --metric=${metric} > ${METRICS_DIR}/${RUN_ID}_${metric}.csv
  fi
done

In [2]:
## collect the InterOp metrics CSV files and the run-level input/output paths
## (all "{{ ... }}" values are template placeholders filled at render time)
interop_metrics_path = "{{ METRICS_DIR }}"
runinfo_xml = os.path.join("{{ RUN_DIR }}", "RunInfo.xml")
runparameters_xml = os.path.join("{{ RUN_DIR }}", "RunParameters.xml")
overview_csv_output = "{{ OVERVIEW_CSV_OUTPUT }}"
tile_parquet_output = "{{ TILE_PARQUET_OUTOUT }}"

## metric names are tested in this order against every file name; a file is
## registered when it ends with "_<metric>.csv" (a SummaryRun CSV is not collected)
metric_names = (
    'CorrectedInt',
    'Error',
    'Extraction',
    'QByLane',
    'Q',
    'Tile',
    'EmpiricalPhasing',
    'Image',
    'QCollapsed',
    'ExtendedTile')

## one single-key dict {metric name: csv path} per matching file
metrics_list = list()
for i in os.listdir(interop_metrics_path):
    csv_path = os.path.join(interop_metrics_path, i)
    for metric_name in metric_names:
        if i.endswith(f"_{metric_name}.csv"):
            metrics_list.append({metric_name: csv_path})

In [None]:
## Build the Spark context and session used by the rest of the notebook.
## NUM_CPU and RAM_GB are template placeholders filled at render time; the
## master is "local[NUM_CPU]" and Spark logging is switched off.
## NOTE(review): with a local[...] master the work presumably runs inside the
## driver process - confirm that spark.executor.memory / spark.executor.cores
## have the intended effect here.
conf = SparkConf()
conf = \
    conf.\
    setMaster("local[{{ NUM_CPU }}]").\
    setAppName("InterOpReport").\
    set("spark.log.level", "OFF").\
    set("spark.driver.extraJavaOptions", "-Dlog4j.logger.org=OFF").\
    set("spark.sql.execution.arrow.pyspark.enabled", "true").\
    set("spark.executor.memory", "{{ RAM_GB }}g").\
    set("spark.executor.cores", "{{ NUM_CPU }}")
## `sc` and `spark` are module-level handles; get_interop_data_frames reads
## text files through `sc` and builds DataFrames through `spark`.
sc = SparkContext(conf=conf)
spark = \
    SparkSession(sc).\
    builder.\
    getOrCreate()

In [3]:
def read_runinfo_xml(runInfoXml_path: str) -> Tuple[str, str, str, int, pd.DataFrame, int]:
    """
    A function for reading RunInfo.xml file from Illumina sequencing run and returns data as Pandas DataFrame

    The file is scanned line by line with regular expressions (it is not parsed
    as XML), so every tag of interest has to sit on its own line. Both attribute
    orders of the ``<Read>`` tag are supported (``Number`` first or
    ``NumCycles`` first).

    :param runInfoXml_path: Filepath for RunInfo.xml
    :returns: Run_id, Flowcell_id, Instrument_id, Lane count, a Pandas dataframe containing the run configuration data and total cycles.
              The dataframe has one row per read with columns read_id, cycles,
              start_cycle, finish_cycle (both 1-based and inclusive) and
              index_read ('Y' or 'N'); it has no columns when no read is found.
              Ids default to '' and lane count to 0 when their tag is missing.
    :raises ValueError: if the file is missing or cannot be read
    """
    try:
        if not os.path.exists(runInfoXml_path):
            raise IOError(
                f'File {runInfoXml_path} not found')
        run_id_pattern = re.compile(r'<Run Id=\"(\S+)\"')
        flowcell_pattern = re.compile(r'<Flowcell>(\S+)\</Flowcell>')
        instrument_pattern = re.compile(r'<Instrument>(\S+)</Instrument>')
        read_pattern1 = \
            re.compile(
                r'<Read Number=\"(\d+)\" NumCycles=\"(\d+)\" IsIndexedRead=\"(Y|N)\"')
        read_pattern2 = \
            re.compile(
                r'<Read NumCycles=\"(\d+)\" Number=\"(\d+)\" IsIndexedRead=\"(Y|N)\"')
        lane_pattern = re.compile(r'<FlowcellLayout LaneCount=\"(\d+)\"')
        run_id = ''
        flowcell_id = ''
        instrument_id = ''
        lane_count = 0
        reads_stat = list()
        read_start = 0
        total_cycles = 0
        with open(runInfoXml_path, 'r') as fp:
            for raw_line in fp:
                # patterns are anchored at the tag start, so strip indentation
                line = raw_line.strip()
                run_id_match = run_id_pattern.match(line)
                if run_id_match:
                    (run_id,) = run_id_match.groups()
                flowcell_id_match = flowcell_pattern.match(line)
                if flowcell_id_match:
                    (flowcell_id,) = flowcell_id_match.groups()
                instrument_id_match = instrument_pattern.match(line)
                if instrument_id_match:
                    (instrument_id,) = instrument_id_match.groups()
                lane_match = lane_pattern.match(line)
                if lane_match:
                    (lane_count,) = lane_match.groups()
                # a single per-line match object: nothing can leak in from a
                # previous line and it is always defined
                read_match = read_pattern1.match(line)
                if read_match:
                    read_number, numcycle, index_read = read_match.groups()
                else:
                    read_match = read_pattern2.match(line)
                    if read_match:
                        numcycle, read_number, index_read = read_match.groups()
                if read_match:
                    numcycle = int(numcycle)
                    reads_stat.append({
                        'read_id': int(read_number),
                        'cycles': numcycle,
                        'start_cycle': read_start + 1,
                        'finish_cycle': read_start + numcycle,
                        'index_read': index_read})
                    # reads are laid out back to back in file order
                    read_start += numcycle
                    total_cycles += numcycle
        reads_stat = pd.DataFrame(reads_stat)
        if 'read_id' in reads_stat.columns:
            reads_stat['read_id'] = \
                reads_stat['read_id'].astype(int)
        return run_id, flowcell_id, instrument_id, int(lane_count), reads_stat, total_cycles
    except Exception as e:
        raise ValueError(
            f'Failed to read RunInfo.xml for sequencing run, error: {e}') from e

In [38]:
def read_runparameters_xml(runparametersXml_path: str) -> str:
    """
    A function for reading RunParameters.xml file from Illumina sequencing run and returns flowcell info

    The file is scanned line by line for a ``<FlowCellMode>...</FlowCellMode>``
    element that starts the line, with or without leading indentation. When
    several lines match, the last one wins.

    :param runparametersXml_path: Filepath for RunParameters.xml
    :returns: FlowCellMode, or an empty string when the tag is not present
    :raises ValueError: if the file is missing or cannot be read
    """
    try:
        if not os.path.exists(runparametersXml_path):
            raise IOError(
                f'File {runparametersXml_path} not found')
        # \s* (not \s+) so that an unindented tag is recognised as well
        flowcell_mode_pattern = \
            re.compile(
                r'\s*<FlowCellMode>(.*)</FlowCellMode>')
        flowcell_mode = ''
        with open(runparametersXml_path, 'r') as fp:
            for line in fp:
                mode_match = flowcell_mode_pattern.match(line)
                if mode_match:
                    flowcell_mode = mode_match.group(1)
        return flowcell_mode
    except Exception as e:
        raise ValueError(
            f'Failed to read RunParameters.xml for sequencing run, error: {e}') from e

In [None]:
def get_interop_data_frames(metrics_list: list[dict]) -> dict:
    """
    A function for loading the InterOp metrics CSV files as Spark DataFrames

    For every file the header is the first non-comment row (for 'Q' and
    'QByLane' the first non-comment row with more than three fields). Header
    names have '(' and ' ' replaced by '_' and ')' removed. Data rows are the
    non-comment lines that do not start with 'Lane' and whose field count
    equals the header's; every column is typed as STRING.

    :param metrics_list: A list of single-entry dicts {metrics name: csv path}
    :returns: A dictionary of metrics name to Spark DataFrame
    :raises ValueError: if any file can not be loaded
    """
    try:
        frames = dict()
        for metrics_entry in metrics_list:
            for metrics_name, metrics_path in metrics_entry.items():
                ## read text file (uses the module-level SparkContext)
                raw_lines = sc.textFile(metrics_path)
                ## candidate header rows: non-comment lines split on commas
                header_rows = \
                    raw_lines.\
                    filter(lambda line: not line.startswith('#')).\
                    map(lambda line: line.split(","))
                if metrics_name in ('Q', 'QByLane'):
                    ## skip the short leading rows of the Q metrics files
                    header_rows = \
                        header_rows.\
                        filter(lambda lines: len(lines) > 3)
                headers = header_rows.first()
                ## check if headers are present
                if len(headers) == 0:
                    raise ValueError(f"No headers found for {metrics_name}")
                ## make header names usable as column names
                headers = [
                    name.replace('(', '_').replace(')', '').replace(' ', '_')
                    for name in headers]
                schema = ','.join(f"{name} STRING" for name in headers)
                ## data rows with the same field count as the header
                data_rows = \
                    raw_lines.\
                    filter(lambda line: not line.startswith('#')).\
                    filter(lambda line: not line.startswith('Lane')).\
                    map(lambda line: line.split(",")).\
                    filter(lambda lines: len(lines) == len(headers))
                ## convert rdd to df (uses the module-level SparkSession)
                frames[metrics_name] = \
                    spark.createDataFrame(data_rows, schema=schema)
        return frames
    except Exception as e:
        raise ValueError(e)

In [None]:
def get_read_cycles(reads_stat: pd.DataFrame) -> list:
    """
    A function for listing the cycle ranges of the non-index reads

    :param reads_stat: A Pandas dataframe with columns index_read, start_cycle and finish_cycle
    :returns: A list of [start_cycle, finish_cycle] pairs, one per row whose index_read is 'N', in row order
    :raises ValueError: if the dataframe can not be processed
    """
    try:
        return [
            [record.get('start_cycle'), record.get('finish_cycle')]
            for record in reads_stat.to_dict(orient='records')
            if record.get('index_read') == 'N']
    except Exception as e:
        raise ValueError(e)

In [None]:
## shared fixture: two sequencing reads (ids 1 and 4) around two index reads;
## the later self-test cells reuse this module-level `reads_stat`
reads_stat = \
    pd.DataFrame(
        data=[
            [1, 28, 1, 28, 'N'],
            [2, 10, 29, 38, 'Y'],
            [3, 10, 39, 48, 'Y'],
            [4, 90, 49, 138, 'N']],
        columns=['read_id', 'cycles', 'start_cycle', 'finish_cycle', 'index_read'])
## only the two non-index reads must be reported, in row order
read_cycles = get_read_cycles(reads_stat=reads_stat)
assert len(read_cycles) == 2
assert len(read_cycles[0]) == 2
assert (read_cycles[0][0], read_cycles[0][1]) == (1, 28)
assert (read_cycles[1][0], read_cycles[1][1]) == (49, 138)

In [None]:
def get_mean_callout_intensities_per_tile(
        correctedIntDF: DataFrame,
        reads_stat: pd.DataFrame) -> DataFrame:
    """
    A function for calculating the mean called base counts per tile

    Only cycles inside the non-index reads are used. Each read contributes the
    window start_cycle <= Cycle < finish_cycle, so the cycle equal to
    finish_cycle is left out (NOTE(review): confirm the exclusive upper bound
    is intended).

    :param correctedIntDF: CorrectedInt metrics with Lane, Tile, Cycle and CalledCount_A/C/G/T columns
    :param reads_stat: A Pandas dataframe describing the reads, see get_read_cycles
    :returns: A Spark DataFrame with Lane, Tile and mean_CalledCount_A/T/G/C
    :raises ValueError: if the number of non-index reads is not 1 or 2, or on any other failure
    """
    try:
        read_list = \
            get_read_cycles(reads_stat=reads_stat)
        ## fill na
        correctedIntDF = \
            correctedIntDF.na.fill('0').na.replace('nan', '0')
        if len(read_list) not in (1, 2):
            raise ValueError(
                f"Read cycles are unknown: {len(read_list)}")
        ## one cycle window per non-index read, OR-ed together
        cycle_filter = None
        for start_cycle, finish_cycle in read_list:
            window = \
                (col("Cycle") >= start_cycle)&(col("Cycle") < finish_cycle)
            cycle_filter = \
                window if cycle_filter is None else (cycle_filter | window)
        ## the CSV columns are strings, cast before aggregating
        cast_exprs = \
            [f"cast({name} as int)" for name in ("Lane", "Tile", "Cycle")] + \
            [f"cast(CalledCount_{base} as long)" for base in "ACGT"]
        mean_exprs = [
            sf.mean(col(f"CalledCount_{base}")).alias(f"mean_CalledCount_{base}")
            for base in "ATGC"]
        typed_counts = correctedIntDF.selectExpr(*cast_exprs)
        windowed_counts = typed_counts.where(cycle_filter)
        return windowed_counts.groupBy("Lane", "Tile").agg(*mean_exprs)
    except Exception as e:
        raise ValueError(e)

In [None]:
## self-test for get_mean_callout_intensities_per_tile
## Uses the module-level `reads_stat` fixture defined in an earlier cell; all
## four cycles fall inside its first read, so each expected value is the plain
## mean of the four counts of one tile.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': '1', 'Tile': '1101', 'Cycle': '1', 'CalledCount_A': '1981', 'CalledCount_T': '2434', 'CalledCount_G': '1987', 'CalledCount_C': '1945'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '2', 'CalledCount_A': '2050', 'CalledCount_T': '2275', 'CalledCount_G': '1990', 'CalledCount_C': '2032'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '3', 'CalledCount_A': '2026', 'CalledCount_T': '2159', 'CalledCount_G': '2014', 'CalledCount_C': '2148'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '4', 'CalledCount_A': '2106', 'CalledCount_T': '2009', 'CalledCount_G': '2030', 'CalledCount_C': '2202'}]))
t = get_mean_callout_intensities_per_tile(t_df, reads_stat)
t_pdf = t.toPandas()
## one (Lane, Tile) group is expected
assert len(t_pdf.index) == 1
assert t_pdf['mean_CalledCount_A'].values[0] == 2040.75
assert t_pdf['mean_CalledCount_T'].values[0] == 2219.25
assert t_pdf['mean_CalledCount_G'].values[0] == 2005.25
assert t_pdf['mean_CalledCount_C'].values[0] == 2081.75

In [None]:
def get_mean_error_rate_per_tile(
        errorDF: DataFrame,
        reads_stat: pd.DataFrame) -> DataFrame:
    """
    A function for calculating the mean error rate per tile

    Only cycles inside the non-index reads are used. Each read contributes the
    window start_cycle <= Cycle < finish_cycle, so the cycle equal to
    finish_cycle is left out (NOTE(review): confirm the exclusive upper bound
    is intended).

    :param errorDF: Error metrics with Lane, Tile, Cycle and ErrorRate columns
    :param reads_stat: A Pandas dataframe describing the reads, see get_read_cycles
    :returns: A Spark DataFrame with Lane, Tile and mean_ErrorRate, ordered by Lane and Tile
    :raises ValueError: if the number of non-index reads is not 1 or 2, or on any other failure
    """
    try:
        read_list = \
            get_read_cycles(reads_stat=reads_stat)
        ## fill na
        errorDF = \
            errorDF.na.fill('0').na.replace('nan', '0')
        if len(read_list) not in (1, 2):
            raise ValueError(
                f"Read cycles are unknown: {len(read_list)}")
        ## one cycle window per non-index read, OR-ed together
        cycle_filter = None
        for start_cycle, finish_cycle in read_list:
            window = \
                (col("Cycle") >= start_cycle)&(col("Cycle") < finish_cycle)
            cycle_filter = \
                window if cycle_filter is None else (cycle_filter | window)
        ## cast the columns to numeric types before aggregating
        typed_errors = \
            errorDF.selectExpr(
                "cast(Lane as int)",
                "cast(Tile as int)",
                "cast(Cycle as int)",
                "cast(ErrorRate as float)")
        mean_error = \
            sf.mean(col("ErrorRate")).alias("mean_ErrorRate")
        per_tile = \
            typed_errors.where(cycle_filter).groupBy("Lane", "Tile").agg(mean_error)
        return per_tile.orderBy("Lane", "Tile")
    except Exception as e:
        raise ValueError(e)

In [None]:
## self-test for get_mean_error_rate_per_tile
## Uses the module-level `reads_stat` fixture defined in an earlier cell; the
## four cycles of the single tile all fall inside its first read.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 2103, 'Cycle': 1, 'ErrorRate': 0.17},
        {'Lane': 1, 'Tile': 2103, 'Cycle': 2, 'ErrorRate': 0.05},
        {'Lane': 1, 'Tile': 2103, 'Cycle': 3, 'ErrorRate': 0.02},
        {'Lane': 1, 'Tile': 2103, 'Cycle': 4, 'ErrorRate': 0.05}]))
t = \
    get_mean_error_rate_per_tile(
        errorDF=t_df,
        reads_stat=reads_stat)
t_pdf = t.toPandas()
assert 'mean_ErrorRate' in t_pdf.columns
## compared at three decimals because the function casts ErrorRate to float
assert f"{float(t_pdf['mean_ErrorRate'].values[0]):.3f}" == f"{float(np.mean([0.17, 0.05, 0.02, 0.05])):.3f}"

In [None]:
def get_cluster_and_density_per_tile(
        tileDF: DataFrame) -> DataFrame:
    """
    A function for calculating the passing-filter share of clusters and density per tile

    The values are summed over all rows of a tile before dividing, and the
    results are plain ratios (they are not multiplied by 100).

    :param tileDF: Tile metrics with Lane, Tile, Read, ClusterCount, ClusterCountPF, Density and DensityPF columns
    :returns: A Spark DataFrame with Lane, Tile, PCT_ClusterCountPF and PCT_DensityPF, ordered by Lane and Tile
    :raises ValueError: on any failure
    """
    try:
        ## fill na
        cleaned_tiles = \
            tileDF.na.fill('0').na.replace('nan', '0')
        ## cast the columns to numeric types before aggregating
        int_columns = ("Lane", "Tile", "Read")
        decimal_columns = (
            "ClusterCount", "ClusterCountPF", "Density", "DensityPF")
        typed_tiles = \
            cleaned_tiles.selectExpr(
                *[f"cast({name} as int)" for name in int_columns],
                *[f"cast({name} as decimal)" for name in decimal_columns])
        ## get pct cluster and density PF
        cluster_pf_ratio = \
            sf.sum(col("ClusterCountPF"))/sf.sum(col("ClusterCount"))
        density_pf_ratio = \
            sf.sum(col("DensityPF"))/sf.sum(col("Density"))
        per_tile = \
            typed_tiles.groupBy("Lane", "Tile").agg(
                cluster_pf_ratio.alias("PCT_ClusterCountPF"),
                density_pf_ratio.alias("PCT_DensityPF"))
        return per_tile.orderBy("Lane", "Tile")
    except Exception as e:
        raise ValueError(e)

In [None]:
## self-test for get_cluster_and_density_per_tile
## Two identical rows (reads 1 and 4) of one tile are summed before dividing,
## so the ratios equal those of a single row.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 1101, 'Read': 1, 'ClusterCount': 4091900, 'ClusterCountPF': 3430010, 'Density': 2961260, 'DensityPF': 2482260},
        {'Lane': 1, 'Tile': 1101, 'Read': 4, 'ClusterCount': 4091900, 'ClusterCountPF': 3430010, 'Density': 2961260, 'DensityPF': 2482260}]))
t = get_cluster_and_density_per_tile(
        tileDF=t_df)
t_pdf = t.toPandas()
assert 'PCT_ClusterCountPF' in t_pdf.columns
assert 'PCT_DensityPF' in t_pdf.columns
## NOTE(review): exact float equality against literals - presumably tied to
## the precision of Spark's decimal division; confirm when upgrading Spark.
assert float(t_pdf['PCT_ClusterCountPF'].values[0]) == 0.838243847601358782
assert float(t_pdf['PCT_DensityPF'].values[0]) == 0.8382445310442177
# print(t_pdf.to_dict(orient='records'))

In [None]:
def get_pct_q30_score_per_tile(
        qDF: DataFrame,
        reads_stat: pd.DataFrame) -> DataFrame:
    """
    A function for calculating the share of Q30 base calls per tile

    When the input has more than three 'Bin_' columns, Bin_30 to Bin_50 are
    counted as Q30 bins; otherwise only Bin_3 is. Only rows with Tile > 0 and
    cycles inside the non-index reads are used. Each read contributes the
    window start_cycle <= Cycle < finish_cycle, so the cycle equal to
    finish_cycle is left out (NOTE(review): confirm the exclusive upper bound
    is intended). The result is a plain ratio (not multiplied by 100).

    :param qDF: Q metrics with Lane, Tile, Cycle and Bin_* columns
    :param reads_stat: A Pandas dataframe describing the reads, see get_read_cycles
    :returns: A Spark DataFrame with Lane, Tile and PCT_Q30, ordered by Lane and Tile
    :raises ValueError: if the number of non-index reads is not 1 or 2, or on any other failure
    """
    try:
        read_list = \
            get_read_cycles(reads_stat=reads_stat)
        ## set q30 bins
        bin_columns = [
            c for c in qDF.columns if c.startswith('Bin_')]
        if len(bin_columns) > 3:
            qbin_list = [f"Bin_{c}" for c in range(30, 51)]
        else:
            qbin_list = ["Bin_3",]
        non_qbin_list = [
            c for c in bin_columns if c not in qbin_list]
        ## fill na
        qDF = \
            qDF.na.fill('0').na.replace('nan', '0')
        if len(read_list) not in (1, 2):
            raise ValueError(
                f"Read cycles are unknown: {len(read_list)}")
        ## one cycle window per non-index read, OR-ed together
        cycle_filter = None
        for start_cycle, finish_cycle in read_list:
            window = \
                (col("Cycle") >= start_cycle)&(col("Cycle") < finish_cycle)
            cycle_filter = \
                window if cycle_filter is None else (cycle_filter | window)
        ## every column is cast to long before filtering
        typed_q = \
            qDF.selectExpr(
                *[f"cast({c} as long) as {c}" for c in qDF.columns])
        ## sum each bin per tile over the selected cycles
        bin_sums = \
            typed_q.\
            filter(col("Tile") > 0).\
            where(cycle_filter).\
            groupBy("Lane", "Tile").\
            agg(*[sf.sum(col(c)).alias(c) for c in bin_columns])
        ## total the Q30 and the non-Q30 bins
        bin_totals = \
            bin_sums.\
            withColumn('non_q_bin_total', sf.expr("+".join(non_qbin_list))).\
            withColumn('q_bin_total', sf.expr("+".join(qbin_list)))
        q30_ratio = \
            col("q_bin_total")/(col("non_q_bin_total")+col("q_bin_total"))
        return \
            bin_totals.\
            select(
                col("Lane"),
                col("Tile"),
                q30_ratio.alias("PCT_Q30")).\
            orderBy("Lane", "Tile")
    except Exception as e:
        raise ValueError(e)

In [None]:
## self-test for get_pct_q30_score_per_tile
## The frame has three Bin_ columns, so the function treats Bin_3 as the only
## Q30 bin. Uses the module-level `reads_stat` fixture from an earlier cell.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': '1', 'Tile': '1101', 'Cycle': '1', 'Bin_1': '55649', 'Bin_2': '84702', 'Bin_3': '3275160'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '2', 'Bin_1': '55649', 'Bin_2': '96568', 'Bin_3': '3263293'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '3', 'Bin_1': '30280', 'Bin_2': '65061', 'Bin_3': '3320171'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '4', 'Bin_1': '31916', 'Bin_2': '58105', 'Bin_3': '3325490'}]))
t = get_pct_q30_score_per_tile(
        qDF=t_df,
        reads_stat=reads_stat)
t_pdf = t.toPandas()
assert 'PCT_Q30' in t_pdf.columns
## expected value is sum(Bin_3) / sum(Bin_1 + Bin_2 + Bin_3) over the four cycles
assert float(t_pdf['PCT_Q30'].values[0]) == 0.9650176796385666
# t_pdf.to_dict(orient='records')

In [None]:
def get_occupied_pct_per_tile(
        tileDF: DataFrame,
        extTileDF: DataFrame) -> DataFrame:
    """
    A function for calculating the occupied and passing-filter share of clusters per tile

    Only the Tile metrics rows with Read == 1 are used. They are inner-joined
    to the ExtendedTile metrics on Lane and Tile, so tiles missing from either
    input are dropped. Both results are plain ratios (not multiplied by 100).

    :param tileDF: Tile metrics with Lane, Tile, Read, ClusterCount and ClusterCountPF columns
    :param extTileDF: ExtendedTile metrics with Lane, Tile and OccupiedCount columns
    :returns: A Spark DataFrame with Lane, Tile, PCT_Occupied and PCT_ClusterCountPF
    :raises ValueError: on any failure
    """
    try:
        ## fill na
        tileDF = tileDF.na.fill('0')
        tileDF = tileDF.na.replace('nan', '0')
        ## fill na
        extTileDF = extTileDF.na.fill('0')
        extTileDF = extTileDF.na.replace('nan', '0')
        ## calculate total cluster count per tile
        tileTileDF = \
            tileDF.\
            selectExpr(
                "cast(Lane as int)",
                "cast(Tile as int)",
                "cast(Read as int)",
                "cast(ClusterCount as decimal)",
                "cast(ClusterCountPF as decimal)").\
            filter(col("Read")==1).\
            groupBy("Lane", "Tile").\
            agg(
                sf.sum(col("ClusterCount")).alias("total_ClusterCount"),
                sf.sum(col("ClusterCountPF")).alias("total_ClusterCountPF")).\
            orderBy("Lane", "Tile")
        ## join extendedTile and Tile
        ## the condition is built from the un-aliased frames and then applied
        ## to their aliased copies below
        ## NOTE(review): extTileDF Lane/Tile are not cast here - presumably
        ## Spark coerces them for the comparison; confirm for string inputs
        joinExpression = \
            (tileTileDF['Lane'] == extTileDF['Lane'])&\
            (tileTileDF['Tile'] == extTileDF['Tile'])
        joinType = "inner"
        ## set alias
        tileTileDF_alias = tileTileDF.alias('t')
        extTileDF_alias = extTileDF.alias('et')
        ## join DFs
        ## Lane and Tile are taken from the Tile side, OccupiedCount from the
        ## ExtendedTile side
        joined_tiles = \
            tileTileDF_alias.\
            join(extTileDF_alias, joinExpression, joinType).\
            select(
                "t.Lane",
                "t.Tile",
                "t.total_ClusterCount",
                "t.total_ClusterCountPF",
                "et.OccupiedCount")
        ## calculate pct occupied
        tiles_pct_occupied = \
            joined_tiles.\
            selectExpr(
                "Lane", "Tile",
                "cast(OccupiedCount as decimal) / total_ClusterCount as PCT_Occupied",
                "total_ClusterCountPF / total_ClusterCount as PCT_ClusterCountPF")
        return tiles_pct_occupied
    except Exception as e:
        raise ValueError(e)

In [None]:
## self-test for get_occupied_pct_per_tile
## The Read 4 row is dropped by the function's Read == 1 filter, so the
## expected ratios are computed from the Read 1 counts alone.
t_tileDF = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 1101, 'Read': 1, 'ClusterCount': 4091900, 'ClusterCountPF': 3430010},
        {'Lane': 1, 'Tile': 1101, 'Read': 4, 'ClusterCount': 4091900, 'ClusterCountPF': 3430010}]))
t_etileDF = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 1101, 'OccupiedCount': 3615010}]))
t = get_occupied_pct_per_tile(t_tileDF, t_etileDF)
t_pdf = t.toPandas()
assert 'PCT_Occupied' in t_pdf.columns
assert float(t_pdf['PCT_Occupied'].values[0]) == 3615010/4091900
assert 'PCT_ClusterCountPF' in t_pdf.columns
assert float(t_pdf['PCT_ClusterCountPF'].values[0]) == 3430010/4091900
# t_pdf.to_dict(orient='records')

In [None]:
def count_phasing(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.DataFrame]:
    for pdf in iterator:
      cycle = pdf.Cycle
      phasing = pdf.Phasing
      prephasing = pdf.Prephasing
      slope_p, offset_p = np.polyfit(cycle.values[0], np.float32(phasing.values[0]), 1)
      slope_pr, offset_pr = np.polyfit(cycle.values[0], np.float32(prephasing.values[0]), 1)
      pdf = pdf.assign(slope_p=slope_p)
      pdf = pdf.assign(offset_p=offset_p)
      pdf = pdf.assign(slope_pr=slope_pr)
      pdf = pdf.assign(offset_pr=offset_pr)
      yield pdf

def get_phase_data_per_tile(
        phaseDF: DataFrame,
        reads_stat: pd.DataFrame) -> DataFrame:
    try:
        read_list = \
            get_read_cycles(reads_stat=reads_stat)
        ## fill na
        phaseDF = phaseDF.na.fill('0')
        phaseDF = phaseDF.na.replace('nan', '0')
        if len(read_list) == 2:
            phaseTileDF = \
                phaseDF.\
                selectExpr(
                    "cast(Lane as int)",
                    "cast(Tile as int)",
                    "cast(Cycle as int)",
                    "cast(Phasing as double)",
                    "cast(Prephasing as double)").\
                where(
                    ((col("Cycle") >= read_list[0][0])&(col("Cycle") < read_list[0][1]))|\
                    ((col("Cycle") >= read_list[1][0])&(col("Cycle") < read_list[1][1]))).\
                withColumn("NewCycle",
                    sf.when(col("Cycle") >= read_list[1][0], col("Cycle") - (read_list[1][0] -1)).\
                            when(col("Cycle") < read_list[1][0], col("Cycle"))).\
                groupBy("Lane", "Tile", "NewCycle").\
                agg(
                    sf.mean("Phasing").alias("Phasing"),
                    sf.mean("Prephasing").alias("Prephasing")
                ).\
                groupBy("Lane", "Tile").\
                agg(
                    sf.collect_list("NewCycle").alias("Cycle"),
                    sf.collect_list("Phasing").alias("Phasing"),
                    sf.collect_list("Prephasing").alias("Prephasing"))
        elif len(read_list) == 1:
            phaseTileDF = \
                phaseDF.\
                selectExpr(
                    "cast(Lane as int)",
                    "cast(Tile as int)",
                    "cast(Cycle as int)",
                    "cast(Phasing as double)",
                    "cast(Prephasing as double)").\
                filter((col("Cycle") >= read_list[0][0])&(col("Cycle") < read_list[0][1])).\
                groupBy("Lane", "Tile").\
                agg(
                    sf.collect_list("Cycle").alias("Cycle"),
                    sf.collect_list("Phasing").alias("Phasing"),
                    sf.collect_list("Prephasing").alias("Prephasing"))
        else:
            raise ValueError(
                f"Read cycles are unknown: {len(read_list)}")
        ## get new df schema
        phaseTileDF_schema = \
            phaseTileDF.\
            withColumn("slope_p", lit(0.0).cast(DoubleType())).\
            withColumn("offset_p", lit(0.0).cast(DoubleType())).\
            withColumn("slope_pr", lit(0.0).cast(DoubleType())).\
            withColumn("offset_pr", lit(0.0).cast(DoubleType())).\
            schema
        ## calculate phasing data using Pandas 
        phaseDF_calc = \
            phaseTileDF.\
            mapInPandas(count_phasing, schema=phaseTileDF_schema)
        return phaseDF_calc
    except Exception as e:
        raise ValueError(e)

In [None]:
t_df = \
    spark.createDataFrame(
        pd.DataFrame([{'Lane': 1, 'Tile': 1101, 'Cycle': 1, 'Phasing': 3.0, 'Prephasing': 1.0},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 2, 'Phasing': 1.25, 'Prephasing': 0.75},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 3, 'Phasing': 1.0, 'Prephasing': 0.75},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 4, 'Phasing': 1.75, 'Prephasing': 1.5},
                      {'Lane': 1, 'Tile': 1102, 'Cycle': 1, 'Phasing': 3.1, 'Prephasing': 1.0},
                      {'Lane': 1, 'Tile': 1102, 'Cycle': 2, 'Phasing': 1.5, 'Prephasing': 0.75},
                      {'Lane': 1, 'Tile': 1102, 'Cycle': 3, 'Phasing': 1.5, 'Prephasing': 0.75},
                      {'Lane': 1, 'Tile': 1102, 'Cycle': 4, 'Phasing': 1.7, 'Prephasing': 1.5}]))
t = get_phase_data_per_tile(
        phaseDF=t_df,
        reads_stat=reads_stat)
s_p, i_p = np.polyfit([1, 2, 3, 4], np.float32([3.0999999046325684, 1.5, 1.5, 1.7000000476837158]), 1)
s_pr, i_pr = np.polyfit([1,2,3,4], np.float32([1.0, 0.75, 0.75, 1.5]), 1)
t_pdf = t.toPandas()
# print(s_p, i_p)
# t.show(truncate=False)
t_pdf = t_pdf[t_pdf['Tile'].astype(int)==1101]
# print(t_pdf.to_dict(orient='records'))
assert 'slope_p' in t_pdf.columns
assert 'slope_pr' in t_pdf.columns
assert 'offset_p' in t_pdf.columns
assert 'offset_pr' in t_pdf.columns
# print(f"{float(t_pdf['offset_p'].values[0]):.1f} , {i_p:.1f}")
assert f"{float(t_pdf['slope_p'].values[0]):.1f}" == f"{s_p:.1f}"
assert f"{float(t_pdf['offset_p'].values[0]):.1f}" == f"{i_p:.1f}"
assert f"{float(t_pdf['slope_pr'].values[0]):.1f}" == f"{s_pr:.1f}"
assert f"{float(t_pdf['offset_pr'].values[0]):.1f}" == f"{i_pr:.1f}"
t_pdf = t.toPandas()
t_pdf = t_pdf[t_pdf['Tile'].astype(int)==1102]
s_p, i_p = np.polyfit([1,2,3,4], [3.1, 1.5, 1.5, 1.7], 1)
assert f"{float(t_pdf['slope_p'].values[0]):.1f}" == f"{s_p:.1f}"
assert f"{float(t_pdf['offset_p'].values[0]):.1f}" == f"{i_p:.1f}"
# print(t_pdf.to_dict(orient='records'))
t_df = \
    spark.createDataFrame(
        pd.DataFrame([{'Lane': 1, 'Tile': 1101, 'Cycle': 1, 'Phasing': 3, 'Prephasing': 1},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 2, 'Phasing': 1.25, 'Prephasing': 0.75},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 3, 'Phasing': 1, 'Prephasing': 0.75},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 4, 'Phasing': 1.75, 'Prephasing': 1.5},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 49, 'Phasing': 2, 'Prephasing': 2},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 50, 'Phasing': 1, 'Prephasing': 1},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 51, 'Phasing': 1.25, 'Prephasing': 1.75},
                      {'Lane': 1, 'Tile': 1101, 'Cycle': 52, 'Phasing': 2, 'Prephasing': 2}]))
s_p, i_p = \
    np.polyfit(
        [1,2,3,4], 
        [np.mean([3.0, 2.0]), np.mean([1.25, 1.0]), np.mean([1.0, 1.25]), np.mean([1.75, 2.0])], 1)
t = get_phase_data_per_tile(
        phaseDF=t_df,
        reads_stat=reads_stat)
t_pdf = t.toPandas()
assert 'slope_p' in t_pdf.columns
assert 'slope_pr' in t_pdf.columns
assert 'offset_p' in t_pdf.columns
assert 'offset_pr' in t_pdf.columns
# print(s_p, i_p)
# print(t_pdf.to_dict(orient="records"))
assert f"{float(t_pdf['slope_p'].values[0]):.1f}" == f"{s_p:.1f}"
assert f"{float(t_pdf['offset_p'].values[0]):.1f}" == f"{i_p:.1f}"

In [None]:
def get_intensity_c1_per_tile(
        extractionDF: DataFrame,
        reads_stat: pd.DataFrame) -> DataFrame:
    """Average the max-intensity channel per (Lane, Tile) over the read cycles.

    The cycle ranges come from ``get_read_cycles(reads_stat)``; only one or
    two ranges are supported. Each range is treated as ``[start, finish)``.

    :param extractionDF: Spark data frame with Lane, Tile, Cycle and one of
        the ``MaxIntensity_*`` columns.
    :param reads_stat: read layout table passed on to ``get_read_cycles``.
    :returns: Spark data frame with Lane, Tile and ``intensity_c1``,
        ordered by Lane and Tile.
    :raises ValueError: on an unsupported number of reads or any other failure.
    """
    try:
        cycle_ranges = \
            get_read_cycles(reads_stat=reads_stat)
        ## replace nulls and 'nan' strings with '0'
        cleanDF = \
            extractionDF.na.fill('0').na.replace('nan', '0')
        ## choose the intensity column; a later match overrides an earlier one
        intensity_col = "MaxIntensity_A"
        for candidate in ("MaxIntensity_blue", "MaxIntensity_RED"):
            if candidate in cleanDF.columns:
                intensity_col = candidate
        ## only one or two read ranges are understood
        n_reads = len(cycle_ranges)
        if n_reads not in (1, 2):
            raise ValueError(
                f"Read cycles are unknown: {n_reads}")
        ## keep the cycles that fall in any of the read ranges
        cycle_filter = \
            (col("Cycle") >= cycle_ranges[0][0]) & (col("Cycle") < cycle_ranges[0][1])
        if n_reads == 2:
            cycle_filter = \
                cycle_filter | \
                ((col("Cycle") >= cycle_ranges[1][0]) & (col("Cycle") < cycle_ranges[1][1]))
        typedDF = \
            cleanDF.selectExpr(
                "cast(Lane as int)",
                "cast(Tile as int)",
                "cast(Cycle as int)",
                f"cast({intensity_col} as long)")
        per_tile_mean = \
            typedDF.\
            filter(col("Tile") > 0).\
            where(cycle_filter).\
            groupBy("Lane", "Tile").\
            agg(sf.mean(col(intensity_col)).alias("intensity_c1"))
        return per_tile_mean.orderBy("Lane", "Tile")
    except Exception as e:
        raise ValueError(e)

In [None]:
# Check get_intensity_c1_per_tile on one tile with four cycles of
# string-typed values.
# NOTE(review): relies on the `reads_stat` left over from an earlier cell; the
# expected mean over all four cycles presumes its read ranges cover
# cycles 1-4 - confirm.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': '1', 'Tile': '1101', 'Cycle': '1', 'MaxIntensity_RED': '2264'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '2', 'MaxIntensity_RED': '2217'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '3', 'MaxIntensity_RED': '2295'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '4', 'MaxIntensity_RED': '2211'}]))
t = \
    get_intensity_c1_per_tile(
        extractionDF=t_df,
        reads_stat=reads_stat)
t_pdf = t.toPandas()
# The aggregated column must exist and equal the plain mean of the inputs.
assert 'intensity_c1' in t_pdf.columns
assert float(t_pdf['intensity_c1'].values[0]) == np.mean([2264, 2217, 2295, 2211])
# t_pdf.to_dict(orient='records')

In [None]:
def merge_all_tile_data(
        lane_count: int,
        intensityC1TileDF: DataFrame,
        phaseCalcDF: DataFrame,
        tileOccupiedDF: DataFrame,
        q30PctDF: DataFrame,
        tileClusterAndDensityDF: DataFrame,
        errorRateTileDF: DataFrame,
        meanCalledintensityDF: DataFrame
        ) -> DataFrame:
    """Join all per-tile metric frames into one frame keyed by Lane and Tile.

    Each input is repartitioned by Lane into ``lane_count`` partitions and
    registered as a temp view on the notebook-level ``spark`` session. The
    views are then joined in SQL: called-intensity, quality, phasing and
    cycle-1 intensity use inner joins; occupied and error rate use left
    joins. Nulls in numeric columns are filled with 0.0.

    :param lane_count: number of partitions used for the Lane repartition.
    :returns: the merged Spark data frame.
    :raises ValueError: wrapping any failure.
    """
    try:
        ## temp view name -> per-tile frame backing it
        tile_views = {
            "intensityC1": intensityC1TileDF,
            "phasing": phaseCalcDF,
            "tile_occupied": tileOccupiedDF,
            "qual": q30PctDF,
            "tile": tileClusterAndDensityDF,
            "error": errorRateTileDF,
            "cint": meanCalledintensityDF}
        for view_name, view_df in tile_views.items():
            partitioned = \
                view_df.repartition(lane_count, col("Lane"))
            partitioned.createOrReplaceTempView(view_name)
        ## join all
        joinedDF = \
            spark.sql("""
                SELECT
                tile.Lane,
                tile.Tile,
                tile.PCT_ClusterCountPF,
                tile.PCT_DensityPF,
                cint.mean_CalledCount_A,
                cint.mean_CalledCount_T,
                cint.mean_CalledCount_G,
                cint.mean_CalledCount_C,
                qual.PCT_Q30,
                error.mean_ErrorRate,
                tile_occupied.PCT_Occupied,
                intensityC1.intensity_c1,
                phasing.slope_p,
                phasing.offset_p,
                phasing.slope_pr,
                phasing.offset_pr
                FROM
                tile
                JOIN cint ON tile.Lane=cint.Lane AND tile.Tile=cint.Tile
                JOIN qual ON tile.Lane=qual.Lane AND tile.Tile=qual.Tile
                JOIN phasing ON tile.Lane=phasing.Lane AND tile.Tile=phasing.Tile
                JOIN intensityC1 ON tile.Lane=intensityC1.Lane AND tile.Tile=intensityC1.Tile
                LEFT JOIN tile_occupied ON tile.Lane=tile_occupied.Lane AND tile.Tile=tile_occupied.Tile
                LEFT JOIN error ON tile.Lane=error.Lane AND tile.Tile=error.Tile
                """)
        ## tiles with no match in a left-joined view get 0.0
        return joinedDF.na.fill(0.0)
    except Exception as e:
        raise ValueError(e)

In [None]:
def read_interop_metrics_and_merge_per_tile(df_dict: dict, runinfo_xml: str) -> DataFrame:
    """Build every per-tile metric frame and merge them into one frame.

    :param df_dict: mapping of InterOp metric name (``"Extraction"``,
        ``"EmpiricalPhasing"``, ``"Tile"``, ``"Q"``, ``"CorrectedInt"`` and
        optionally ``"ExtendedTile"`` / ``"Error"``) to Spark data frame.
    :param runinfo_xml: path handed to ``read_runinfo_xml``; only the lane
        count and read layout from its result are used here.
    :returns: the frame produced by ``merge_all_tile_data``.
    :raises ValueError: wrapping any failure.
    """
    try:
        _, _, _, lane_count, reads_stat, _ = \
            read_runinfo_xml(runinfo_xml)
        intensityC1TileDF = \
            get_intensity_c1_per_tile(
                extractionDF=df_dict.get("Extraction"),
                reads_stat=reads_stat)
        phaseCalcDF = \
            get_phase_data_per_tile(
                phaseDF=df_dict.get("EmpiricalPhasing"),
                reads_stat=reads_stat)
        if 'ExtendedTile' in df_dict:
            tileOccupiedDF = \
                get_occupied_pct_per_tile(
                    tileDF=df_dict.get("Tile"),
                    extTileDF=df_dict.get("ExtendedTile"))
        else:
            ## empty placeholder for the LEFT JOIN; an explicit schema is
            ## required because Spark cannot infer one from an empty frame
            tileOccupiedDF = \
                spark.createDataFrame(
                    pd.DataFrame([], columns=['Lane', 'Tile', 'PCT_Occupied']),
                schema="Lane INT, Tile INT, PCT_Occupied FLOAT")
        q30PctDF = \
            get_pct_q30_score_per_tile(
                qDF=df_dict.get("Q"),
                reads_stat=reads_stat)
        tileClusterAndDensityDF = \
            get_cluster_and_density_per_tile(
                tileDF=df_dict.get("Tile"))
        if 'Error' in df_dict:
            errorRateTileDF = \
                get_mean_error_rate_per_tile(
                    errorDF=df_dict.get("Error"),
                    reads_stat=reads_stat)
        else:
            ## empty placeholder for the LEFT JOIN when no error metrics exist
            errorRateTileDF = \
                spark.createDataFrame(
                    pd.DataFrame([], columns=['Lane', 'Tile', 'mean_ErrorRate']),
                schema="Lane INT, Tile INT, mean_ErrorRate FLOAT")
        meanCalledintensityDF = \
            get_mean_callout_intensities_per_tile(
                correctedIntDF=df_dict.get("CorrectedInt"),
                reads_stat=reads_stat)
        mergedDF = \
            merge_all_tile_data(
                lane_count=lane_count,
                intensityC1TileDF=intensityC1TileDF,
                phaseCalcDF=phaseCalcDF,
                tileOccupiedDF=tileOccupiedDF,
                q30PctDF=q30PctDF,
                tileClusterAndDensityDF=tileClusterAndDensityDF,
                errorRateTileDF=errorRateTileDF,
                meanCalledintensityDF=meanCalledintensityDF)
        return mergedDF
    except Exception as e:
        raise ValueError(e) from e

In [None]:
def _q30_pct_and_yield(records: list, qbin_list: list) -> Tuple[str, str]:
    """Format %Q30 and yield from the first record of a per-bin sum query.

    ``records`` is the ``toPandas().to_dict(orient='records')`` output of a
    sum aggregation. An empty result, null sums or a zero total give
    ``"0.00"`` instead of raising.
    """
    first = records[0] if records else {}
    sums = {
        k: int(v) for k, v in first.items()
        if k != 'Lane' and pd.notna(v)}
    total_bases = sum(sums.values())
    q30_bases = sum(v for k, v in sums.items() if k in qbin_list)
    pct_q30 = q30_bases / total_bases * 100 if total_bases else 0.0
    ## the halving is kept from the original calculation; its rationale is
    ## not visible in this notebook section - TODO confirm
    return f"{pct_q30:.2f}", f"{total_bases / 2 / 1000000000:.2f}"


def get_q30_and_yield_values(
        qDF: DataFrame,
        reads_stat: pd.DataFrame,
        lane_count: int) -> \
            Tuple[str, list[dict], list[dict], float, list[dict], list[dict]]:
    """Compute %Q30 and yield overall, per read, and per lane and read.

    Cycle ranges from ``reads_stat`` are applied as
    ``start_cycle <= Cycle < finish_cycle``. The overall %Q30 uses every row
    of ``qDF`` without a cycle filter.

    :param qDF: Spark data frame with Lane, Cycle and ``Bin_*`` count columns.
    :param reads_stat: table with ``read_id``, ``start_cycle`` and
        ``finish_cycle`` columns.
    :param lane_count: lanes are iterated as ``1..lane_count``.
    :returns: ``(avg_q30, list_pct_q30_rg, list_pct_q30_rg_lane, final_yield,
        list_yield_rg, list_yield_lane_rg)``. ``avg_q30`` and the ``PCT_Q30``
        / ``Yield`` values in the lists are strings formatted to two
        decimals; ``final_yield`` is the float sum of the per-read yields.
    :raises ValueError: wrapping any failure.
    """
    try:
        final_yield = 0.0
        list_pct_q30_rg = []
        list_pct_q30_rg_lane = []
        list_yield_rg = []
        list_yield_lane_rg = []
        qDF = qDF.na.fill('0')
        qDF = qDF.na.replace('nan', '0')
        ## set q30 bins: names match the "sum(Bin_x)" columns produced below
        bin_cols = [c for c in qDF.columns if c.startswith('Bin_')]
        if len(bin_cols) > 3:
            qbin_list = [f"sum(Bin_{c})" for c in range(30, 51)]
        else:
            qbin_list = ["sum(Bin_3)",]
        ## cast once: bins to long, every other column to int
        typedDF = \
            qDF.selectExpr(
                *(f"cast({c} as long) as {c}"
                    if c.startswith('Bin_') else f"cast({c} as int)"
                        for c in qDF.columns))
        bin_sums = [sf.sum(col(c)) for c in bin_cols]
        ## calculate avg q30 over all rows
        avg_q30, _ = \
            _q30_pct_and_yield(
                typedDF.select(*bin_sums).toPandas().to_dict(orient='records'),
                qbin_list)
        read_entries = reads_stat.to_dict(orient='records')
        ## rg wise sum
        for entry in read_entries:
            read_id = entry.get('read_id')
            read_df = \
                typedDF.\
                filter(col('Cycle')>=entry.get('start_cycle')).\
                filter(col('Cycle')<entry.get('finish_cycle'))
            pct_q30, total_yield = \
                _q30_pct_and_yield(
                    read_df.select(*bin_sums).toPandas().to_dict(orient='records'),
                    qbin_list)
            list_pct_q30_rg.append({'Read': read_id, 'PCT_Q30': pct_q30})
            list_yield_rg.append({'Read': read_id, 'Yield': total_yield})
            final_yield += float(total_yield)
        ## lane and rg wise sum
        for lane_id in range(1, lane_count+1):
            lane_df = typedDF.filter(col('Lane')==lane_id)
            for entry in read_entries:
                read_id = entry.get('read_id')
                ## groupBy returns no row at all for a lane/read without data;
                ## the helper turns that into "0.00" values
                records = \
                    lane_df.\
                    filter(col('Cycle')>=entry.get('start_cycle')).\
                    filter(col('Cycle')<entry.get('finish_cycle')).\
                    groupBy('Lane').\
                    agg(*bin_sums).\
                    toPandas().to_dict(orient='records')
                pct_q30, total_yield = \
                    _q30_pct_and_yield(records, qbin_list)
                list_pct_q30_rg_lane.append({'Lane': lane_id, 'Read': read_id, 'PCT_Q30': pct_q30})
                list_yield_lane_rg.append({'Lane': lane_id, 'Read': read_id, 'Yield': total_yield})
        return avg_q30, list_pct_q30_rg, list_pct_q30_rg_lane, final_yield, list_yield_rg, list_yield_lane_rg
    except Exception as e:
        raise ValueError(e) from e

In [None]:
# Two reads; finish_cycle is exclusive in get_q30_and_yield_values, so read 1
# covers cycle 1 only and read 2 covers cycle 3 only.
reads_stat = \
    pd.DataFrame([
        {'read_id': 1, 'cycles': 2, 'start_cycle': 1, 'finish_cycle': 2, 'index_read': 'N'},
        {'read_id': 2, 'cycles': 2, 'start_cycle': 3, 'finish_cycle': 4, 'index_read': 'N'}])
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': '1', 'Tile': '1101', 'Cycle': '1', 'Bin_1': '5564900', 'Bin_2': '8470200', 'Bin_3': '327516000'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '2', 'Bin_1': '5564900', 'Bin_2': '9656800', 'Bin_3': '326329300'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '3', 'Bin_1': '3028000', 'Bin_2': '6506100', 'Bin_3': '332017100'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '4', 'Bin_1': '3191600', 'Bin_2': '5810500', 'Bin_3': '332549000'}]))
avg_q30, list_pct_q30_rg, list_pct_q30_rg_lane, final_yield, list_yield_rg, list_yield_lane_rg = \
    get_q30_and_yield_values(
        qDF=t_df,
        reads_stat=reads_stat,
        lane_count=1)
# avg_q30 is computed over every cycle in the frame (no read filter) and is
# returned as a two-decimal string, so the expectation uses all four rows.
all_bin_1 = 5564900+5564900+3028000+3191600
all_bin_2 = 8470200+9656800+6506100+5810500
all_bin_3 = 327516000+326329300+332017100+332549000
assert avg_q30 == f"{all_bin_3/(all_bin_1+all_bin_2+all_bin_3)*100:.2f}"
# final_yield is a float sum of the per-read yields; format it before
# comparing so float representation noise cannot break the check.
assert f"{final_yield:.2f}" == f"{(5564900+8470200+327516000+3028000+6506100+332017100)/2/1000000000:.2f}"
assert str(list_pct_q30_rg[0].get('PCT_Q30')) == f"{327516000/(5564900+8470200+327516000)*100:.2f}"
assert str(list_yield_rg[0].get('Yield')) == f"{(5564900+8470200+327516000)/2/1000000000:.2f}"

In [None]:
def get_calculate_phasing_scores(
        phaseDF: DataFrame,
        reads_stat: pd.DataFrame) -> list:
    """Fit phasing and prephasing trends per lane for every non-index read.

    For each read with ``index_read == 'N'`` the rows with
    ``start_cycle <= Cycle < finish_cycle`` are averaged per (Lane, Cycle).
    Cycles are shifted so the read starts at 1, and a first-degree
    ``np.polyfit`` gives slope and offset.

    :param phaseDF: Spark data frame with Lane, Tile, Cycle, Phasing and
        Prephasing columns.
    :param reads_stat: table with ``read_id``, ``start_cycle``,
        ``finish_cycle`` and ``index_read`` columns.
    :returns: list of dicts with keys Lane, Read, Phasing_slope,
        Phasing_offset, Prephasing_slope and Prephasing_offset.
    :raises ValueError: wrapping any failure.
    """
    try:
        phaseDF = phaseDF.na.fill('0')
        phaseDF = phaseDF.na.replace('nan', '0')
        phasing_values = list()
        for read in reads_stat.to_dict(orient='records'):
            if read.get('index_read') != 'N':
                continue
            read_id = read.get('read_id')
            start_cycle = read.get('start_cycle')
            finish_cycle = read.get('finish_cycle')
            ## only the per-cycle means feed the fit, so no other aggregate
            ## is requested from Spark
            df = \
                phaseDF.\
                selectExpr(*(f"cast({c} as float)" for c in ("Lane", "Tile", "Cycle", "Phasing", "Prephasing"))).\
                filter(col("Cycle")>=start_cycle).filter(col("Cycle")<finish_cycle).\
                groupBy('Lane', 'Cycle').\
                agg(
                    sf.mean("Phasing").alias("mean_phasing"),
                    sf.mean("Prephasing").alias("mean_prephasing")).\
                toPandas()
            for lane, l_data in df.groupby('Lane'):
                ## cycle index relative to the start of this read (1-based)
                rel_cycle = l_data['Cycle'] - (start_cycle-1)
                slope_p, offset_p = np.polyfit(rel_cycle, l_data["mean_phasing"], 1)
                slope_pr, offset_pr = np.polyfit(rel_cycle, l_data["mean_prephasing"], 1)
                phasing_values.append({
                    'Lane': lane,
                    'Read': read_id,
                    'Phasing_slope': slope_p,
                    'Phasing_offset': offset_p,
                    'Prephasing_slope': slope_pr,
                    'Prephasing_offset': offset_pr})
        return phasing_values
    except Exception as e:
        raise ValueError(e) from e

In [None]:
# Two reads; finish_cycle is exclusive in get_calculate_phasing_scores, so
# read 1 is fitted on cycles 1-3.
reads_stat = \
    pd.DataFrame([
        {'read_id': 1, 'cycles': 4, 'start_cycle': 1, 'finish_cycle': 4, 'index_read': 'N'},
        {'read_id': 2, 'cycles': 4, 'start_cycle': 5, 'finish_cycle': 8, 'index_read': 'N'}])
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 1101, 'Cycle': 1, 'Phasing': 3, 'Prephasing': 1},
        {'Lane': 1, 'Tile': 1101, 'Cycle': 2, 'Phasing': 1.25, 'Prephasing': 0.75},
        {'Lane': 1, 'Tile': 1101, 'Cycle': 3, 'Phasing': 1, 'Prephasing': 0.75},
        {'Lane': 1, 'Tile': 1101, 'Cycle': 4, 'Phasing': 1.75, 'Prephasing': 1.5},
        {'Lane': 1, 'Tile': 1101, 'Cycle': 5, 'Phasing': 2, 'Prephasing': 2},
        {'Lane': 1, 'Tile': 1101, 'Cycle': 6, 'Phasing': 1, 'Prephasing': 1},
        {'Lane': 1, 'Tile': 1101, 'Cycle': 7, 'Phasing': 1.25, 'Prephasing': 1.75},
        {'Lane': 1, 'Tile': 1101, 'Cycle': 8, 'Phasing': 2, 'Prephasing': 2}]))
phasing_values = \
    get_calculate_phasing_scores(
        phaseDF=t_df,
        reads_stat=reads_stat)
phasing_values_df = \
    pd.DataFrame(phasing_values)
read1 = \
    phasing_values_df[phasing_values_df['Read']==1]
r1_p_s, r1_p_o = \
    np.polyfit([1, 2, 3], [3.0, 1.25, 1.0], 1)
r1_pr_s, r1_pr_o = \
    np.polyfit([1, 2, 3], [1.0, 0.75, 0.75], 1)
# Compare with a tolerance: the rows come back from Spark in no guaranteed
# order, so the least-squares result can differ from the reference in the
# last floating-point digits.
assert np.isclose(read1['Phasing_slope'].values[0], r1_p_s)
assert np.isclose(read1['Phasing_offset'].values[0], r1_p_o)
assert np.isclose(read1['Prephasing_slope'].values[0], r1_pr_s)
assert np.isclose(read1['Prephasing_offset'].values[0], r1_pr_o)

In [None]:
def get_total_pct_occupied(
        tileDF: DataFrame,
        etileDF: DataFrame) -> float:
    """Return total occupied count as a percentage of total cluster count.

    The cluster total only counts ``tileDF`` rows with ``Read == 1``; the
    occupied total sums every row of ``etileDF``.

    :param tileDF: Spark data frame with Read and ClusterCount columns.
    :param etileDF: Spark data frame with an OccupiedCount column.
    :returns: ``total_OccupiedCount / total_ClusterCount * 100`` as a float.
    :raises ValueError: wrapping any failure.
    """
    try:
        ## replace nulls and 'nan' strings with '0'
        clean_tile = tileDF.na.fill('0').na.replace('nan', '0')
        clean_etile = etileDF.na.fill('0').na.replace('nan', '0')
        ## cluster count summed over the Read 1 rows only
        read1_clusters = \
            clean_tile.\
            selectExpr("cast(Read as int)", "cast(ClusterCount as decimal)").\
            filter(col('Read')==1)
        cluster_rows = \
            read1_clusters.\
            select(sf.sum("ClusterCount").alias("total_ClusterCount")).\
            collect()
        ## occupied count summed over all rows
        occupied_rows = \
            clean_etile.\
            selectExpr("cast(OccupiedCount as decimal)").\
            select(sf.sum("OccupiedCount").alias("total_OccupiedCount")).\
            collect()
        occupied_total = occupied_rows[0]["total_OccupiedCount"]
        cluster_total = cluster_rows[0]["total_ClusterCount"]
        return float(occupied_total/cluster_total*100)
    except Exception as e:
        raise ValueError(e)

In [None]:
# Tile frame has one row for Read 1 and one for Read 4;
# get_total_pct_occupied only counts the Read 1 row in the denominator.
t_tileDF = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 1101, 'Read': 1, 'ClusterCount': 4091900},
        {'Lane': 1, 'Tile': 1101, 'Read': 4, 'ClusterCount': 4091900}]))
t_etileDF = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 1101, 'OccupiedCount': 3615010}]))
pct_occupied = \
    get_total_pct_occupied(
        tileDF=t_tileDF,
        etileDF=t_etileDF)
# Expected value: occupied count over the single Read 1 cluster count.
assert pct_occupied == 3615010/4091900*100

In [None]:
def get_intensity_cycle_1_value(
        extDF: DataFrame,
        reads_stat: pd.DataFrame) -> Tuple[list, list]:
    """Return the mean intensity at the first cycle of every read.

    :param extDF: Spark data frame with Lane, Cycle and one of the
        ``MaxIntensity_blue`` / ``MaxIntensity_RED`` / ``MaxIntensity_A``
        columns.
    :param reads_stat: table with ``read_id`` and ``start_cycle`` columns.
    :returns: ``(intensity_list, intensity_rg_list)``. The first holds one
        dict per lane and read (Lane, Intensity_cycle_1, Read); the second
        holds one dict per read (Intensity_cycle_1, Read).
    :raises ValueError: when no known intensity column is present, or
        wrapping any other failure.
    """
    try:
        extDF = extDF.na.fill('0')
        extDF = extDF.na.replace('nan', '0')
        intensity_list = list()
        intensity_rg_list = list()
        ## a later match overrides an earlier one, as before
        intensity_col = None
        for candidate in ("MaxIntensity_blue", "MaxIntensity_RED", "MaxIntensity_A"):
            if candidate in extDF.columns:
                intensity_col = candidate
        if intensity_col is None:
            raise ValueError(
                f"No MaxIntensity column found in extraction data: {extDF.columns}")
        for row in reads_stat.to_dict(orient='records'):
            read_id = row.get('read_id')
            ## rows for the first cycle of this read, shared by both summaries
            first_cycle_df = \
                extDF.filter(col("Cycle")==row.get('start_cycle'))
            pdf = \
                first_cycle_df.\
                groupBy('Lane').\
                agg(sf.mean(intensity_col).alias("Intensity_cycle_1")).\
                toPandas()
            pdf['Read'] = read_id
            intensity_list.extend(pdf.to_dict(orient='records'))
            rg_pdf = \
                first_cycle_df.\
                select(sf.mean(intensity_col).alias("Intensity_cycle_1")).\
                toPandas()
            rg_pdf['Read'] = read_id
            intensity_rg_list.extend(rg_pdf.to_dict(orient='records'))
        return intensity_list, intensity_rg_list
    except Exception as e:
        raise ValueError(e) from e

In [None]:
# One tile, four cycles of string-typed intensity values.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': '1', 'Tile': '1101', 'Cycle': '1', 'MaxIntensity_RED': '2264'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '2', 'MaxIntensity_RED': '2217'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '3', 'MaxIntensity_RED': '2295'},
        {'Lane': '1', 'Tile': '1101', 'Cycle': '4', 'MaxIntensity_RED': '2211'}]))
# Read 1 starts at cycle 1 and read 2 at cycle 3.
reads_stat = \
    pd.DataFrame([
        {'read_id': 1, 'cycles': 2, 'start_cycle': 1, 'finish_cycle': 2, 'index_read': 'N'},
        {'read_id': 2, 'cycles': 2, 'start_cycle': 3, 'finish_cycle': 4, 'index_read': 'N'}])
intensity_list, intensity_rg_list = \
    get_intensity_cycle_1_value(
        extDF=t_df,
        reads_stat=reads_stat)
# Per-lane result for read 1 must be the cycle 1 value.
intensity_list_df = pd.DataFrame(intensity_list)
intensity_list_df_r1 = intensity_list_df[intensity_list_df['Read']==1]
assert intensity_list_df_r1['Intensity_cycle_1'].values[0] == 2264
# Per-read result for read 1 must be the same value (single lane, single tile).
intensity_rg_list_df = pd.DataFrame(intensity_rg_list)
intensity_rg_list_df_r1 = intensity_rg_list_df[intensity_rg_list_df['Read']==1]
assert intensity_rg_list_df_r1['Intensity_cycle_1'].values[0] == 2264

In [None]:
def get_error_rate_value(
        eDF: DataFrame,
        reads_stat: pd.DataFrame) -> Tuple[list, list]:
    """Return the mean error rate of each non-index read, per lane and overall.

    Rows are selected with ``start_cycle <= Cycle < finish_cycle``.

    :param eDF: Spark data frame with Lane, Tile, Cycle and ErrorRate columns.
    :param reads_stat: table with ``read_id``, ``start_cycle``,
        ``finish_cycle`` and ``index_read`` columns.
    :returns: ``(error_rate_list, error_rate_rg_list)``. The first holds one
        dict per lane and read (Lane, ErrorRate, Read); the second holds one
        dict per read (ErrorRate, Read).
    :raises ValueError: wrapping any failure.
    """
    try:
        cleanDF = eDF.na.fill('0').na.replace('nan', '0')
        per_lane_records = []
        per_read_records = []
        for read in reads_stat.to_dict(orient='records'):
            ## index reads are skipped
            if read.get('index_read') != 'N':
                continue
            read_id = read.get('read_id')
            ## typed rows restricted to the cycles of this read
            read_rows = \
                cleanDF.\
                selectExpr(
                    "cast(Lane as int)",
                    "cast(Tile as int)",
                    "cast(Cycle as int)",
                    "cast(ErrorRate as double)").\
                filter(col("Cycle")>=read.get('start_cycle')).\
                filter(col("Cycle")<read.get('finish_cycle'))
            mean_error = sf.mean("ErrorRate").alias("ErrorRate")
            ## mean per lane
            lane_pdf = \
                read_rows.groupBy('Lane').agg(mean_error).toPandas()
            lane_pdf['Read'] = read_id
            per_lane_records.extend(lane_pdf.to_dict(orient='records'))
            ## mean over all lanes
            read_pdf = \
                read_rows.select(mean_error).toPandas()
            read_pdf['Read'] = read_id
            per_read_records.extend(read_pdf.to_dict(orient='records'))
        return per_lane_records, per_read_records
    except Exception as e:
        raise ValueError(e)

In [None]:
# One tile with an error rate for each of four cycles.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 2103, 'Cycle': 1, 'ErrorRate': 0.17},
        {'Lane': 1, 'Tile': 2103, 'Cycle': 2, 'ErrorRate': 0.05},
        {'Lane': 1, 'Tile': 2103, 'Cycle': 3, 'ErrorRate': 0.02},
        {'Lane': 1, 'Tile': 2103, 'Cycle': 4, 'ErrorRate': 0.05}]))
# finish_cycle is exclusive in get_error_rate_value, so read 1
# (start 1, finish 2) covers cycle 1 only.
reads_stat = \
    pd.DataFrame([
        {'read_id': 1, 'cycles': 4, 'start_cycle': 1, 'finish_cycle': 2, 'index_read': 'N'},
        {'read_id': 2, 'cycles': 4, 'start_cycle': 3, 'finish_cycle': 8, 'index_read': 'N'}])
error_rate_list, error_rate_rg_list = \
    get_error_rate_value(
        eDF=t_df,
        reads_stat=reads_stat)
# Per-lane mean for read 1 equals the single cycle 1 value.
error_rate_list_df = pd.DataFrame(error_rate_list)
error_rate_list_df_r1 = error_rate_list_df[error_rate_list_df['Read']==1]
assert error_rate_list_df_r1['ErrorRate'].values[0] == 0.17
# Per-read mean for read 1 equals the same value.
error_rate_rg_list_df = pd.DataFrame(error_rate_rg_list)
error_rate_rg_list_df_r1 = error_rate_rg_list_df[error_rate_rg_list_df['Read']==1]
assert error_rate_rg_list_df_r1['ErrorRate'].values[0] == 0.17

In [None]:
def get_phix_aligned_value(
        tileDF: DataFrame,
        reads_stat: pd.DataFrame) -> Tuple[list, list]:
    """Return the mean ``Aligned`` value of each non-index read.

    Only rows with ``Aligned > 0`` for the given read contribute.

    :param tileDF: Spark data frame with Lane, Read and Aligned columns.
    :param reads_stat: table with ``read_id`` and ``index_read`` columns.
    :returns: ``(aligned_list, aligned_rg_list)``. The first holds one dict
        per lane and read (Lane, Aligned, Read); the second holds one dict
        per read (Aligned, Read).
    :raises ValueError: wrapping any failure.
    """
    try:
        cleanDF = tileDF.na.fill('0').na.replace('nan', '0')
        per_lane_records = []
        per_read_records = []
        for read in reads_stat.to_dict(orient='records'):
            ## index reads are skipped
            if read.get('index_read') != 'N':
                continue
            read_id = read.get('read_id')
            mean_aligned = sf.mean("Aligned").alias("Aligned")
            ## mean per lane
            lane_pdf = \
                cleanDF.\
                selectExpr("cast(Lane as int)", "cast(Read as int)", "cast(Aligned as float)").\
                filter(col("Read")==read_id).\
                filter(col("Aligned")>0).\
                groupBy("Lane").\
                agg(mean_aligned).\
                toPandas()
            lane_pdf['Read'] = read_id
            per_lane_records.extend(lane_pdf.to_dict(orient='records'))
            ## mean over all lanes
            read_pdf = \
                cleanDF.\
                selectExpr("cast(Read as int)", "cast(Aligned as float)").\
                filter(col("Read")==read_id).\
                filter(col("Aligned")>0).\
                select(mean_aligned).\
                toPandas()
            read_pdf['Read'] = read_id
            per_read_records.extend(read_pdf.to_dict(orient='records'))
        return per_lane_records, per_read_records
    except Exception as e:
        raise ValueError(e)

In [None]:
def get_cluster_and_density_values(
        tileDF: DataFrame) -> Tuple[float, list]:
    """Summarise cluster counts and densities per lane and read.

    :param tileDF: Spark data frame with Lane, Read, ClusterCount,
        ClusterCountPF, Density and DensityPF columns.
    :returns: ``(total_cluster_pf, records)``. ``total_cluster_pf`` is
        ``sum(ClusterCountPF) / sum(ClusterCount) * 100`` over all rows.
        ``records`` holds one dict per (Lane, Read) with the summed counts,
        summed densities and ``PCT_ClusterCountPF``, all as floats.
    :raises ValueError: wrapping any failure.
    """
    try:
        cleanDF = tileDF.na.fill('0').na.replace('nan', '0')
        metric_cols = \
            ("Lane", "Read", "ClusterCount", "ClusterCountPF", "Density", "DensityPF")
        cast_exprs = [f"cast({c} as decimal)" for c in metric_cols]
        ## per lane and read totals
        grouped = \
            cleanDF.selectExpr(*cast_exprs).groupBy("Lane", "Read")
        cluster_pdf = \
            grouped.agg(
                sf.sum("ClusterCount").alias("total_ClusterCount"),
                sf.sum("ClusterCountPF").alias("total_ClusterCountPF"),
                (sf.sum("ClusterCountPF")/sf.sum("ClusterCount") * 100).alias('PCT_ClusterCountPF'),
                sf.sum("Density").alias("total_Density"),
                sf.sum("DensityPF").alias("total_DensityPF")).\
            toPandas()
        ## overall percentage of clusters passing filter
        overall_rows = \
            cleanDF.\
            selectExpr(*cast_exprs).\
            selectExpr("sum(ClusterCountPF)/sum(ClusterCount)*100 as total_cluster_pf").\
            collect()
        total_cluster_pf = float(overall_rows[0]['total_cluster_pf'])
        records = cluster_pdf.astype(float).to_dict(orient='records')
        return total_cluster_pf, records
    except Exception as e:
        raise ValueError(e)

In [None]:
# Two identical tile rows, one for Read 1 and one for Read 4.
t_df = \
    spark.createDataFrame(pd.DataFrame([
        {'Lane': 1, 'Tile': 1101, 'Read': 1, 'ClusterCount': 4091900, 'ClusterCountPF': 3430010, 'Density': 2961260, 'DensityPF': 2482260},
        {'Lane': 1, 'Tile': 1101, 'Read': 4, 'ClusterCount': 4091900, 'ClusterCountPF': 3430010, 'Density': 2961260, 'DensityPF': 2482260}]))
total_cluster_pf, cluster_df_list = \
    get_cluster_and_density_values(
        tileDF=t_df)
# The overall percentage sums over both rows; compared at two decimals.
assert f"{total_cluster_pf:.2f}" == f"{(3430010*2)/(4091900*2)*100:.2f}"

In [4]:
def generate_table_report(
        metrics_list: list,
        runinfo_xml: str,
        runparameters_xml: str) -> Tuple[list, list, list, dict]:
    """Collect the run-level, read-level and lane-level summary tables.

    :param metrics_list: passed to ``get_interop_data_frames`` to load the
        InterOp metric frames.
    :param runinfo_xml: path handed to ``read_runinfo_xml``.
    :param runparameters_xml: path handed to ``read_runparameters_xml``.
    :returns: ``(overview_data, rg_qc, lane_qc, df_dict)``. ``overview_data``
        is a one-element list with the run overview dict, ``rg_qc`` is the
        per-read summary, ``lane_qc`` is the per-lane and per-read summary
        (presumably a list like ``rg_qc`` - confirm against
        ``combine_rg_and_lane_level_info_for_report``) and ``df_dict`` is
        the loaded metric frames.
    :raises ValueError: wrapping any failure.
    """
    try:
        ## read runinfo file and get reads_stat
        _, _, _, lane_count, reads_stat, total_cycles = \
            read_runinfo_xml(runinfo_xml)
        ## read runparameters_xml and get flowcell_mode
        flowcell_mode = \
            read_runparameters_xml(
                runparametersXml_path=runparameters_xml)
        ## read interop metrics
        df_dict = get_interop_data_frames(metrics_list)
        ## get yield data
        avg_q30, list_pct_q30_rg, list_pct_q30_rg_lane, final_yield, list_yield_rg, list_yield_lane_rg = \
            get_q30_and_yield_values(
                qDF=df_dict.get('Q'),
                reads_stat=reads_stat,
                lane_count=lane_count)
        ## get cluster and density values
        final_cluster_pf, cluster_values = \
            get_cluster_and_density_values(
            tileDF=df_dict.get('Tile'))
        ## get intensity values
        indensity_c1_data, indensity_c1_rg_data = \
            get_intensity_cycle_1_value(
                extDF=df_dict.get('Extraction'),
                reads_stat=reads_stat)
        ## get error data; empty lists when the run has no Error metrics
        if 'Error' in df_dict:
            errorRate_data,  errorRate_rg_data = \
                get_error_rate_value(
                    eDF=df_dict.get('Error'),
                    reads_stat=reads_stat)
        else:
            errorRate_data = []
            errorRate_rg_data = []
        ## get pct occupied data; the fallback is a float so both branches
        ## give the same type
        if 'ExtendedTile' in df_dict:
            pct_occupied = \
                get_total_pct_occupied(
                    tileDF=df_dict.get('Tile'),
                    etileDF=df_dict.get('ExtendedTile'))
        else:
            pct_occupied = 0.0
        ## get phasing data; empty list when no EmpiricalPhasing metrics exist
        if 'EmpiricalPhasing' in df_dict:
            phasing_data = \
                get_calculate_phasing_scores(
                    phaseDF=df_dict.get('EmpiricalPhasing'),
                    reads_stat=reads_stat)
        else:
            phasing_data = []
        ## get aligned data
        aligned_data, aligned_rg_data = \
            get_phix_aligned_value(
                tileDF=df_dict.get('Tile'),
                reads_stat=reads_stat)
        ## Combine data for overview table
        # * Actual yield
        # * Avg Q30 PCT
        # * PCT Cluster PF
        # * PCT Occupied
        # * Total Cycles
        # * Flowcell mode
        overview_data = [{
            "Actual yield": final_yield,
            "Avg Q30 PCT": avg_q30,
            "PCT Cluster PF": final_cluster_pf,
            "PCT Occupied": pct_occupied,
            "Total Cycles": total_cycles,
            "Flowcell mode": flowcell_mode
        }]
        ## Combine data for RG overview table
        # * yield
        # * PCT Q30
        # * aligned
        # * error
        # * intensity c1
        rg_qc = \
            combine_rg_level_info_for_report(
                reads_stat=reads_stat,
                list_pct_q30_rg=list_pct_q30_rg,
                list_yield_rg=list_yield_rg,
                errorRate_rg_data=errorRate_rg_data,
                aligned_rg_data=aligned_rg_data,
                indensity_c1_rg_data=indensity_c1_rg_data)
        ## combine data for RG Lane overview table
        lane_qc = \
            combine_rg_and_lane_level_info_for_report(
                lane_count=lane_count,
                reads_stat=reads_stat,
                list_pct_q30_rg_lane=list_pct_q30_rg_lane,
                list_yield_lane_rg=list_yield_lane_rg,
                cluster_values=cluster_values,
                indensity_c1_data=indensity_c1_data,
                errorRate_data=errorRate_data,
                aligned_data=aligned_data,
                phasing_data=phasing_data)
        return overview_data, rg_qc, lane_qc, df_dict
    except Exception as e:
        raise ValueError(e) from e

In [None]:
def _rg_records_to_df(records: list, columns: list) -> pd.DataFrame:
    """Turn a list of record dicts into a DataFrame.

    An empty (or missing) record list yields an empty frame that still
    carries `columns`, so later column lookups do not fail with KeyError.
    """
    if records is not None and len(records) > 0:
        return pd.DataFrame(records)
    return pd.DataFrame([], columns=columns)


def _rg_first_value(df: pd.DataFrame, read_id, column: str, default=0):
    """Return the first `column` value among rows whose 'Read' equals
    `read_id`, or `default` when no row matches."""
    rows = df[df['Read'] == read_id]
    if len(rows.index) == 0:
        return default
    return rows[column].values.tolist()[0]


def combine_rg_level_info_for_report(
        reads_stat: pd.DataFrame,
        list_pct_q30_rg: list,
        list_yield_rg: list,
        errorRate_rg_data: list,
        aligned_rg_data: list,
        indensity_c1_rg_data: list) \
        -> list:
    """Merge the per-read metric tables into one record per read.

    :param reads_stat: DataFrame with a 'read_id' column; one output record
        is produced per row, in row order.
    :param list_pct_q30_rg: records with 'Read' and 'PCT_Q30' keys.
    :param list_yield_rg: records with 'Read' and 'Yield' keys.
    :param errorRate_rg_data: records with 'Read' and 'ErrorRate' keys.
    :param aligned_rg_data: records with 'Read' and 'Aligned' keys.
    :param indensity_c1_rg_data: records with 'Read' and
        'Intensity_cycle_1' keys.
    :returns: list of dicts with keys 'Read', 'PCT_Q30', 'Yield',
        'ErrorRate', 'Aligned' and 'Intensity_cycle_1'. A metric that has
        no entry for a read (including an entirely empty input list) is
        reported as 0.
    :raises ValueError: wraps any error raised while combining the tables.
    """
    try:
        # (output column, lookup table) in the order the columns are emitted;
        # every table gets the same empty-input fallback.
        metric_tables = [
            ('PCT_Q30',
             _rg_records_to_df(list_pct_q30_rg, ['Read', 'PCT_Q30'])),
            ('Yield',
             _rg_records_to_df(list_yield_rg, ['Read', 'Yield'])),
            ('ErrorRate',
             _rg_records_to_df(errorRate_rg_data, ['Read', 'ErrorRate'])),
            ('Aligned',
             _rg_records_to_df(aligned_rg_data, ['Read', 'Aligned'])),
            ('Intensity_cycle_1',
             _rg_records_to_df(
                 indensity_c1_rg_data, ['Read', 'Intensity_cycle_1']))]
        rg_qc = []
        for row in reads_stat.to_dict(orient='records'):
            read_id = row.get('read_id')
            rg_dict = {'Read': read_id}
            for column, table in metric_tables:
                rg_dict[column] = _rg_first_value(table, read_id, column)
            rg_qc.append(rg_dict)
        return rg_qc
    except Exception as e:
        raise ValueError(e) from e

In [None]:
def _lane_records_to_df(
        records: list,
        columns: list,
        cast_keys: bool = False) -> pd.DataFrame:
    """Turn a list of record dicts into a DataFrame keyed by Lane/Read.

    An empty (or missing) record list yields an empty frame that still
    carries `columns`. With `cast_keys` the 'Lane' and 'Read' columns are
    converted to int once, so string ids can be compared with integer ids.
    """
    if records is not None and len(records) > 0:
        df = pd.DataFrame(records)
    else:
        df = pd.DataFrame([], columns=columns)
    if cast_keys:
        df = df.astype({'Lane': int, 'Read': int})
    return df


def _lane_first_value(
        df: pd.DataFrame,
        lane_id: int,
        read_id,
        column: str,
        default=0):
    """Return the first `column` value for the (lane, read) pair, or
    `default` when the pair has no row in `df`."""
    rows = df[(df['Lane'] == lane_id) & (df['Read'] == read_id)]
    if len(rows.index) == 0:
        return default
    return rows[column].values.tolist()[0]


def combine_rg_and_lane_level_info_for_report(
        lane_count: int,
        reads_stat: pd.DataFrame,
        list_pct_q30_rg_lane: list,
        list_yield_lane_rg: list,
        cluster_values: list,
        indensity_c1_data: list,
        errorRate_data: list,
        aligned_data: list,
        phasing_data: list) \
        -> dict:
    """Merge the per-lane, per-read metric tables into one report dict.

    :param lane_count: number of lanes; lanes are numbered 1..lane_count.
    :param reads_stat: DataFrame with a 'read_id' column; one record is
        produced per row for every lane.
    :param list_pct_q30_rg_lane: records with 'Lane', 'Read', 'PCT_Q30'.
    :param list_yield_lane_rg: records with 'Lane', 'Read', 'Yield'.
    :param cluster_values: records with 'Lane', 'Read',
        'PCT_ClusterCountPF' (Lane/Read are cast to int before matching).
    :param indensity_c1_data: records with 'Lane', 'Read',
        'Intensity_cycle_1' (Lane/Read are cast to int before matching).
    :param errorRate_data: records with 'Lane', 'Read', 'ErrorRate'
        (Lane/Read are cast to int before matching).
    :param aligned_data: records with 'Lane', 'Read', 'Aligned'.
    :param phasing_data: records with 'Lane', 'Read', 'Phasing_slope',
        'Phasing_offset', 'Prephasing_slope', 'Prephasing_offset'.
    :returns: dict mapping lane id to a list of per-read dicts. A metric
        with no entry for a (lane, read) pair, including an entirely empty
        input list, is reported as 0.
    :raises ValueError: wraps any error raised while combining the tables.
    """
    try:
        key_cols = ['Lane', 'Read']
        phasing_cols = [
            'Phasing_slope', 'Phasing_offset',
            'Prephasing_slope', 'Prephasing_offset']
        phasing_df = \
            _lane_records_to_df(phasing_data, key_cols + phasing_cols)
        # (output column, lookup table) in the order the columns are emitted.
        # Only the cluster / intensity / error tables cast their keys to int,
        # matching how those tables were compared before.
        metric_tables = [
            ('PCT_Q30',
             _lane_records_to_df(
                 list_pct_q30_rg_lane, key_cols + ['PCT_Q30'])),
            ('Yield',
             _lane_records_to_df(
                 list_yield_lane_rg, key_cols + ['Yield'])),
            ('PCT_ClusterCountPF',
             _lane_records_to_df(
                 cluster_values, key_cols + ['PCT_ClusterCountPF'],
                 cast_keys=True)),
            ('Intensity_cycle_1',
             _lane_records_to_df(
                 indensity_c1_data, key_cols + ['Intensity_cycle_1'],
                 cast_keys=True)),
            ('ErrorRate',
             _lane_records_to_df(
                 errorRate_data, key_cols + ['ErrorRate'],
                 cast_keys=True)),
            ('Aligned',
             _lane_records_to_df(
                 aligned_data, key_cols + ['Aligned']))]
        metric_tables.extend(
            (column, phasing_df) for column in phasing_cols)
        read_ids = [
            row.get('read_id')
            for row in reads_stat.to_dict(orient='records')]
        lane_qc = {}
        for lane_id in range(1, lane_count + 1):
            lane_qc_list = []
            for read_id in read_ids:
                rg_dict = {'Read': read_id}
                for column, table in metric_tables:
                    rg_dict[column] = \
                        _lane_first_value(table, lane_id, read_id, column)
                lane_qc_list.append(rg_dict)
            lane_qc[lane_id] = lane_qc_list
        return lane_qc
    except Exception as e:
        raise ValueError(e) from e

In [None]:
def generatePCA_for_tiles(
        df: pd.DataFrame,
        low_pf_threshold: float = 0.7) -> tuple[alt.Chart, alt.Chart]:
    """Project per-tile metrics onto two principal components and plot them.

    :param df: per-tile table holding 'Lane', 'Tile' and the metric columns
        listed in `sub_cols` below. The caller's frame is not modified.
    :param low_pf_threshold: tiles whose 'PCT_ClusterCountPF' is below this
        value are labelled 'Low', all others 'High'.
        NOTE(review): generate_box_plot in this file computes
        PCT_ClusterCountPF on a 0-100 scale; if this table uses the same
        scale, the default 0.7 labels almost nothing as 'Low' and 70 may
        have been intended - confirm before changing the default.
    :returns: two scatter charts of pca1 vs pca2, the first colored by lane
        and the second by ClusterCountPF status.
    :raises ValueError: wraps any error raised while casting, fitting or
        plotting.
    """
    try:
        # astype returns a new frame, so the caller's DataFrame keeps its
        # original dtypes.
        df = \
            df.astype({
                'Lane': int,
                'PCT_ClusterCountPF': float,
                'PCT_DensityPF': float,
                'PCT_Q30': float,
                'PCT_Occupied': float,
                'slope_p': float,
                'mean_CalledCount_A': int,
                'mean_CalledCount_T': int,
                'mean_CalledCount_G': int,
                'mean_CalledCount_C': int,
                'intensity_c1': int})
        sub_cols = [
            'PCT_ClusterCountPF', 'PCT_DensityPF', 'mean_CalledCount_A',
            'mean_CalledCount_T', 'mean_CalledCount_G', 'mean_CalledCount_C',
            'PCT_Q30', 'mean_ErrorRate', 'PCT_Occupied', 'intensity_c1',
            'slope_p', 'offset_p', 'slope_pr', 'offset_pr']
        # fit_transform fits the model itself; a separate fit() beforehand
        # only repeats the work.
        X_reduced = \
            PCA(n_components=2).fit_transform(df[sub_cols])
        pca_result = pd.DataFrame(X_reduced, columns=['pca1', 'pca2'])
        # Assign by position: pca_result has a fresh 0..n-1 index, so
        # index-aligned assignment would yield NaN for any other input index.
        pca_result['Lane'] = df['Lane'].astype(str).to_numpy()
        pca_result['Tile'] = df['Tile'].astype(str).to_numpy()
        pca_result['ClusterCountPF_status'] = \
            np.where(
                df['PCT_ClusterCountPF'].to_numpy() < low_pf_threshold,
                'Low',
                'High')
        chart1 = \
            alt.Chart(pca_result, title="PCA plot - color by lane").mark_point().encode(
                x='pca1:Q',
                y='pca2:Q',
                color='Lane',
                tooltip=['Lane:N', 'Tile:N']).\
            interactive().\
            properties(
                width=900, height=400)
        chart2 = \
            alt.Chart(pca_result, title="PCA plot - color by ClusterCountPF status").mark_point().encode(
                x='pca1:Q',
                y='pca2:Q',
                color='ClusterCountPF_status:N',
                tooltip=['Lane:N', 'Tile:N', 'ClusterCountPF_status:N']).\
            interactive().\
            properties(
                width=900, height=400)
        return chart1, chart2
    except Exception as e:
        raise ValueError(e) from e

In [None]:
def add_hist_axis_from_tile_id(s: pd.Series)-> pd.Series:
    """Add heat-map grid coordinates derived from the row's tile id.

    The integer tile id is split by its decimal digits: the leading digit
    becomes 'h_x' and the remaining digits become 'h_y'. The row is updated
    in place and returned.
    """
    tile_digits = str(int(s['Tile']))
    s['h_x'] = int(tile_digits[:1])
    s['h_y'] = int(tile_digits[1:])
    return s

def _tile_heatmaps_by_lane(
        df: pd.DataFrame,
        column: str,
        label: str) -> list[alt.Chart]:
    """Build one tile heat map per lane, colored by `column`.

    `df` must already carry the 'h_x' / 'h_y' grid coordinates; `label` is
    the human-readable metric name used in the chart title.
    """
    charts = list()
    for lane, l_data in df.groupby('Lane'):
        chart = \
            alt.Chart(l_data, title=f"Histogram plot for {label} per tile - Lane {int(lane)}").mark_rect().encode(
                x=alt.X('h_y:O', axis=alt.Axis(labels=False, ticks=False, title=None)),
                y=alt.Y('h_x:O', axis=alt.Axis(labels=False, ticks=False, title=None)),
                color=alt.Color(f'{column}:Q').scale(scheme='purpleblue'),
                tooltip=['Lane:N', 'Tile:O', f'{column}:Q']
            ).configure_view(
                step=13,
                strokeWidth=0
            ).configure_axis(
                domain=False
            ).properties(
                width=1080,
                height=100)
        charts.append(chart)
    return charts


def get_tile_plots(df: pd.DataFrame) -> \
        Tuple[dict[HTML, list[alt.Chart]], dict[HTML, list[alt.Chart]], dict[HTML, list[alt.Chart]], dict[HTML, list[alt.Chart]], dict[HTML, list[alt.Chart]]]:
    """Build per-lane tile heat maps for five tile-level metrics.

    :param df: per-tile table with 'Lane', 'Tile' and the metric columns
        cast below; the caller's frame is not modified.
    :returns: five single-entry dicts, in the order PCT ClusterCountPF,
        PCT DensityPF, PCT Q30, PCT Occupied, Intensity Cycle 1. Each maps
        an `HTML` heading object to the list of per-lane charts for that
        metric.
    :raises ValueError: wraps any error raised while casting or plotting.
    """
    try:
        df = \
            df.astype({
                'Lane': int,
                'Tile': int,
                'PCT_ClusterCountPF': float,
                'PCT_DensityPF': float,
                'PCT_Q30': float,
                'PCT_Occupied': float,
                'slope_p': float,
                'mean_CalledCount_A': int,
                'mean_CalledCount_T': int,
                'mean_CalledCount_G': int,
                'mean_CalledCount_C': int,
                'intensity_c1': int})
        # add the h_x / h_y grid position of every tile
        df = \
            df.apply(add_hist_axis_from_tile_id, axis=1)
        # (section heading, metric column, label used in the chart title)
        panels = [
            ("<h3>PCT ClusterCountPF</h3>", 'PCT_ClusterCountPF', "% ClusterCountPF"),
            ("<h3>PCT DensityPF</h3>", 'PCT_DensityPF', "% DensityPF"),
            ("<h3>PCT Q30</h3>", 'PCT_Q30', "% Q30"),
            ("<h3>PCT Occupied</h3>", 'PCT_Occupied', "% Occupied"),
            ("<h3>Intensity Cycle 1</h3>", 'intensity_c1', "Intensity cycle 1")]
        return tuple(
            {HTML(heading): _tile_heatmaps_by_lane(df, column, label)}
            for heading, column, label in panels)
    except Exception as e:
        raise ValueError(e) from e

In [None]:
def generate_intensity_plots(extDF: DataFrame) -> list:
    """Plot the mean intensity per cycle for every 'MaxIntensity_*' column.

    :param extDF: Spark DataFrame with 'Lane', 'Cycle' and one or more
        columns whose names start with 'MaxIntensity_'.
    :returns: one line chart per 'MaxIntensity_*' column, with cycle on the
        x axis, the per-lane/per-cycle mean on the y axis and one line per
        lane.
    :raises ValueError: wraps any error raised while aggregating or plotting.
    """
    try:
        # fill nulls with '0' and replace the literal string 'nan' with '0'
        cleaned = extDF.na.fill('0').na.replace('nan', '0')
        channels = [
            name for name in cleaned.columns
            if name.startswith("MaxIntensity_")]
        cast_exprs = ["cast(Lane as int)", "cast(Cycle as int)"]
        cast_exprs.extend(f"cast({name} as long)" for name in channels)
        mean_exprs = [sf.mean(name).alias(name) for name in channels]
        per_cycle = (
            cleaned
            .selectExpr(*cast_exprs)
            .groupBy("Lane", "Cycle")
            .agg(*mean_exprs))
        plot_channels = [
            name for name in per_cycle.columns
            if name.startswith("MaxIntensity_")]
        per_cycle_pdf = per_cycle.toPandas()
        return [
            alt.Chart(
                per_cycle_pdf,
                title=f"Intensity plots for all lanes for channel {channel}"
            ).mark_line(point=True).encode(
                x='Cycle:O',
                y=f'{channel}:Q',
                color='Lane:N',
                tooltip=['Lane:N', 'Cycle:O', f'{channel}:Q']
            ).properties(
                width=900,
                height=400
            ).interactive()
            for channel in plot_channels]
    except Exception as e:
        raise ValueError(e)

In [None]:
def generate_box_plot(tileDF: DataFrame) -> Tuple[alt.Chart, alt.Chart]:
    """Draw per-lane box plots of % ClusterCountPF and % DensityPF.

    Cluster and density counts are summed per (Lane, Tile); each percentage
    is the summed PF value divided by the summed total, times 100.

    :param tileDF: Spark DataFrame with 'Lane', 'Tile', 'ClusterCount',
        'ClusterCountPF', 'Density' and 'DensityPF' columns.
    :returns: (chart for % ClusterCountPF, chart for % DensityPF).
    :raises ValueError: wraps any error raised while aggregating or plotting.
    """
    try:
        # fill nulls with '0' and replace the literal string 'nan' with '0'
        cleaned = tileDF.na.fill('0').na.replace('nan', '0')
        typed = cleaned.selectExpr(
            "cast(Lane as int)",
            "cast(Tile as int)",
            "cast(ClusterCount as decimal)",
            "cast(ClusterCountPF as decimal)",
            "cast(Density as decimal)",
            "cast(DensityPF as decimal)")
        pct_cluster_pf = sf.sum("ClusterCountPF")/sf.sum("ClusterCount")*100
        pct_density_pf = sf.sum("DensityPF")/sf.sum("Density")*100
        per_tile = typed.groupBy("Lane", "Tile").agg(
            sf.sum("ClusterCount").alias("ClusterCount"),
            sf.sum("ClusterCountPF").alias("ClusterCountPF"),
            sf.sum("Density").alias("Density"),
            sf.sum("DensityPF").alias("DensityPF"),
            pct_cluster_pf.alias("PCT_ClusterCountPF"),
            pct_density_pf.alias("PCT_DensityPF"))
        # keep only the plotted columns and make both percentages plain floats
        plot_pdf = per_tile.toPandas()[
            ['Lane', "Tile", 'PCT_ClusterCountPF', 'PCT_DensityPF']]
        plot_pdf = plot_pdf.astype({
            'PCT_ClusterCountPF': float,
            'PCT_DensityPF': float})
        charts = []
        for column, label in (
                ('PCT_ClusterCountPF', '% ClusterCountPF'),
                ('PCT_DensityPF', '% DensityPF')):
            charts.append(
                alt.Chart(
                    plot_pdf,
                    title=f"Box plot showing {label} for all lanes"
                ).mark_boxplot().encode(
                    x='Lane:N',
                    y=alt.Y(f'{column}:Q').title(label),
                    color='Lane:N',
                    tooltip=['Lane:N', "Tile:N", f'{column}:Q']
                ).properties(
                    width=900,
                    height=400
                ).interactive())
        box_cluster_pf, box_density_pf = charts
        return box_cluster_pf, box_density_pf
    except Exception as e:
        raise ValueError(e)

In [None]:
def plot_pct_occupied_vs_pct_pf(tileDF: DataFrame, extTileDF: DataFrame) -> alt.Chart:
    """Scatter % Occupied against % PF per tile, with a clickable lane filter.

    :param tileDF: Spark DataFrame passed to get_occupied_pct_per_tile.
    :param extTileDF: Spark DataFrame passed to get_occupied_pct_per_tile.
    :returns: a vertically concatenated chart (built with alt.vconcat): a
        lane selector strip on top and the scatter plot underneath. The
        scatter plot is filtered to the lanes picked in the selector.
    :raises ValueError: wraps any error raised while computing or plotting.
    """
    try:
        tiles_pct_df = \
            get_occupied_pct_per_tile(
                tileDF=tileDF,
                extTileDF=extTileDF)
        tiles_pct_df_pdf = tiles_pct_df.toPandas()
        tiles_pct_df_pdf['PCT_Occupied'] = \
            tiles_pct_df_pdf['PCT_Occupied'].astype(float)
        tiles_pct_df_pdf['PCT_ClusterCountPF'] = \
            tiles_pct_df_pdf['PCT_ClusterCountPF'].astype(float)
        # distinct lanes, used to pin the color scale so a lane keeps its
        # color while other lanes are filtered out
        options = \
            tiles_pct_df_pdf['Lane'].\
            drop_duplicates().\
            values.tolist()
        # selection_point replaces selection_multi, which is deprecated
        # since Altair 5.0 (the release that introduced add_params, used
        # below)
        selection = \
            alt.selection_point(fields=['Lane'])
        color = \
            alt.condition(
                selection,
                alt.Color('Lane:N'),
                alt.value('lightgray'))
        make_selector = \
            alt.Chart(tiles_pct_df_pdf).\
                mark_rect().\
                encode(x=alt.X('Lane:N').title('Lane'), color=color).\
                add_params(selection)
        point = \
            alt.Chart(tiles_pct_df_pdf, title="% Occupied vs % PF for all lanes").mark_point().encode(
                x=alt.X('PCT_Occupied:Q').title('% Occupied'),
                y=alt.Y('PCT_ClusterCountPF:Q').title('% PF'),
                color=alt.Color('Lane:N').scale(domain=options).title('lane'),
                tooltip=['Lane:N', "Tile:N", 'PCT_Occupied:Q', 'PCT_ClusterCountPF:Q']).\
            add_params(
                selection
            ).transform_filter(
                selection
            ).\
            properties(
                width=960, height=400)
        plot = alt.vconcat(make_selector, point)
        return plot
    except Exception as e:
        raise ValueError(e) from e

In [None]:
def get_all_plots(
        mergedTileDF: DataFrame,
        extDF: DataFrame,
        tileDF: DataFrame,
        extTileDF: DataFrame) -> dict:
    """Build every plot of the report and collect them by section.

    :param mergedTileDF: Spark DataFrame converted to pandas for the PCA
        and tile plots.
    :param extDF: Spark DataFrame passed to generate_intensity_plots.
    :param tileDF: Spark DataFrame passed to generate_box_plot and
        plot_pct_occupied_vs_pct_pf.
    :param extTileDF: Spark DataFrame passed to plot_pct_occupied_vs_pct_pf.
    :returns: dict with keys 'pca', 'tile', 'intensity', 'boxplot' and
        'occupied', each holding a list of plots (for 'tile', a list of
        heading-to-charts dicts).
    :raises ValueError: wraps any error raised by the plot builders.
    """
    try:
        tiles_pdf = mergedTileDF.toPandas()
        # PCA scatter plots
        pca_by_lane, pca_by_status = generatePCA_for_tiles(df=tiles_pdf)
        # per-lane tile heat maps, one dict per metric
        cluster_pf, density_pf, q30, occupied, intensity_c1 = \
            get_tile_plots(df=tiles_pdf)
        # per-channel intensity line plots
        intensity_charts = generate_intensity_plots(extDF=extDF)
        # box plots
        box_cluster_pf, box_density_pf = generate_box_plot(tileDF=tileDF)
        # % occupied vs % PF scatter plot
        occupied_vs_pf = \
            plot_pct_occupied_vs_pct_pf(tileDF=tileDF, extTileDF=extTileDF)
        return {
            'pca': [pca_by_lane, pca_by_status],
            'tile': [cluster_pf, density_pf, q30, occupied, intensity_c1],
            'intensity': intensity_charts,
            'boxplot': [box_cluster_pf, box_density_pf],
            'occupied': [occupied_vs_pf,]}
    except Exception as e:
        raise ValueError(e)

In [None]:
# Build the three report tables; df_dict holds the per-metric Spark
# DataFrames that the steps below reuse.
overview_data, rg_qc, lane_qc, df_dict = generate_table_report(
    metrics_list=metrics_list,
    runinfo_xml=runinfo_xml,
    runparameters_xml=runparameters_xml)

# Merge the metrics into one row per tile.
mergedDF = read_interop_metrics_and_merge_per_tile(
    df_dict=df_dict,
    runinfo_xml=runinfo_xml)

# Build every plot shown in the sections below.
all_plots = get_all_plots(
    mergedTileDF=mergedDF,
    extDF=df_dict.get("Extraction"),
    tileDF=df_dict.get("Tile"),
    extTileDF=df_dict.get("ExtendedTile"))

# Read the run info file for the run id, lane count and reads_stat.
(run_id, flowcell_id, instrument_id,
 lane_count, reads_stat, _) = read_runinfo_xml(runinfo_xml)

# Save the overview table as CSV.
pd.DataFrame(overview_data).to_csv(overview_csv_output, index=False)

# Save the merged per-tile table, tagged with the run id, as parquet.
(mergedDF
    .withColumn("Run_id", lit(run_id))
    .write
    .parquet(tile_parquet_output))

## Run information

In [None]:
# Show the run id and lane count as a bullet list, then the reads_stat
# table (returned by read_runinfo_xml in the driver cell) under "Run Cycles".
display(HTML(f"""<ul>
    <li><b>Run id:</b> {run_id}</li>
    <li><b>Lane counts:</b> {lane_count}</li>
    <li><b>Run Cycles:</b></li>
</ul>"""))
display(HTML(reads_stat.to_html(index=False)))

###  Run overview

In [None]:
# Render the run-level overview table; as the last expression of the cell,
# the HTML object is shown as the cell output.
HTML(pd.DataFrame(overview_data).to_html(index=False))

### Run stats for each read groups

In [None]:
# Render the per-read table (one row per read in rg_qc); as the last
# expression of the cell, the HTML object is shown as the cell output.
HTML(pd.DataFrame(rg_qc).to_html(index=False))

### Run stats for each lanes

In [None]:
# lane_qc maps each lane id to its list of per-read records: print a
# heading and one table per lane.
for lane_id, l_data in lane_qc.items():
    display(HTML(f"<h2>Lane: {lane_id}</h2>"))
    display(HTML(pd.DataFrame(l_data).to_html(index=False)))

## Plots
### Tile plots

In [None]:
# Each entry of all_plots["tile"] is a dict mapping an HTML heading to the
# per-lane charts of one metric: show the heading, then its charts.
for p in all_plots.get("tile"):
    for title, plots in p.items():
        display(title)
        for i in plots:
            display(i)

### Intensity plots

In [None]:
# One chart per intensity column, built by generate_intensity_plots.
for p in all_plots.get("intensity"):
    display(p)

### Box plots for % ClusterCountPF and % DensityPF

In [None]:
# The % ClusterCountPF and % DensityPF box plots, built by generate_box_plot.
for p in all_plots.get("boxplot"):
    display(p)

### PCA plots

In [None]:
# The two PCA scatter plots, built by generatePCA_for_tiles.
for p in all_plots.get("pca"):
    display(p)

### Scatter plot for % Occupied vs % PF

In [None]:
# The single % Occupied vs % PF chart, built by plot_pct_occupied_vs_pct_pf.
for p in all_plots.get("occupied"):
    display(p)