# BCLConvert De-multiplexing Report 

* __Notebook version__: `v0.0.6`
* __Created by:__ `Imperial BRC Genomics Facility`
* __Maintained by:__ `Imperial BRC Genomics Facility`
* __Docker image:__ `imperialgenomicsfacility/interop-notebook-image:release-v0.0.4`
* __Github repository:__ [imperial-genomics-facility/demultiplexing_report](https://github.com/imperial-genomics-facility/demultiplexing_report)
* __Contact us:__ [Imperial BRC Genomics Facility](https://www.imperial.ac.uk/medicine/research-and-impact/facilities/genomics-facility/contact-us/)
* __License:__ [Apache License 2.0](https://github.com/imperial-genomics-facility/interop-notebook-image/blob/main/LICENSE)
* __Created on:__ {{ DATE_TAG }}
* __Sequencing run id:__ {{ SEQRUN_IGF_ID }}


In [None]:
## Load library and generate plots

import numpy as np
import pandas as pd
import warnings
from IPython.display import HTML
from illumina.report_generator_bclconvert import (
    get_demult_report_and_plots_for_bclconvert,
    get_samplesheet_records)
warnings.filterwarnings("ignore")

In [None]:
# Build all plots and tables used by the cells below from the reports
# directory and the run directory.
(
    flowcell_summary_data_plot,
    flowcell_project_summary_plot,
    merged_df,
    flowcell_project_summary_table,
    sample_dist_plots,
    undetermined_plots,
    undetermined_table,
    combined_ihop_df,
    hop_plot,
) = get_demult_report_and_plots_for_bclconvert(
    reports_dir='{{ REPORTS_DIR }}',
    run_dir='{{ RUN_DIR }}')

## Flowcell total reads vs passing filter reads

In [None]:
# Render the flowcell summary plot as this cell's output.
display(flowcell_summary_data_plot)

## Project summary plot

In [None]:
# Render the project summary plot as this cell's output.
display(flowcell_project_summary_plot)

## Project summary for lane

In [None]:
# Convert the project summary table to HTML without its row index and show it.
HTML(flowcell_project_summary_table.to_html(index=False))

## Hamming distance

In [None]:
## wikipedia code
def hamming_distance(s1: str, s2: str) -> int:
    """Count the positions at which two equal-length sequences differ.

    :param s1: First sequence.
    :param s2: Second sequence.
    :returns: Number of positions where ``s1`` and ``s2`` hold different
        elements.
    :raises ValueError: If the two sequences do not have the same length.
    """
    if len(s1) == len(s2):
        return sum(1 for char_a, char_b in zip(s1, s2) if char_a != char_b)
    raise ValueError("Undefined for sequences of unequal length.")

def calculate_min_hamming_distance(samplesheet_path: str) -> list:
    """Calculate the minimum pairwise index Hamming distance for a run.

    The samplesheet is loaded with ``get_samplesheet_records`` and its rows
    are grouped by ``Lane`` (only when that column is present),
    ``Sample_Project``, total index length and ``Description``. Within each
    group every pair of rows is compared: ``index`` against ``index`` and,
    when the samplesheet has an ``index2`` column, ``index2`` against
    ``index2``. The smallest distance seen is reported for the group.

    :param samplesheet_path: Path of the samplesheet file to read.
    :returns: A list of dictionaries, one per group, holding the grouping
        column values plus a ``min_hamming_distance`` entry. The reported
        value never exceeds 10; groups with fewer than two rows report 10.
    :raises ValueError: If anything fails while reading or processing the
        samplesheet (for example when two compared indices differ in
        length). The original exception is chained as the cause.
    """
    from itertools import combinations

    try:
        samplesheet_data = \
            get_samplesheet_records(samplesheets=[samplesheet_path,])
        index_columns = ['index']
        if 'index2' in samplesheet_data.columns:
            samplesheet_data['index_length'] = \
                samplesheet_data.\
                apply(lambda x: len(x['index']) + len(x['index2']), axis=1)
            index_columns.append('index2')
        else:
            samplesheet_data['index_length'] = \
                samplesheet_data.\
                apply(lambda x: len(x['index']), axis=1)
        if 'Lane' in samplesheet_data.columns:
            group_columns = [
                'Lane', 'Sample_Project', 'index_length', 'Description']
        else:
            group_columns = [
                'Sample_Project', 'index_length', 'Description']
        output_rows = list()
        for grp_name, g_data in samplesheet_data.groupby(group_columns):
            # Starting value doubles as the upper bound of the reported
            # distance and as the result for groups without any pair.
            min_hamming_dist = 10
            index_data = \
                g_data[index_columns].to_dict(orient='records')
            # combinations() yields every unordered pair exactly once,
            # including pairs that involve the last row of the group.
            for row_i, row_j in combinations(index_data, 2):
                for index_column in index_columns:
                    pair_dist = \
                        hamming_distance(
                            s1=row_i.get(index_column),
                            s2=row_j.get(index_column))
                    if min_hamming_dist > pair_dist:
                        min_hamming_dist = pair_dist
            group_row = dict(zip(group_columns, grp_name))
            group_row.update({'min_hamming_distance': min_hamming_dist})
            output_rows.append(group_row)
        return output_rows
    except Exception as e:
        raise ValueError(
            'Failed to calculate min Hamming distance for {0}: {1}'.
            format(samplesheet_path, e)) from e

# Compute the per-group minimum Hamming distance from the samplesheet
# located in the reports directory.
output_rows = calculate_min_hamming_distance(
    samplesheet_path='{{ REPORTS_DIR }}/SampleSheet.csv')

def style_low_hamming_distance(s: pd.Series, props: str = '', cut_off: int = 3) -> pd.Series:
    """Build per-cell CSS for values that fall below a cut-off.

    :param s: Values to check.
    :param props: CSS properties used for values strictly below ``cut_off``.
    :param cut_off: Threshold below which a value is styled.
    :returns: The ``np.where`` result: ``props`` where the value is below
        ``cut_off`` and an empty string everywhere else.
    """
    is_below_cut_off = s < cut_off
    return np.where(is_below_cut_off, props, '')

# Mark groups whose minimum Hamming distance is below 3 with red text on a
# yellow background.
hamming_styler = \
    pd.DataFrame(output_rows).style.\
    apply(style_low_hamming_distance, props='color:red;', cut_off=3, axis=0, subset=['min_hamming_distance',]).\
    apply(style_low_hamming_distance, props='background-color:#ffffb3;', cut_off=3, axis=0, subset=['min_hamming_distance',])
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# use Styler.hide(axis='index') when it exists and fall back otherwise.
if hasattr(hamming_styler, 'hide'):
    hamming_styler = hamming_styler.hide(axis='index')
else:
    hamming_styler = hamming_styler.hide_index()
html = hamming_styler.to_html()
HTML(html)

## Sample read counts

The table below lists samples with their index barcodes and read counts. It is collapsed by default; click the summary line to expand it.

In [None]:
def style_low_read(s: pd.Series, props: str = '', cut_off: int = 500) -> pd.Series:
    """Build per-cell CSS for values at or below a cut-off.

    :param s: Values to check.
    :param props: CSS properties used for values less than or equal to
        ``cut_off``.
    :param cut_off: Threshold at or below which a value is styled.
    :returns: The ``np.where`` result: ``props`` where the value is at or
        below ``cut_off`` and an empty string everywhere else.
    """
    is_low_count = s <= cut_off
    return np.where(is_low_count, props, '')

# Mark samples with 500 or fewer reads with red text on a yellow background.
read_count_styler = merged_df.style.\
     apply(style_low_read, props='color:red;', cut_off=500, axis=0, subset=['# Reads',]).\
     apply(style_low_read, props='background-color:#ffffb3;', cut_off=500, axis=0, subset=['# Reads',])
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0;
# use Styler.hide(axis='index') when it exists and fall back otherwise.
if hasattr(read_count_styler, 'hide'):
    read_count_styler = read_count_styler.hide(axis='index')
else:
    read_count_styler = read_count_styler.hide_index()
html = read_count_styler.to_html()
# Wrap the table in a <details> element so it stays collapsed until clicked.
html = '<details><summary>Click to expand sample read count table</summary>' + html + '</details>'
HTML(html)

## Sample read count bar plot for lane

In [None]:
# Print a lane heading followed by that lane's sample read count plot.
for lane_id, p in sample_dist_plots.items():
    print('Lane {0}'.format(lane_id))
    display(p)

## Undetermined reads

The table below lists undetermined barcodes with their read counts. It is collapsed by default; click the summary line to expand it.

In [None]:
# Render the undetermined read table without its row index and wrap it in a
# <details> element so it stays collapsed until clicked.
html = ''.join([
    '<details><summary>Click to expand undetermined read count table</summary>',
    undetermined_table.to_html(index=False),
    '</details>'])
HTML(html)

## Undetermined read count bar plot for lane

In [None]:
# Print a lane heading followed by that lane's undetermined read count plot.
for lane_id, p in undetermined_plots.items():
    print('Lane {0}'.format(lane_id))
    display(p)

## Index hopping summary

In [None]:
# Convert the index hopping summary table to HTML without its row index and show it.
HTML(combined_ihop_df.to_html(index=False))

## Index hopping bar plot for lane

In [None]:
# Final expression of the cell, so the notebook renders the index hopping plot.
hop_plot