# BCLConvert De-multiplexing Report 

* __Notebook version__: `v0.0.7`
* __Created by:__ `Imperial BRC Genomics Facility`
* __Maintained by:__ `Imperial BRC Genomics Facility`
* __Docker image:__ `imperialgenomicsfacility/igf-dockerfiles/bclconvert_reports:v5`
* __Github repository:__ [imperial-genomics-facility/igf-dockerfiles/bclconvert_reports](https://github.com/imperial-genomics-facility/igf-dockerfiles)
* __Contact us:__ [Imperial BRC Genomics Facility](https://www.imperial.ac.uk/medicine/research-and-impact/facilities/genomics-facility/contact-us/)
* __License:__ [Apache License 2.0](https://github.com/imperial-genomics-facility/interop-notebook-image/blob/main/LICENSE)
* __Created on:__ {{ DATE_TAG }}
* __Sequencing run id:__ {{ SEQRUN_IGF_ID }}


In [None]:
## Load library and generate plots

import os
import subprocess
import tempfile
import numpy as np
import pandas as pd
from io import StringIO
from iplotter import ChartJSPlotter
from iplotter import GCPlotter
from shutil import copy2
from shutil import copytree
import warnings
from IPython.display import HTML
warnings.filterwarnings("ignore")


def combine_bclconvert_demultiplex_stats_csv(demultiplex_stats_list: list) \
        -> pd.DataFrame:
    """Combine one or more BCLConvert ``Demultiplex_Stats.csv`` files.

    All files are concatenated and then grouped by ``Lane``, ``SampleID``
    and ``Index``. Read-count columns are summed and percentage columns
    are averaged within each group.

    :param demultiplex_stats_list: Paths of the csv files to combine
    :returns: A DataFrame with one row per (Lane, SampleID, Index)
    :raises ValueError: If a file is missing, a required column is absent
        or the files cannot be read or aggregated
    """
    try:
        merged_df = pd.DataFrame()
        for entry in demultiplex_stats_list:
            if not os.path.exists(entry):
                raise IOError('Missing file {0}'.format(entry))
            df = pd.read_csv(entry)
            # Rows without an index would be dropped by groupby (NaN keys),
            # so an empty string is used instead. Plain assignment is used
            # because a chained inplace fillna is not guaranteed to modify df.
            if 'Index' in df.columns:
                df['Index'] = df['Index'].fillna('')
            if len(merged_df.index) == 0:
                merged_df = df.copy()
            else:
                merged_df = \
                    pd.concat(
                        [merged_df, df],
                        ignore_index=True)
        expected_columns = [
            'Lane', 'SampleID', 'Index', '# Reads',
            '# Perfect Index Reads', '# One Mismatch Index Reads',
            '# Two Mismatch Index Reads', '% Reads',
            '% Perfect Index Reads', '% One Mismatch Index Reads',
            '% Two Mismatch Index Reads']
        for column in expected_columns:
            if column not in merged_df.columns:
                raise KeyError(
                    "Missing column {0} in demultiplex_stats".
                    format(column))
        combined_df = \
            merged_df.\
            groupby(['Lane', 'SampleID', 'Index']).\
            agg({
                '# Reads': 'sum',
                '# Perfect Index Reads': 'sum',
                '# One Mismatch Index Reads': 'sum',
                '# Two Mismatch Index Reads': 'sum',
                '% Reads': 'mean',
                '% Perfect Index Reads': 'mean',
                '% One Mismatch Index Reads': 'mean',
                '% Two Mismatch Index Reads': 'mean'}).\
            reset_index()
        return combined_df
    except Exception as e:
        raise ValueError(
                'Failed to combine Demultiplex_Stats csv files, error: {0}'.
                format(e)) from e

def combine_bclconvert_quality_metrics_csv(quality_metrics_list: list) \
        -> pd.DataFrame:
    """Combine one or more BCLConvert ``Quality_Metrics.csv`` files.

    All files are concatenated and grouped by ``Lane``, ``SampleID``,
    ``index``, ``index2`` and ``ReadNumber``. Yield and quality-score sums
    are added up; ``Mean Quality Score (PF)`` and ``% Q30`` are averaged.

    :param quality_metrics_list: Paths of the csv files to combine
    :returns: A DataFrame with one row per grouping key combination
    :raises ValueError: If a file is missing, a required column is absent
        or the files cannot be read or aggregated
    """
    try:
        merged_df = pd.DataFrame()
        for entry in quality_metrics_list:
            if not os.path.exists(entry):
                raise IOError('Missing file {0}'.format(entry))
            df = pd.read_csv(entry)
            if len(merged_df.index) == 0:
                merged_df = df.copy()
            else:
                merged_df = \
                    pd.concat(
                        [merged_df, df],
                        ignore_index=True)
        # Every column used by the aggregation below is validated here so a
        # malformed file is reported with a clear "Missing column" message.
        expected_columns = [
            "Lane", "SampleID", "index", "index2", "ReadNumber",
            "Yield", "YieldQ30", "QualityScoreSum",
            "Mean Quality Score (PF)", "% Q30"]
        for column in expected_columns:
            if column not in merged_df.columns:
                raise KeyError(
                    "Missing column {0} in quality_metrics".
                    format(column))
        # A missing second index becomes an empty string so that groupby
        # does not silently drop those rows (NaN group keys are excluded).
        merged_df['index2'] = \
            merged_df['index2'].fillna('')
        combined_df = \
            merged_df.\
            groupby([
                "Lane", "SampleID", "index", "index2", "ReadNumber"]).\
            agg({
                "Yield": 'sum',
                "YieldQ30": 'sum',
                "QualityScoreSum": 'sum',
                "Mean Quality Score (PF)": 'mean',
                "% Q30": 'mean'}).\
            reset_index()
        return combined_df
    except Exception as e:
        raise ValueError(
                'Failed to combine Quality_Metrics csv files, error: {0}'.
                format(e)) from e

def combine_bclconvert_top_unknown_barcodes_csv(
        top_unknown_barcodes_list: list) \
        -> pd.DataFrame:
    """Combine one or more BCLConvert ``Top_Unknown_Barcodes.csv`` files.

    All files are concatenated and grouped by ``Lane``, ``index`` and
    ``index2``; read counts are summed and percentages averaged. Only the
    200 barcodes with the highest read count are kept, and the result is
    ordered by ``Lane`` and ``# Reads``, both descending.

    :param top_unknown_barcodes_list: Paths of the csv files to combine
    :returns: A DataFrame with at most 200 rows
    :raises ValueError: If a file is missing, a required column is absent
        or the files cannot be read or aggregated
    """
    try:
        merged_df = pd.DataFrame()
        for entry in top_unknown_barcodes_list:
            if not os.path.exists(entry):
                raise IOError('Missing file {0}'.format(entry))
            df = pd.read_csv(entry)
            if len(merged_df.index) == 0:
                merged_df = df.copy()
            else:
                merged_df = \
                    pd.concat(
                        [merged_df, df],
                        ignore_index=True)
        expected_columns = [
            'Lane', 'index', 'index2', '# Reads',
            '% of Unknown Barcodes', '% of All Reads']
        for column in expected_columns:
            if column not in merged_df.columns:
                raise KeyError(
                    "Missing column {0} in top_unknown_barcodes".
                    format(column))
        # A missing second index becomes an empty string so that groupby
        # keeps those rows (NaN group keys are excluded by default).
        merged_df['index2'] = \
            merged_df['index2'].fillna('')
        combined_df = \
            merged_df.\
            groupby(['Lane', 'index', 'index2']).\
            agg({
                '# Reads': 'sum',
                '% of Unknown Barcodes': 'mean',
                '% of All Reads': 'mean'}).\
            reset_index()
        # Pick the overall top 200 first, then order them for reporting.
        combined_df = \
            combined_df.\
            sort_values('# Reads', ascending=False).\
            head(200).\
            sort_values(
                ['Lane', '# Reads'],
                ascending=False)
        return combined_df
    except Exception as e:
        raise ValueError(
                'Failed to combine Top_Unknown_Barcodes csv files, error: {0}'.
                format(e)) from e

def combine_bclconvert_index_hopping_counts_csv(
        index_hopping_counts_list: list) \
        -> pd.DataFrame:
    """Combine one or more BCLConvert ``Index_Hopping_Counts.csv`` files.

    All files are concatenated. Rows with ``% of Hopped Reads`` not above
    0.005, or without a ``SampleID``, are discarded. The remaining rows
    are grouped by ``Lane``, ``SampleID``, ``index`` and ``index2``; read
    counts are summed and percentages averaged.

    :param index_hopping_counts_list: Paths of the csv files to combine
    :returns: A DataFrame with one row per retained index combination
    :raises ValueError: If a file is missing, a required column is absent
        or the files cannot be read or aggregated
    """
    try:
        merged_df = pd.DataFrame()
        for entry in index_hopping_counts_list:
            if not os.path.exists(entry):
                raise IOError('Missing file {0}'.format(entry))
            df = pd.read_csv(entry)
            if len(merged_df.index) == 0:
                merged_df = df.copy()
            else:
                merged_df = \
                    pd.concat(
                        [merged_df, df],
                        ignore_index=True)
        expected_columns = [
            'Lane', 'SampleID', 'index', 'index2', '# Reads',
            '% of Hopped Reads', '% of All Reads']
        for column in expected_columns:
            if column not in merged_df.columns:
                raise KeyError(
                    "Missing column {0} in index_hopping_counts".
                    format(column))
        merged_df['SampleID'] = \
            merged_df['SampleID'].fillna('UNKNOWN')
        # Sample_Project is not a required column, so it is only filled
        # when the file actually provides it.
        if 'Sample_Project' in merged_df.columns:
            merged_df['Sample_Project'] = \
                merged_df['Sample_Project'].fillna('UNKNOWN')
        merged_df['% of Hopped Reads'] = \
            merged_df['% of Hopped Reads'].fillna(0)
        # NOTE(review): any row that still has a missing value (for example
        # an empty index2) is removed here - confirm this is intended for
        # single-index runs.
        merged_df = merged_df.dropna()
        keep_rows = \
            (merged_df['% of Hopped Reads'] > 0.005) & \
            (merged_df['SampleID'] != 'UNKNOWN')
        combined_df = \
            merged_df[keep_rows].\
            groupby(['Lane', 'SampleID', 'index', 'index2']).\
            agg({
                '# Reads': 'sum',
                '% of Hopped Reads': 'mean',
                '% of All Reads': 'mean'}).\
            reset_index()
        return combined_df
    except Exception as e:
        raise ValueError(
                'Failed to combine Index_Hopping_Counts csv files, error: {0}'.
                format(e)) from e

def get_samplesheet_records(samplesheets: list) \
        -> pd.DataFrame:
    """Read the data section of one or more samplesheets.

    A section starts at a line beginning with ``[Data]``, ``[data]`` or
    ``[BCLConvert_Data]`` and ends at the next line beginning with ``[``.
    The first line of the section is used as the column header and each
    following line is split on commas. Records from all samplesheets are
    concatenated in the given order.

    :param samplesheets: Paths of the samplesheet files to read
    :returns: A DataFrame of the data-section rows (all values are strings)
    :raises ValueError: If a file cannot be read, has no data section or
        its rows do not match the header
    """
    data_section_headers = ('[Data]', '[data]', '[BCLConvert_Data]')
    try:
        all_samplesheet_data = pd.DataFrame()
        for samplesheet_path in samplesheets:
            in_data_section = False
            data_rows = list()
            with open(samplesheet_path, 'r') as fp:
                for line in fp:
                    line = line.strip()
                    if line == '':
                        continue
                    if line.startswith('['):
                        # Any section header closes the previous section;
                        # only a data header opens a new one.
                        in_data_section = \
                            line.startswith(data_section_headers)
                        continue
                    if in_data_section:
                        data_rows.append(line.split(','))
            if len(data_rows) == 0:
                # Fail with a clear message instead of an IndexError on
                # the missing header row.
                raise ValueError(
                    'No data section found in samplesheet {0}'.
                    format(samplesheet_path))
            samplesheet = \
                pd.DataFrame(
                    data_rows[1:],
                    columns=data_rows[0])
            all_samplesheet_data = \
                pd.concat(
                    [all_samplesheet_data, samplesheet],
                    ignore_index=True)
        return all_samplesheet_data
    except Exception as e:
        raise ValueError(e) from e

def get_interop_index_stats(
      run_dir: str,
      reports_dir: str,
      interop_dir_name: str = 'InterOp',
      index_metrix_file: str = 'IndexMetricsOut.bin') \
        -> pd.DataFrame:
    """Collect per-lane total and PF read counts from the index summary.

    The ``/opt/interop/bin/index-summary`` executable is run with
    ``--csv=1``. When the index metrics file is present under the run's
    InterOp directory the run directory is used directly; otherwise the
    InterOp directory and the run's ``.xml`` files are copied into a
    temporary directory together with the index metrics file taken from
    ``reports_dir``, and the tool is run on that copy.

    :param run_dir: Sequencing run directory
    :param reports_dir: Directory holding a fallback index metrics file
    :param interop_dir_name: Name of the InterOp directory inside run_dir
    :param index_metrix_file: File name of the index metrics file
    :returns: A DataFrame with columns ``Lane``, ``Total Reads`` and
        ``PF Reads``
    :raises ValueError: On any failure while running or parsing
    """
    try:
        summary_tool = "/opt/interop/bin/index-summary"
        run_interop_dir = os.path.join(run_dir, interop_dir_name)
        run_index_file = os.path.join(run_interop_dir, index_metrix_file)
        if os.path.exists(run_index_file):
            raw_output = \
                subprocess.check_output([summary_tool, run_dir, "--csv=1"])
        else:
            with tempfile.TemporaryDirectory() as staging_dir:
                staged_interop_dir = \
                    os.path.join(staging_dir, interop_dir_name)
                copytree(run_interop_dir, staged_interop_dir)
                for file_name in os.listdir(run_dir):
                    if file_name.endswith(".xml"):
                        copy2(
                            os.path.join(run_dir, file_name),
                            os.path.join(staging_dir, file_name))
                copy2(
                    os.path.join(reports_dir, index_metrix_file),
                    os.path.join(staged_interop_dir, index_metrix_file))
                raw_output = \
                    subprocess.check_output(
                        [summary_tool, staging_dir, "--csv=1"])
        if isinstance(raw_output, bytes):
            raw_output = raw_output.decode('utf-8')
        # For every line starting with "Lane", keep the two lines that
        # follow it; they are parsed below as a csv header plus one row.
        lane_blocks = dict()
        current_lane = None
        captured_lines = None
        for line in raw_output.split("\n"):
            if line.startswith('Lane'):
                current_lane = \
                    line.split(",")[0].replace("Lane", "").strip()
                captured_lines = list()
                continue
            if captured_lines is not None and len(captured_lines) < 2:
                captured_lines.append(line)
                if len(captured_lines) == 2:
                    lane_blocks[current_lane] = captured_lines
        lane_records = {
            lane_id: pd.read_csv(StringIO('\n'.join(block))).
            to_dict(orient='records')
            for lane_id, block in lane_blocks.items()}
        summary_rows = [
            {'Lane': lane_id,
             'Total Reads': records[0].get('Total Reads'),
             'PF Reads': records[0].get('PF Reads')}
            for lane_id, records in lane_records.items()]
        return pd.DataFrame(summary_rows)
    except Exception as e:
        raise ValueError(e)

def merge_known_samples(
    demultiplex_stats_df: pd.DataFrame,
    quality_metrics_df: pd.DataFrame,
    samplesheet_df: pd.DataFrame) \
        -> pd.DataFrame:
    """Join demultiplexing stats with quality metrics and samplesheet data.

    Quality metrics of non-index reads are aggregated per ``Lane``,
    ``SampleID`` and ``Index`` (yield summed, quality values averaged)
    and left-joined onto the demultiplexing stats. ``Sample_Name`` and
    ``Sample_Project`` are then looked up in the samplesheet by sample id;
    samples not present in the samplesheet get ``UNKNOWN`` for both.

    :param demultiplex_stats_df: Combined Demultiplex_Stats data
    :param quality_metrics_df: Combined Quality_Metrics data
    :param samplesheet_df: Samplesheet records with ``Sample_ID``,
        ``Sample_Name`` and ``Sample_Project`` columns
    :returns: A DataFrame sorted by Lane, Sample_Project and descending
        ``# Reads``
    :raises ValueError: On any failure, e.g. a missing column
    """
    try:
        qmetrics = quality_metrics_df.copy()
        # Index reads have a ReadNumber starting with "I". astype(str)
        # keeps the filter working when the column was parsed as numeric.
        is_index_read = \
            qmetrics['ReadNumber'].astype(str).str.startswith("I")
        qmetrics = qmetrics[~is_index_read].copy()
        if 'index2' in qmetrics.columns:
            # Build "index-index2", dropping the separator when one side
            # is empty so single-index barcodes carry no trailing dash.
            qmetrics['Index'] = (
                qmetrics['index'].fillna('').astype(str) + '-' +
                qmetrics['index2'].fillna('').astype(str)).str.strip('-')
        else:
            qmetrics['Index'] = qmetrics['index'].copy()
        agg_qmetrics = \
            qmetrics.\
            groupby(['Lane', 'SampleID', 'Index']).\
            agg({
                'Yield': 'sum',
                'Mean Quality Score (PF)': 'mean',
                '% Q30': 'mean'})
        stats_with_quality = \
            demultiplex_stats_df.\
            copy().\
            set_index(['Lane', 'SampleID', 'Index']).\
            join(agg_qmetrics, how='left').\
            reset_index()
        stats_with_quality = \
            stats_with_quality[[
                'Lane',
                'SampleID',
                'Index',
                '# Reads',
                '% Reads',
                '% Perfect Index Reads',
                'Yield',
                '% Q30',
                'Mean Quality Score (PF)']].\
            set_index('SampleID')
        # De-duplicate while Sample_ID is still a column: drop_duplicates
        # ignores the index, so doing it after set_index would collapse
        # different samples that share a name and project.
        sample_info = \
            samplesheet_df[['Sample_ID', 'Sample_Name', 'Sample_Project']].\
            drop_duplicates().\
            set_index('Sample_ID')
        final_joined_data = \
            stats_with_quality.\
            join(sample_info, how='left').\
            rename_axis('SampleID').\
            reset_index()
        final_joined_data = \
            final_joined_data[[
                'Lane',
                'Sample_Project',
                'SampleID',
                'Sample_Name',
                'Index',
                '# Reads',
                '% Reads',
                '% Perfect Index Reads',
                'Yield',
                '% Q30',
                'Mean Quality Score (PF)']]
        # Plain assignment: a chained inplace fillna on a column is not
        # guaranteed to modify the parent frame.
        final_joined_data['Sample_Project'] = \
            final_joined_data['Sample_Project'].fillna('UNKNOWN')
        final_joined_data['Sample_Name'] = \
            final_joined_data['Sample_Name'].fillna('UNKNOWN')
        final_joined_data = \
            final_joined_data.\
            sort_values(
                ['Lane', 'Sample_Project', '# Reads'],
                ascending=[True, True, False])
        return final_joined_data
    except Exception as e:
        raise ValueError(e) from e

def get_flowcell_summary_plot(flowcell_summary_df: pd.DataFrame) -> dict:
    """Build a bar chart of raw versus PF cluster counts per lane.

    :param flowcell_summary_df: DataFrame with ``Lane``, ``Total Reads``
        and ``PF Reads`` columns
    :returns: The object returned by ``ChartJSPlotter.plot``
    :raises ValueError: On any failure while building the plot
    """
    try:
        summary = flowcell_summary_df.copy()

        def _bar_series(title, column, colour):
            # One Chart.js dataset; fill and border share the same colour.
            return {
                "label": title,
                "data": summary[column].astype(int).values.tolist(),
                "backgroundColor": colour,
                "borderColor": colour,
                "borderWidth": 1}

        lane_labels = [
            'Lane {0}'.format(lane)
            for lane in summary['Lane']]
        chart_data = {
            "labels": lane_labels,
            "datasets": [
                _bar_series(
                    "Total cluster raw",
                    "Total Reads",
                    'rgba(255, 99, 132, 0.8)'),
                _bar_series(
                    "Total cluster pf",
                    "PF Reads",
                    'rgba(54, 162, 235, 0.8)')]}
        chart_options = {
            "scales": {"y": {"beginAtZero": "true"}},
            "responsive": "true",
            "title": {
                "display": "true",
                "text": 'Raw vs PF cluster counts per lane',
                "position": "bottom"},
            "plugins": {"legend": {"position": 'top'}}}
        return ChartJSPlotter().plot(
            chart_data,
            'bar',
            options=chart_options,
            w=400,
            h=300)
    except Exception as e:
        raise ValueError(e)

def get_flowcell_project_summary_plot(merged_data: pd.DataFrame) \
        -> dict:
    """Build a stacked-percent bar chart of reads per project and lane.

    ``# Reads`` is summed per ``Lane`` and ``Sample_Project`` and laid
    out as one row per lane and one column per project, preceded by a
    header row, which is the table passed to ``GCPlotter.plot``.

    :param merged_data: DataFrame with ``Lane``, ``Sample_Project`` and
        ``# Reads`` columns
    :returns: The object returned by ``GCPlotter.plot``
    :raises ValueError: On any failure while building the plot
    """
    try:
        reads_per_project = \
            merged_data.copy().\
            groupby(['Lane', 'Sample_Project']).\
            agg({'# Reads': 'sum'}).\
            reset_index()
        lane_rows = list()
        for lane_id, lane_df in reads_per_project.groupby('Lane'):
            row = {'Lane': 'Lane {0}'.format(lane_id)}
            row.update({
                project_id: project_df['# Reads'].sum()
                for project_id, project_df
                in lane_df.groupby('Sample_Project')})
            lane_rows.append(row)
        lane_table = pd.DataFrame(lane_rows)
        # Header row first, then one row of values per lane.
        chart_rows = \
            [lane_table.columns.tolist()] + lane_table.values.tolist()
        chart_options = {
            "title": 'Project summary plot',
            "width": 600,
            "height": 400,
            "chartArea": {"left": 50, "width": "60%"},
            "legend": {"position": 'right', "maxLines": 20, "fontSize": 3},
            "dataOpacity": 0.5,
            "colors": [
                '#0173B2', '#DE8F05', '#029E73', '#D55E00', '#CC78BC', '#CA9161',
                '#FBAFE4', '#949494', '#ECE133', '#56B4E9', '#0173B2'],
            "bar": {"groupWidth": '70%'},
            "isStacked": "percent"}
        return GCPlotter().plot(
            chart_rows,
            chart_type="BarChart",
            chart_package='corechart',
            options=chart_options)
    except Exception as e:
        raise ValueError(e)

def get_flowcell_project_summary_table(merged_data: pd.DataFrame) \
        -> pd.DataFrame:
    """Count the rows per project, split by lane when lanes are present.

    :param merged_data: DataFrame with ``Sample_Project`` and ``SampleID``
        columns and optionally a ``Lane`` column
    :returns: A DataFrame with the grouping column(s) and a ``SampleID``
        column that holds the number of rows in each group
    :raises ValueError: On any failure, e.g. a missing column
    """
    try:
        working_df = merged_data.copy()
        # Group by lane as well only when the input carries lane info.
        group_keys = ['Sample_Project']
        if 'Lane' in working_df.columns:
            group_keys.insert(0, 'Lane')
        rows_per_group = working_df.groupby(group_keys).agg(len)
        return rows_per_group['SampleID'].reset_index()
    except Exception as e:
        raise ValueError(e)

def get_sample_dist_plots(
        merged_data: pd.DataFrame,
        get_plots: bool = True) \
        -> dict:
    """Build per-lane bar-chart data of read counts per sample.

    Each lane gets one dataset per project; every dataset spans all
    samples of the lane and holds 0 for samples of other projects. When
    the input has no ``Lane`` column all rows are treated as lane 1.

    :param merged_data: DataFrame with ``SampleID``, ``Sample_Project``
        and ``# Reads`` columns and optionally a ``Lane`` column
    :param get_plots: Return rendered plots when True, otherwise the raw
        Chart.js data dictionaries
    :returns: A dictionary keyed by integer lane id
    :raises ValueError: On any failure while building the data or plots
    """
    try:
        temp_merged_data = merged_data.copy()
        # Fill and border colours are identical, so one palette is enough.
        palette = [
            'rgba(255, 99, 132, 0.8)',
            'rgba(54, 162, 235, 0.8)',
            'rgba(255, 206, 86, 0.8)',
            'rgba(75, 192, 192, 0.8)',
            'rgba(153, 102, 255, 0.8)',
            'rgba(255, 159, 64, 0.8)',
            'rgba(255, 159, 10, 0.8)',
            'rgba(255, 159, 192, 0.8)']
        options = {
            'scales': {
                'y': {
                    'beginAtZero': True
                },
                'xAxes': [{
                    'ticks': {
                        'fontSize': 8
                    }
                }]
            },
            'responsive': True,
            'plugins': {
                'legend': {
                    'position': 'top'}
            }}
        # The plotter is only needed when rendered plots are requested.
        cj_plotter = ChartJSPlotter() if get_plots else None
        if 'Lane' in temp_merged_data.columns:
            lane_groups = (
                (int(lane_id), lane_df)
                for lane_id, lane_df in temp_merged_data.groupby('Lane'))
        else:
            lane_groups = [(1, temp_merged_data)]
        lane_plots = dict()
        for lane_key, lane_df in lane_groups:
            lane_samples = lane_df['SampleID'].values.tolist()
            datasets = list()
            project_groups = lane_df.groupby('Sample_Project')
            for position, (project_name, p_data) in enumerate(project_groups):
                # Cycle through the palette so that more projects than
                # colours does not raise an IndexError.
                colour = palette[position % len(palette)]
                read_counts = \
                    p_data.\
                    set_index('SampleID')['# Reads'].\
                    reindex(lane_samples).\
                    fillna(0).\
                    values.\
                    tolist()
                datasets.append({
                    "label": project_name,
                    "data": read_counts,
                    "backgroundColor": colour,
                    "borderColor": colour})
            data = {
                "labels": lane_samples,
                "datasets": datasets}
            if get_plots:
                lane_plots[lane_key] = \
                    cj_plotter.plot(
                        data,
                        'bar',
                        options=options,
                        w=800,
                        h=400)
            else:
                lane_plots[lane_key] = data
        return lane_plots
    except Exception as e:
        raise ValueError(e) from e

def get_undetermined_table(
        unknown_df: pd.DataFrame) \
        -> pd.DataFrame:
    """Tabulate undetermined barcodes with reverse-complement variants.

    For each row the barcode is reported as found (``Barcode``), with the
    first index reverse-complemented (``Barcode_I1_RC``) and, when an
    ``index2`` column exists, with the second index reverse-complemented
    (``Barcode_I2_RC``). Without ``index2`` the barcode is the first index
    alone and ``Barcode_I2_RC`` is an empty string.

    :param unknown_df: DataFrame with ``Lane``, ``index`` and ``# Reads``
        columns and optionally an ``index2`` column
    :returns: A DataFrame with columns ``Lane``, ``# Reads``, ``Barcode``,
        ``Barcode_I1_RC`` and ``Barcode_I2_RC``, sorted by Lane and
        descending ``# Reads``
    :raises ValueError: On any failure, e.g. a missing column
    """
    try:
        # Translation table is built once instead of once per barcode.
        complement_table = str.maketrans('ATGC', 'TACG')

        def _reverse_complement(sequence):
            return sequence.translate(complement_table)[::-1]

        temp_unknown_df = unknown_df.copy()
        temp_unknown_df['index_RC'] = \
            temp_unknown_df['index'].map(_reverse_complement)
        if 'index2' in temp_unknown_df.columns:
            temp_unknown_df['index2_RC'] = \
                temp_unknown_df['index2'].map(_reverse_complement)
            temp_unknown_df['Barcode'] = \
                temp_unknown_df['index'] + '-' + temp_unknown_df['index2']
            temp_unknown_df['Barcode_I1_RC'] = \
                temp_unknown_df['index_RC'] + '-' + temp_unknown_df['index2']
            temp_unknown_df['Barcode_I2_RC'] = \
                temp_unknown_df['index'] + '-' + temp_unknown_df['index2_RC']
        else:
            temp_unknown_df['Barcode'] = \
                temp_unknown_df['index'].copy()
            # Store the reverse complement under its own column; writing
            # it to 'Barcode' would leave 'Barcode_I1_RC' undefined.
            temp_unknown_df['Barcode_I1_RC'] = \
                temp_unknown_df['index_RC'].copy()
            temp_unknown_df['Barcode_I2_RC'] = ''
        temp_unknown_df = \
            temp_unknown_df[[
                'Lane',
                '# Reads',
                'Barcode',
                'Barcode_I1_RC',
                'Barcode_I2_RC']].\
            sort_values(
                ['Lane', '# Reads'],
                ascending=[True, False])
        return temp_unknown_df
    except Exception as e:
        raise ValueError(e) from e


def get_undetermined_plots(
        unknown_df: pd.DataFrame,
        get_plots: bool = True) \
        -> dict:
    """Build per-lane bar-chart data of undetermined barcode read counts.

    Barcode labels are ``index-index2`` when an ``index2`` column exists,
    otherwise ``index``. When the input has no ``Lane`` column all rows
    are treated as lane 1.

    :param unknown_df: DataFrame with ``index`` and ``# Reads`` columns
        and optionally ``index2`` and ``Lane`` columns
    :param get_plots: Return rendered plots when True, otherwise the raw
        Chart.js data dictionaries
    :returns: A dictionary keyed by integer lane id
    :raises ValueError: On any failure while building the data or plots
    """
    try:
        # Every chart has a single dataset, so one colour is sufficient.
        bar_colour = 'rgba(255, 99, 132, 0.8)'
        options = {
            'scales': {
                'y': {
                    'beginAtZero': True
                },
                'xAxes': [{
                    'ticks': {
                        'fontSize': 8
                    }
                }]
            },
            'responsive': True,
            'plugins': {
                'legend': {
                    'position': 'top'}
            }}
        temp_unknown_df = unknown_df.copy()
        if 'index2' in temp_unknown_df.columns:
            temp_unknown_df['Index'] = \
                temp_unknown_df['index'] + '-' + temp_unknown_df['index2']
        else:
            temp_unknown_df['Index'] = \
                temp_unknown_df['index'].copy()
        # The plotter is only needed when rendered plots are requested.
        cj_plotter = ChartJSPlotter() if get_plots else None
        if 'Lane' in temp_unknown_df.columns:
            lane_groups = temp_unknown_df.groupby('Lane')
        else:
            lane_groups = [(1, temp_unknown_df)]
        undetermined_plots = dict()
        for lane_id, lane_df in lane_groups:
            data = {
                "labels": lane_df['Index'].values.tolist(),
                "datasets": [{
                    'label': 'Undetermined - Lane {0}'.format(lane_id),
                    'data': lane_df['# Reads'].values.tolist(),
                    "backgroundColor": bar_colour,
                    "borderColor": bar_colour}]}
            if get_plots:
                undetermined_plots[int(lane_id)] = \
                    cj_plotter.plot(
                        data,
                        'bar',
                        options=options,
                        w=800,
                        h=400)
            else:
                undetermined_plots[int(lane_id)] = data
        return undetermined_plots
    except Exception as e:
        raise ValueError(e) from e


def get_hop_plot(
        hop_df: pd.DataFrame,
        get_plot: bool = True) \
        -> dict:
    """Build bar-chart data of index-hopping read counts per lane.

    Reads are summed per lane and barcode (``index-index2`` when an
    ``index2`` column exists, otherwise ``index``). The chart has one
    dataset per lane over the union of all barcodes; barcodes not seen in
    a lane count as 0. When the input has no ``Lane`` column all rows are
    treated as lane 1.

    :param hop_df: DataFrame with ``index`` and ``# Reads`` columns and
        optionally ``index2`` and ``Lane`` columns
    :param get_plot: Return the rendered plot when True, otherwise the raw
        Chart.js data dictionary
    :returns: The plot, or a dictionary with ``labels`` and ``datasets``
    :raises ValueError: On any failure while building the data or plot
    """
    try:
        options = {
            'scales': {
                'y': {
                    'beginAtZero': True
                },
                'xAxes': [{
                    'ticks': {
                        'fontSize': 8
                    }
                }]
            },
            'responsive': True,
            'plugins': {
                'legend': {
                    'position': 'top'}
            }}
        # Fill and border colours are identical, so one palette is enough.
        palette = [
            'rgba(255, 99, 132, 0.8)',
            'rgba(54, 162, 235, 0.8)',
            'rgba(255, 206, 86, 0.8)',
            'rgba(75, 192, 192, 0.8)',
            'rgba(153, 102, 255, 0.8)',
            'rgba(255, 159, 64, 0.8)',
            'rgba(255, 159, 10, 0.8)',
            'rgba(255, 159, 192, 0.8)']
        temp_hop_df = hop_df.copy()
        if 'index2' in temp_hop_df:
            temp_hop_df['Index'] = \
                temp_hop_df['index'] + '-' + temp_hop_df['index2']
        else:
            temp_hop_df['Index'] = \
                temp_hop_df['index'].copy()
        if 'Lane' in temp_hop_df:
            lane_groups = temp_hop_df.groupby('Lane')
        else:
            lane_groups = [(1, temp_hop_df)]
        # One row per lane, one column per barcode.
        lane_rows = list()
        for lane_id, lane_df in lane_groups:
            row = {'Lane': lane_id}
            row.update(
                lane_df.groupby('Index')['# Reads'].sum().to_dict())
            lane_rows.append(row)
        lane_table = pd.DataFrame(lane_rows).fillna(0)
        barcodes = [c for c in lane_table.columns if c != 'Lane']
        datasets = list()
        for position, (lane_id, lane_df) in \
                enumerate(lane_table.groupby('Lane')):
            # Cycle through the palette so the colour lookup cannot run
            # past its end.
            colour = palette[position % len(palette)]
            datasets.append({
                'label': 'Hopping - Lane {0}'.format(lane_id),
                'data': lane_df[barcodes].values.tolist()[0],
                "backgroundColor": colour,
                "borderColor": colour})
        data = {
            'labels': barcodes,
            'datasets': datasets}
        if get_plot:
            # The plotter is only created when a rendered plot is requested.
            data = \
                ChartJSPlotter().plot(
                    data,
                    'bar',
                    options=options,
                    w=800,
                    h=400)
        return data
    except Exception as e:
        raise ValueError(e) from e

def get_demult_report_and_plots_for_bclconvert(
        run_dir: str,
        reports_dir: str) -> tuple:
    """Collect every table and plot used by the de-multiplexing report.

    :param run_dir: Sequencing run directory; it must exist and is handed
        to get_interop_index_stats
    :param reports_dir: Directory holding Demultiplex_Stats.csv,
        Quality_Metrics.csv, Top_Unknown_Barcodes.csv and SampleSheet.csv,
        and optionally Index_Hopping_Counts.csv
    :returns: A tuple of nine items, in this order:

        * output of get_flowcell_summary_plot
        * output of get_flowcell_project_summary_plot
        * output of merge_known_samples
        * output of get_flowcell_project_summary_table
        * output of get_sample_dist_plots
        * output of get_undetermined_plots
        * output of get_undetermined_table
        * combined index hopping counts (empty DataFrame when the
          optional csv is absent)
        * output of get_hop_plot (empty dict when there are no index
          hopping records)
    :raises ValueError: Wraps any failure, including a missing run dir or
        a missing required report file
    """
    try:
        if not os.path.exists(run_dir):
            raise IOError('Missing run dir {0}'.format(run_dir))
        required_report_csvs = [
            'Demultiplex_Stats.csv',
            'Quality_Metrics.csv',
            'Top_Unknown_Barcodes.csv',
            'SampleSheet.csv']
        # Fail before any parsing starts if a mandatory report is absent
        for f in required_report_csvs:
            filepath = \
                os.path.join(reports_dir, f)
            if not os.path.exists(filepath):
                raise IOError('Missing report file {0}'.format(filepath))
        combined_demux_df = \
            combine_bclconvert_demultiplex_stats_csv([
                os.path.join(reports_dir, 'Demultiplex_Stats.csv')])
        combined_qmetrics_df = \
            combine_bclconvert_quality_metrics_csv([
                os.path.join(reports_dir, 'Quality_Metrics.csv')])
        combined_unknown_df = \
            combine_bclconvert_top_unknown_barcodes_csv([
                os.path.join(reports_dir, 'Top_Unknown_Barcodes.csv')])
        # Index hopping counts are optional, so start from empty defaults
        combined_ihop_df = pd.DataFrame()
        hop_plot = {}
        ihop_csv = \
            os.path.join(reports_dir, 'Index_Hopping_Counts.csv')
        if os.path.exists(ihop_csv):
            combined_ihop_df = \
                combine_bclconvert_index_hopping_counts_csv([ihop_csv])
            if len(combined_ihop_df.index) > 0:
                hop_plot = \
                    get_hop_plot(combined_ihop_df)
        samplesheet_df = \
            get_samplesheet_records(
                samplesheets=[os.path.join(reports_dir, 'SampleSheet.csv')])
        flowcell_summary_data = \
            get_interop_index_stats(run_dir=run_dir, reports_dir=reports_dir)
        flowcell_summary_data_plot = \
            get_flowcell_summary_plot(flowcell_summary_data)
        merged_df = \
            merge_known_samples(
                demultiplex_stats_df=combined_demux_df,
                quality_metrics_df=combined_qmetrics_df,
                samplesheet_df=samplesheet_df)
        flowcell_project_summary_plot = \
            get_flowcell_project_summary_plot(merged_df)
        flowcell_project_summary_table = \
            get_flowcell_project_summary_table(merged_df)
        sample_dist_plots = \
            get_sample_dist_plots(merged_df)
        undetermined_plots = \
            get_undetermined_plots(combined_unknown_df)
        undetermined_table = \
            get_undetermined_table(combined_unknown_df)
        return flowcell_summary_data_plot, flowcell_project_summary_plot, \
            merged_df, flowcell_project_summary_table, sample_dist_plots, \
            undetermined_plots, undetermined_table, combined_ihop_df, \
            hop_plot
    except Exception as e:
        raise ValueError(e)

## wikipedia code
def hamming_distance(s1: str, s2: str) -> int:
    """Count the positions at which two equal-length sequences differ.

    :param s1: First sequence
    :param s2: Second sequence, must be as long as ``s1``
    :returns: Number of mismatching positions
    :raises ValueError: If the two sequences differ in length
    """
    if len(s1) == len(s2):
        mismatches = 0
        for base1, base2 in zip(s1, s2):
            if base1 != base2:
                mismatches += 1
        return mismatches
    raise ValueError("Undefined for sequences of unequal length.")

def calculate_min_hamming_distance(samplesheet_path: str) -> list:
    """Calculate the minimum pairwise index Hamming distance per group.

    Samplesheet rows are grouped by Lane (when that column exists),
    Sample_Project, combined index length and Description. Within each
    group every pair of rows is compared; ``index`` and, when present,
    ``index2`` are compared separately and the smallest distance seen is
    reported.

    :param samplesheet_path: Path of the samplesheet, passed to
        get_samplesheet_records
    :returns: A list of dictionaries, one per group, holding the group
        column values plus ``min_hamming_distance``. The value starts at
        10, so a group with a single row, or one whose barcodes all
        differ at ten or more positions, reports 10
    :raises ValueError: Wraps any failure, e.g. barcodes of unequal
        length inside one group
    """
    try:
        samplesheet_data = \
            get_samplesheet_records(samplesheets=[samplesheet_path,])
        index_columns = ['index']
        if 'index2' in samplesheet_data.columns:
            samplesheet_data['index_length'] = \
                samplesheet_data.\
                apply(lambda x: len(x['index']) + len(x['index2']), axis=1)
            index_columns.append('index2')
        else:
            samplesheet_data['index_length'] = \
                samplesheet_data.\
                apply(lambda x: len(x['index']), axis=1)
        if 'Lane' in samplesheet_data.columns:
            group_columns = [
                'Lane', 'Sample_Project', 'index_length', 'Description']
        else:
            group_columns = [
                'Sample_Project', 'index_length', 'Description']
        output_rows = list()
        for grp_name, g_data in samplesheet_data.groupby(group_columns):
            min_hamming_dist = 10
            index_data = \
                g_data[index_columns].to_dict(orient='records')
            for i, record_i in enumerate(index_data):
                # Pair each record with every later one, up to and
                # including the last record of the group
                for record_j in index_data[i + 1:]:
                    # index and index2 are scored independently
                    for column in index_columns:
                        hamming_dist = \
                            hamming_distance(
                                s1=record_i.get(column),
                                s2=record_j.get(column))
                        if min_hamming_dist > hamming_dist:
                            min_hamming_dist = hamming_dist
            group_row = dict(zip(group_columns, grp_name))
            group_row.update({'min_hamming_distance': min_hamming_dist})
            output_rows.append(group_row)
        return output_rows
    except Exception as e:
        raise ValueError(e)

In [None]:
# Template placeholders - presumably replaced with real paths when this
# notebook template is rendered; confirm against the templating step.
# reports_dir is where the report csv files and SampleSheet.csv are read from,
# run_dir is passed on as the sequencing run directory.
reports_dir = '{{ REPORTS_DIR }}'
run_dir = '{{ RUN_DIR }}'

In [None]:
# Build all report tables and plots once; the cells below only render them.
(
    flowcell_summary_data_plot,
    flowcell_project_summary_plot,
    merged_df,
    flowcell_project_summary_table,
    sample_dist_plots,
    undetermined_plots,
    undetermined_table,
    combined_ihop_df,
    hop_plot,
) = get_demult_report_and_plots_for_bclconvert(
    run_dir=run_dir,
    reports_dir=reports_dir,
)

## Flowcell total reads vs passing filter reads

In [None]:
# Render the flowcell summary plot built by get_flowcell_summary_plot
display(flowcell_summary_data_plot)

## Project summary plot

In [None]:
# Render the project summary plot built by get_flowcell_project_summary_plot
display(flowcell_project_summary_plot)

## Project summary for lane

In [None]:
# Show the project summary table as HTML; index=False asks to_html to leave
# out the row index
HTML(flowcell_project_summary_table.to_html(index=False))

## Hamming distance

In [None]:
# One dictionary per samplesheet group, each carrying a
# 'min_hamming_distance' entry, computed from the SampleSheet.csv found in
# reports_dir
output_rows = \
    calculate_min_hamming_distance(
        samplesheet_path=os.path.join(reports_dir, 'SampleSheet.csv'))

def style_low_hamming_distance(s: pd.Series, props: str = '', cut_off: int = 3) -> np.ndarray:
    """Return a per-cell CSS string flagging values below ``cut_off``.

    :param s: Values to check, one column as handed over by ``Styler.apply``
    :param props: CSS declarations to use for flagged cells
    :param cut_off: Values strictly below this threshold are flagged
    :returns: Array holding ``props`` where ``s < cut_off`` and an empty
        string elsewhere
    """
    return np.where(s < cut_off, props, '')

# The same mask is applied twice so flagged cells get both red text and a
# yellow background.
# NOTE(review): pandas documents extra keyword arguments of Styler.to_html as
# being passed on to the template renderer, so index=False may not drop the
# index column here - confirm against the installed pandas version.
html = (
    pd.DataFrame(output_rows)
    .style
    .apply(
        style_low_hamming_distance,
        props='color:red;',
        cut_off=3,
        axis=0,
        subset=['min_hamming_distance'])
    .apply(
        style_low_hamming_distance,
        props='background-color:#ffffb3;',
        cut_off=3,
        axis=0,
        subset=['min_hamming_distance'])
    .to_html(index=False)
)
HTML(html)

## Sample read counts

A list of samples with index barcodes and read count information can be found here. Please note that this table is hidden by default.

In [None]:
def style_low_read(s: pd.Series, props: str = '', cut_off: int = 500) -> np.ndarray:
    """Return a per-cell CSS string flagging values at or below ``cut_off``.

    :param s: Values to check, one column as handed over by ``Styler.apply``
    :param props: CSS declarations to use for flagged cells
    :param cut_off: Values less than or equal to this threshold are flagged
    :returns: Array holding ``props`` where ``s <= cut_off`` and an empty
        string elsewhere
    """
    return np.where(s <= cut_off, props, '')

# Flag low read counts with red text on a yellow background, then wrap the
# table in a <details> element so it is collapsed until the reader expands it.
# NOTE(review): pandas documents extra keyword arguments of Styler.to_html as
# being passed on to the template renderer, so index=False may not drop the
# index column here - confirm against the installed pandas version.
html = (
    merged_df.style
    .apply(
        style_low_read,
        props='color:red;',
        cut_off=500,
        axis=0,
        subset=['# Reads'])
    .apply(
        style_low_read,
        props='background-color:#ffffb3;',
        cut_off=500,
        axis=0,
        subset=['# Reads'])
    .to_html(index=False)
)
html = ''.join([
    '<details><summary>Click to expand sample read count table</summary>',
    html,
    '</details>'])
HTML(html)

## Sample read count bar plot for lane

In [None]:
# sample_dist_plots maps a lane id to its plot; print a lane label before
# rendering each one
for lane_id, p in sample_dist_plots.items():
    print('Lane {0}'.format(lane_id))
    display(p)

## Undetermined reads

A list of undetermined barcodes with read count information can be found here. This table is hidden by default.

In [None]:
# Wrap the undetermined barcode table in a <details> element so it is
# collapsed until the reader expands it.
html = ''.join([
    '<details><summary>Click to expand undetermined read count table</summary>',
    undetermined_table.to_html(index=False),
    '</details>'])
HTML(html)

## Undetermined read count bar plot for lane

In [None]:
# undetermined_plots maps a lane id to its plot; print a lane label before
# rendering each one
for lane_id, p in undetermined_plots.items():
    print('Lane {0}'.format(lane_id))
    display(p)

## Index hopping summary

In [None]:
# combined_ihop_df stays an empty DataFrame when Index_Hopping_Counts.csv is
# not present in reports_dir, in which case this shows an empty table
HTML(combined_ihop_df.to_html(index=False))

## Index hopping bar plot for lane

In [None]:
# Bare expression so the notebook shows the value; hop_plot is left as an
# empty dict when no index hopping records were loaded
hop_plot