In [1]:
!pip --quiet install "MPRAlib==0.6.0"

# Quality Metrics

This Python notebook uses an MPRA dataset to compute some general quality metrics.

In [2]:
# Loading the MPRAlib library
from mpralib.mpradata import MPRABarcodeData
# Loading other libraries
import pandas as pd
import numpy as np

# Read the barcode-level count table from disk, then take its `oligo_data`
# attribute, which is the object every metric cell below works on.
barcode_counts_file = "../resources/barcode_counts.tsv.gz"
mpra_barcode_data = MPRABarcodeData.from_file(barcode_counts_file)
mpra_oligo_data = mpra_barcode_data.oligo_data

## Pearson Correlation


Summarize the pairwise correlations between replicates with the median, or maybe the minimum, across replicate pairs.

In [3]:
mpra_oligo_data.barcode_threshold = 10

# Compute the correlation matrix once and reuse it for both summaries.
replicate_correlation = mpra_oligo_data.correlation()

# Keep every replicate pair exactly once: the strict upper triangle (k=1)
# skips the diagonal and the mirrored lower triangle. For a 3 x 3 matrix this
# selects the same entries, in the same order, as the flattened indices
# [1, 2, 5], but it also works for any other number of replicates.
# NOTE(review): assumes correlation() returns a square 2-D
# (replicate x replicate) NumPy array — confirm against the MPRAlib docs.
pairwise_correlations = replicate_correlation[np.triu_indices_from(replicate_correlation, k=1)]

print("Median Pearson correlation across replicates:")
print(np.median(pairwise_correlations).round(3))

print("Min Pearson correlation across replicates:")
print(pairwise_correlations.min().round(3))

Median Pearson correlation across replicates:
0.977
Min Pearson correlation across replicates:
0.976


## Median barcodes per oligo

How to compute:

I am sure that we do NOT apply the barcode (BC) threshold here: the cell below sets it to 1, so only oligos without any barcode are excluded.

- use average median across replicates
- use the smallest median
- compute median on all

In [4]:
# A threshold of 1 means "no real barcode filter": only oligos without any
# barcode in the replicate are dropped.
mpra_oligo_data.barcode_threshold = 1

# Per replicate: median barcode count over the oligos that pass the threshold.
n_barcodes_replicate = []
for replicate in mpra_oligo_data.obs_names:
    replicate_data = mpra_oligo_data.data[replicate, :]
    replicate_data = replicate_data[:, replicate_data.layers["barcode_counts"] >= mpra_oligo_data.barcode_threshold]
    n_barcodes_replicate.append(np.median(replicate_data.layers["barcode_counts"]))

print("Median across replicates of median barcodes per oligo:")
print(int(np.median(n_barcodes_replicate)))

print("Minimum across replicates of median barcodes per oligo:")
print(int(np.min(n_barcodes_replicate)))

# Pool the passing barcode counts of all replicates into one flat array.
# Named `pooled_barcode_counts` rather than `all`, so the builtin `all()` is not
# shadowed for the rest of the kernel session.
pooled_barcode_counts = mpra_oligo_data.barcode_counts[
    mpra_oligo_data.barcode_counts >= mpra_oligo_data.barcode_threshold
].flatten()

print(f"Median on all (flatten) Barcode counts where BC threshold >= {mpra_oligo_data.barcode_threshold}:")
# With a threshold >= 1 the `!= 0` filter removes nothing; it is kept so the
# result stays the same if the threshold is ever lowered to 0.
print(int(np.median(pooled_barcode_counts[pooled_barcode_counts != 0])))

Median across replicates of median barcodes per oligo:
159
Minimum across replicates of median barcodes per oligo:
158
Median on all (flatten) Barcode counts where BC threshold >= 1:
159


## Median RNA Counts per oligo

How to compute:

- use average median across replicates
- compute median on all

Should the BC threshold be used or not?

In [5]:
# Only oligos supported by at least 10 barcodes in a replicate are considered.
mpra_oligo_data.barcode_threshold = 10

# Per replicate: median RNA count over the oligos that pass the threshold.
n_rna_replicate = []
for replicate in mpra_oligo_data.obs_names:
    replicate_data = mpra_oligo_data.data[replicate, :]
    replicate_data = replicate_data[:, replicate_data.layers["barcode_counts"] >= mpra_oligo_data.barcode_threshold]
    n_rna_replicate.append(np.median(replicate_data.layers["rna"]))

print("Median across replicates of median RNA counts per oligo:")
print(int(np.median(n_rna_replicate)))

print("Min across replicates of median RNA counts per oligo:")
print(int(np.min(n_rna_replicate)))


# Pool the RNA counts of all passing (replicate, oligo) entries into one flat
# array. Named `pooled_rna_counts` rather than `all`, so the builtin `all()` is
# not shadowed for the rest of the kernel session.
pooled_rna_counts = mpra_oligo_data.rna_counts[
    mpra_oligo_data.barcode_counts >= mpra_oligo_data.barcode_threshold
].flatten()

print(f"Median on all (flatten) RNA counts where BC threshold >= {mpra_oligo_data.barcode_threshold}:")
# NOTE(review): zero RNA counts are excluded here but not in the per-replicate
# medians above — confirm that this difference is intended.
print(int(np.median(pooled_rna_counts[pooled_rna_counts != 0])))

Median across replicates of median RNA counts per oligo:
1952
Min across replicates of median RNA counts per oligo:
1748
Median on all (flatten) RNA counts where BC threshold >= 10:
1890


## Percent Oligos Passing

What is the total n? 

- All oligos in the assignment
- All oligos with at least 1 barcode in any sample (i.e. all observed oligos). In that case the filter is effectively set by the BC threshold alone.

How to compute across replicates?

- Use the mean of the replicates.
- Merge replicates and compute from them.
- Use only oligos where the number of barcodes is larger than the threshold in ALL replicates.

In [6]:
# Denominator: number of distinct oligo values in the barcode-level data.
n_oligos = len(mpra_barcode_data.oligos.values.unique())
print(n_oligos)
# Alternative denominator that was considered:
# len(mpradata.grouped_data.var["oligo"])
# NOTE(review): `mpradata` is not defined anywhere in the visible notebook —
# confirm the intended object before enabling this line.

mpra_oligo_data.barcode_threshold = 10

# Per replicate: count the oligos whose barcode count reaches the threshold.
n_oligos_replicate = []
for replicate in mpra_oligo_data.obs_names:
    replicate_data = mpra_oligo_data.data[replicate, :]
    passes_threshold = replicate_data.layers["barcode_counts"] >= mpra_oligo_data.barcode_threshold
    replicate_data = replicate_data[:, passes_threshold]
    n_oligos_replicate.append(len(replicate_data.var["oligo"]))

# Percent of oligos passing, summarized by the median replicate ...
median_passing = np.median(n_oligos_replicate)
print(round(median_passing / n_oligos * 100, 2))

# ... and by the replicate with the fewest passing oligos.
min_passing = np.min(n_oligos_replicate)
print(round(min_passing / n_oligos * 100, 2))

7440
97.96
97.94
