In [7]:
!pip --quiet install MPRAlib

# Quality Metrics

This Python notebook uses an MPRA dataset to compute some general quality metrics.

In [8]:
# Loading the MPRAlib library
from mpralib.mpradata import MPRAdata
# Loading other libraries
import pandas as pd
import numpy as np

# Relative path of the gzip-compressed TSV file the MPRA data is read from.
mpra_file = "../resources/IGVFFI0402JCOL.tsv.gz"
# Build the MPRAdata object that every following cell works on.
mpradata = MPRAdata.from_file(mpra_file)


## Pearson Correlation

In [9]:
# Set the barcode threshold used by the correlation below.
mpradata.barcode_threshold = 10

# Positions 1, 2 and 5 of the flattened matrix are the three entries above the
# diagonal of a 3x3 matrix.
# NOTE(review): assumes a replicate-by-replicate matrix with exactly three
# replicates - confirm before reusing on other data sets.
pairwise_positions = [1, 2, 5]
flat_correlation = mpradata.pearson_correlation.flatten()
flat_correlation[pairwise_positions].mean().round(3)

np.float64(0.943)

## Median barcodes per oligo

How to compute:

The barcode (BC) threshold is NOT applied here (it is set to 1).

- Use the mean of the per-replicate medians.
- Compute the median over all replicates pooled together.

In [10]:
# A threshold of 1 is used so that the barcode threshold does not filter here.
# NOTE(review): assumes the threshold is a minimum barcode count - confirm.
mpradata.barcode_threshold = 1
grouped_data = mpradata.grouped_data

# For each replicate: median of the "barcodes" layer over its non-zero entries.
n_barcodes_replicate = []
for replicate in grouped_data.obs_names:
    replicate_data = grouped_data[replicate, :]
    replicate_data = replicate_data[:, replicate_data.layers["barcodes"] != 0]
    n_barcodes_replicate.append(np.median(replicate_data.layers["barcodes"]))

# The label now says "barcode": this cell summarises barcode counts, not RNA counts.
print("Mean of barcode count median per replicate:")
print(int(np.mean(n_barcodes_replicate)))

# Pool the non-zero barcode counts of all replicates into one flat array.
# Named `all_barcode_counts` rather than `all` so the builtin `all()` is not shadowed.
barcode_layer = grouped_data.layers["barcodes"]
all_barcode_counts = barcode_layer[barcode_layer != 0].flatten()

print(f"Median on all (flatten) Barcode counts where BC threshold >= {mpradata.barcode_threshold}:")
# The array already holds only non-zero values, so no second filter is needed.
print(int(np.median(all_barcode_counts)))

Mean of RNA count median per replicate:
120
Median on all (flatten) Barcode counts where BC threshold >= 1:
121


## Median RNA Counts per oligo

How to compute:

- Use the mean of the per-replicate medians.
- Compute the median over all replicates pooled together.

Should the BC threshold be used here or not?

In [11]:
# Apply a barcode threshold of 10 before summarising the RNA counts.
mpradata.barcode_threshold = 10
grouped_data = mpradata.grouped_data

# For each replicate: median of the "rna" layer over the entries whose
# "barcodes" value is non-zero.
n_rna_replicate = []
for replicate in grouped_data.obs_names:
    replicate_data = grouped_data[replicate, :]
    replicate_data = replicate_data[:, replicate_data.layers["barcodes"] != 0]
    n_rna_replicate.append(np.median(replicate_data.layers["rna"]))

print("Mean of RNA count median per replicate:")
print(int(np.mean(n_rna_replicate)))

# Pool the RNA counts of all replicates (entries with non-zero barcodes only).
# Named `all_rna_counts` rather than `all` so the builtin `all()` is not shadowed.
all_rna_counts = grouped_data.layers["rna"][grouped_data.layers["barcodes"] != 0].flatten()

print(f"Median on all (flatten) RNA counts where BC threshold >= {mpradata.barcode_threshold}:")
# Zero RNA counts are additionally dropped before taking the pooled median.
print(int(np.median(all_rna_counts[all_rna_counts != 0])))

Mean of RNA count median per replicate:
5584
Median on all (flatten) RNA counts where BC threshold >= 10:
5573


## Percent Oligos Passing

What is the total n? 

- All oligos in the assignment
- All oligos with at least 1 barcode in any sample (i.e. all observed oligos). In that case it is essentially the `bc_threshold` that sets the filter.

How to compute across replicates?

- Use the mean of the replicates.
- Merge replicates and compute from them.
- Use only oligos where the number of barcodes is larger than the threshold in ALL replicates.

In [12]:
# Denominator: number of distinct values in `mpradata.oligos`.
n_oligos = len(mpradata.oligos.values.unique())
print(n_oligos)
# Alternative denominator:
# len(mpradata.grouped_data.var["oligo"])

mpradata.barcode_threshold = 10
grouped_data = mpradata.grouped_data

# Count, per replicate, the oligos whose "barcodes" entry is non-zero.
n_oligos_replicate = []
for replicate in grouped_data.obs_names:
    replicate_slice = grouped_data[replicate, :]
    nonzero_mask = replicate_slice.layers["barcodes"] != 0
    passing_oligos = replicate_slice[:, nonzero_mask].var["oligo"]
    n_oligos_replicate.append(len(passing_oligos))

# Mean passing count across replicates, as a percentage of all oligos.
mean_passing = np.mean(n_oligos_replicate)
percent_passing = round(mean_passing / n_oligos * 100, 2)
float(percent_passing)

11713


96.45

## median_assigned_barcodes (Assignment)

## fraction_assigned_oligos (assignment)