In [2]:
import sys
import time
import importlib

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
from loguru import logger

# Make the project-local modules under ../pypro importable. The membership
# check keeps sys.path free of duplicate entries when this cell is re-executed.
if "../pypro/" not in sys.path:
    sys.path.append("../pypro/")

# Timestamp string (YYYYmmdd_HHMMSS) captured when this cell runs.
# NOTE(review): this name shadows the stdlib `datetime` module name; it is kept
# unchanged here because other cells may reference it.
datetime = time.strftime("%Y%m%d_%H%M%S")

# Drop loguru's existing handlers, then log to stdout with a compact
# "time | level | message" format. logger.add() returns the id of the new
# handler, which the notebook echoes as this cell's output.
logger.remove()
logger.add(sys.stdout, format="<green>{time:HH:mm:ss.SS}</green> | <level>{level}</level> | {message}")

2

In [2]:
# Project-local module under test; it resolves through the "../pypro/" entry on sys.path.
import filtering
# Re-execute the module so edits made to filtering.py are picked up without
# restarting the kernel. The returned module object is shown as the cell output.
importlib.reload(filtering)

<module 'filtering' from '/Users/jnimoca/Jose_BI/3_Python_Functions/PyProteomics/JupyterNotebooks/../pypro/filtering.py'>

# Test 2.1 Filtering with grouping

In [72]:
# Test 2.1: start from the shared test input so the cell can be re-run on its own.
adata = ad.read_h5ad("../data/testdata/test2_filtering/test2_input/adata.h5ad")

# Filter proteins by valid-value fraction, evaluated per "Slide" group.
# Per the recorded log, a protein reaching the threshold in ANY group is kept,
# and a QC table is saved to qc_export_path.
adata = filtering.filter_invalid_proteins(
    adata=adata,
    threshold=0.7,
    grouping="Slide",
    qc_export_path="../data/testdata/test2_filtering/test2_output/test2.1_grouping_qc.csv",
)

# Persist the filtered object as the output of this test.
adata.write("../data/testdata/test2_filtering/test2_output/test2.1_adata.h5ad")

[32m18:14:17.22[0m | [1mINFO[0m | Filtering proteins, they need to have 70.0% valid values to be kept
[32m18:14:17.23[0m | [1mINFO[0m | Filtering proteins by groups, Slide: [2, 1]
[32m18:14:17.23[0m | [34m[1mDEBUG[0m | Processing group: 2
[32m18:14:17.23[0m | [34m[1mDEBUG[0m | Group 2 has 3 samples and 3858 proteins
[32m18:14:17.23[0m | [34m[1mDEBUG[0m | Processing group: 1
[32m18:14:17.23[0m | [34m[1mDEBUG[0m | Group 1 has 8 samples and 3858 proteins
[32m18:14:17.23[0m | [1mINFO[0m | Any protein that has a minimum of 70.0 valid values in ANY group, will be kept
[32m18:14:17.24[0m | [1mINFO[0m | 3467 proteins were kept
[32m18:14:17.24[0m | [1mINFO[0m | 391 proteins were removed
[32m18:14:17.24[0m | [1mINFO[0m | Saving dataframe with filtering results to ../data/testdata/test2_filtering/test2_output/test2.1_grouping_qc.csv




# Test 2.2 Filtering without grouping

In [73]:
# Test 2.2: start from the shared test input so the cell can be re-run on its own.
adata = ad.read_h5ad("../data/testdata/test2_filtering/test2_input/adata.h5ad")

# Same filter as the grouped test, but with no grouping argument: per the
# recorded log, the threshold is then applied across all samples at once.
adata = filtering.filter_invalid_proteins(
    adata=adata,
    threshold=0.7,
    qc_export_path="../data/testdata/test2_filtering/test2_output/test2.2_nogrouping_qc.csv",
)

# Persist the filtered object as the output of this test.
adata.write("../data/testdata/test2_filtering/test2_output/test2.2_adata.h5ad")

[32m18:14:19.01[0m | [1mINFO[0m | Filtering proteins, they need to have 70.0% valid values to be kept
[32m18:14:19.01[0m | [1mINFO[0m | No grouping variable was provided
[32m18:14:19.01[0m | [34m[1mDEBUG[0m | adata has 11 samples and 3858 proteins
3350 proteins were kept
508 proteins were filtered out
[32m18:14:19.02[0m | [1mINFO[0m | Saving dataframe with filtering results to ../data/testdata/test2_filtering/test2_output/test2.2_nogrouping_qc.csv


# Test 2.3 Removing contaminants

In [76]:
# Test 2.3: start from the shared test input so the cell can be re-run on its own.
adata = ad.read_h5ad("../data/testdata/test2_filtering/test2_input/adata.h5ad")

# Remove contaminant proteins. With print_summary=True the recorded output
# includes a table of the proteins that were filtered out.
adata = filtering.filter_out_contaminants(
    adata=adata,
    print_summary=True,
    qc_export_path="../data/testdata/test2_filtering/test2_output/test2.3_contaminants_qc.csv",
)

# Persist the filtered object as the output of this test.
adata.write("../data/testdata/test2_filtering/test2_output/test2.3_adata.h5ad")

[32m18:18:44.21[0m | [1mINFO[0m | Filtering out contaminants
the following proteins were filtered out:
+-----+----------------------+----------------------+-----------+
|     | Genes                | Protein.Names        | Species   |
|-----+----------------------+----------------------+-----------|
|   0 | nan                  | API_ACHLY            | ACHLY     |
|   1 | APOA1                | APOA1_BOVIN          | BOVIN     |
|   2 | ALB                  | ALBU_BOVIN           | BOVIN     |
|   3 | TF                   | TRFE_BOVIN           | BOVIN     |
|   4 | SRPP                 | SRPP_HEVBR           | HEVBR     |
|   5 | nan                  | REF_HEVBR            | HEVBR     |
|   6 | KRT85                | K2M3_SHEEP;KRT85_HUM | HUMAN     |
|     |                      | AN                   |           |
|   7 | KRT76;KRT84          | K22O_HUMAN;KRT84_HUM | HUMAN     |
|     |                      | AN                   |           |
|   8 | KRT17                | K1C1

# Test 2.4 Remove contaminants, keep some

**FIXME:** this test is not working yet.

In [44]:
# Pick up the latest edits to filtering.py before exercising it.
importlib.reload(filtering)

# Test 2.4: start from the shared test input so the cell can be re-run on its own.
adata = ad.read_h5ad("../data/testdata/test2_filtering/test2_input/adata.h5ad")

# Remove contaminants while asking for the genes in keep_genes to be retained
# (the recorded log reports them as "being kept").
adata = filtering.filter_out_contaminants(
    adata=adata,
    keep_genes=["KRT17", "KRT18", "FLG2"],
    print_summary=True,
    qc_export_path="../data/testdata/test2_filtering/test2_output/test2.4_keep_contaminants_qc.csv",
)

# Persist the filtered object as the output of this test.
adata.write("../data/testdata/test2_filtering/test2_output/test2.4_adata.h5ad")

[32m20:41:49.79[0m | [1mINFO[0m | Filtering out contaminants
[32m20:41:49.79[0m | [1mINFO[0m | Keeping ['KRT17', 'KRT18', 'FLG2'] from being removed
[32m20:41:49.79[0m | [1mINFO[0m | KRT17 being kept
[32m20:41:49.79[0m | [1mINFO[0m | Number of excluded contaminants: 18
[32m20:41:49.79[0m | [1mINFO[0m | KRT18 being kept
[32m20:41:49.79[0m | [1mINFO[0m | Number of excluded contaminants: 18
[32m20:41:49.79[0m | [1mINFO[0m | FLG2 being kept
[32m20:41:49.79[0m | [1mINFO[0m | Number of excluded contaminants: 19
the following proteins were filtered out:
+-----+----------------------+----------------------+-----------+
|     | Genes                | Protein.Names        | Species   |
|-----+----------------------+----------------------+-----------|
|   0 | nan                  | API_ACHLY            | ACHLY     |
|   1 | APOA1                | APOA1_BOVIN          | BOVIN     |
|   2 | TF                   | TRFE_BOVIN           | BOVIN     |
|   3 | ALB        