In [12]:
!pip install mygene statannotations scrublet scanpy scvelo decoupler matplotlib_venn goatools gseapy scperturb biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet

In [13]:
# Standard library
import subprocess
import os
import sys
# Third-party: plotting, single-cell analysis and numerics.
# Note that pyplot is aliased as `pl` here (not the more common `plt`).
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
# NOTE(review): presumably reduces scvelo's logging to warnings/errors only —
# confirm the meaning of level 1 against scvelo's settings documentation.
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
# display / HTML / clear_output all live in the public `IPython.display` module.
# Importing `display` from `IPython.core.display` is deprecated (since IPython 7.14),
# so everything is imported from the public location instead.
from IPython.display import HTML, clear_output, display
# Inject CSS that sets elements of class `container` to 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
# Insert the parent directory at position 1 of the module search path so that
# `utils` can be imported from there. The star import copies every public name
# of `utils` into the notebook namespace.
sys.path.insert(1, '../')
from utils import *

# scperturb package
# Same mechanism for '../package/src/'. Names brought in by this star import
# override same-named ones from `utils` above.
# NOTE(review): `scperturb` is also pip-installed in the first cell; which copy
# gets imported depends on sys.path order — presumably this local one, confirm.
sys.path.insert(1, '../package/src/')
from scperturb import *

# NOTE: `Path` is already imported earlier in this cell; this repeat is redundant but harmless.
from pathlib import Path
# Relative location of the figures directory (presumably where plots get saved — confirm with later cells).
figure_path = Path('../figures/')

In [14]:
# Root directory that holds the downloaded raw datasets.
# Defaults to the original cluster location; set the SCPERTURB_TEMPDIR
# environment variable to run the notebook against a different location.
TEMPDIR = Path(os.environ.get('SCPERTURB_TEMPDIR', '/scratch/peidli/scPerturb/'))

In [15]:
# Show the sorted names of everything found in the dataset's folder under TEMPDIR.
sorted(entry.name for entry in TEMPDIR.joinpath('UrsuBoehm2022/').glob('*'))

['GSE161824_A549_KRAS.processed.cells.csv',
 'GSE161824_A549_KRAS.processed.cells.metadata.csv',
 'GSE161824_A549_KRAS.processed.genes.csv',
 'GSE161824_A549_KRAS.processed.genes.metadata.csv',
 'GSE161824_A549_KRAS.processed.matrix.mtx',
 'GSE161824_A549_KRAS.rawcounts.cells.csv',
 'GSE161824_A549_KRAS.rawcounts.genes.csv',
 'GSE161824_A549_KRAS.rawcounts.matrix.mtx',
 'GSE161824_A549_KRAS.variants2cell.csv',
 'GSE161824_A549_TP53.processed.cells.csv',
 'GSE161824_A549_TP53.processed.cells.metadata.csv',
 'GSE161824_A549_TP53.processed.genes.csv',
 'GSE161824_A549_TP53.processed.genes.metadata.csv',
 'GSE161824_A549_TP53.processed.matrix.mtx',
 'GSE161824_A549_TP53.rawcounts.cells.csv',
 'GSE161824_A549_TP53.rawcounts.genes.csv',
 'GSE161824_A549_TP53.rawcounts.matrix.mtx',
 'GSE161824_A549_TP53.variants2cell.csv',
 'GSE161824_RAW.tar',
 'GSE161824_SCEVIP.README.pdf',
 'filelist.txt']

In [65]:
# Read Data
# Build one AnnData object per screen (KRAS, TP53) from the raw-count exports:
# count matrix, gene table, cell barcodes and the per-cell variant table.
# Results are collected in `adatas`, keyed by screen name.
from scipy.io import mmread
from scipy.sparse import csr_matrix

keys = ['KRAS', 'TP53']
adatas = {}
for key in tqdm(keys):
    # All input files of one screen share this folder and file-name stem.
    stem = f'UrsuBoehm2022/GSE161824_A549_{key}'

    # index_col=0 turns the first column of each table into its index.
    var = pd.read_csv(TEMPDIR / f'{stem}.rawcounts.genes.csv', index_col=0, names=['gene_symbol'])
    # Keep the counts in CSR form.
    X = csr_matrix(mmread(TEMPDIR / f'{stem}.rawcounts.matrix.mtx'))

    obs = pd.read_csv(TEMPDIR / f'{stem}.rawcounts.cells.csv', index_col=0, names=['cell_barcode'])
    variants = pd.read_csv(TEMPDIR / f'{stem}.variants2cell.csv', sep='\t').set_index('cell')

    # The variant table must describe exactly the same cells, in the same order,
    # as the barcode list. Index.equals also copes with tables of different
    # length, where an element-wise comparison would raise instead of failing the check.
    assert obs.index.equals(variants.index), (
        f'{key}: cell barcodes and variants2cell table are not aligned'
    )

    # Everything between the first two and the last two columns is split off
    # and stored separately in obsm; the remaining columns become cell metadata.
    var_counts = variants.iloc[:, 2:-2]
    variants = variants.drop(columns=var_counts.columns)
    obs = pd.concat([obs, variants], axis=1)

    adata = sc.AnnData(X=X, obs=obs, var=var)
    adata.obsm['Variant_Counts'] = var_counts
    adatas[key] = adata

In [67]:
# Work with the KRAS screen from here on. `adata` is a reference to the object
# stored in `adatas`, not a copy, so in-place changes are visible through both names.
adata = adatas['KRAS']

In [69]:
# Display the per-cell metadata table of the currently selected AnnData object.
adata.obs

Unnamed: 0,batch,n_counts,variant,variant.detailed_multi
AAACCTGCAACGCACC-1-0,0,25694.0,unassigned,unassigned
AAACCTGCAATGGTCT-1-0,0,34868.0,T50T,T50T
AAACCTGCAGGACGTA-1-0,0,25170.0,unassigned,unassigned
AAACCTGCAGTAAGAT-1-0,0,31500.0,unassigned,unassigned
AAACGGGAGACAGAGA-1-0,0,22654.0,T127T,T127T
...,...,...,...,...
TTTGTCAAGAGGGCTT-1-31,31,15471.0,M170L,M170L
TTTGTCAAGGGATACC-1-31,31,21986.0,Q99E,Q99E
TTTGTCACACATGACT-1-31,31,17064.0,G13R,G13R
TTTGTCACAGAAGCAC-1-31,31,23769.0,Y166H,Y166H


In [None]:
# Rename the obs column 'n_counts' to 'ncounts' in place. Given the preceding
# `adata = adatas['KRAS']` cell, only the KRAS object is affected by this.
adata.obs.rename(columns={'n_counts': 'ncounts'}, inplace=True)

In [71]:
# Distinct values of the 'variant' column, in order of first appearance.
adata.obs['variant'].unique()

array(['unassigned', 'T50T', 'T127T', 'G13V', 'Q22K', 'T20M', 'Q61P',
       'S17S', 'Q99E', 'G77A', 'M111L', 'Q61A', 'D30D', 'I36M', 'A146V',
       'E31K', 'G12A', 'G60D', 'R68S', 'P34R', 'E63K', 'multiple',
       'F141L', 'A66A', 'K178K', 'T74A', 'D119G', 'WT', 'G13C', 'G75A',
       'K88K', 'L159S', 'V112I', 'S136N', 'I163S', 'T158A', 'A146P',
       'D173D', 'R135T', 'A146T', 'A59G', 'R164Q', 'G12F', 'Q61R',
       'K117R', 'T74T', 'K117N', 'C118S', 'AG59GV', 'T144T', 'N26Y',
       'K169K', 'A155G', 'G13R', 'K176Q', 'A130V', 'D57N', 'T50I', 'L52F',
       'T144P', 'R149K', 'G12I', 'G13E', 'P110S', 'T50P', 'V8V', 'P34L',
       'G60S', 'K147N', 'V14L', 'G60V', 'T20R', 'L19F', 'G12S', 'L79I',
       'K147T', 'Q25H', 'G12Y', 'G12C', 'M170L', 'K179R', 'G12D', 'Q61L',
       'T20T', 'V14I', 'C185Y', 'Q61H', 'T158T', 'G12R', 'K5E', 'A59E',
       'Q22H', 'Y166H', 'R41K', 'A59T', 'Q61K', 'G12V', 'T58I', 'AG11TD',
       'D33E'], dtype=object)