In [None]:
import scanpy as sc
import pandas as pd
import scvelo as scv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn


In [None]:
# Load the dataset stored in crypto_2.h5ad from the notebook data folder.
crypto_2 = sc.read(
    filename='../../../Data/notebooks_data/crypto_2.h5ad',
)

In [None]:
# Fraction of each cell's counts that comes from mitochondrial genes.
#
# Genes are selected by the 'MT-' symbol prefix rather than by the substring 'MT':
# a substring test also matches unrelated genes such as MTOR or DNMT1 and inflates
# the mitochondrial fraction.
# NOTE(review): assumes human-style gene symbols ('MT-ND1', ...) in var_names -- confirm.
MT = crypto_2.var_names.str.startswith('MT-')

# sum(axis=1) on a sparse matrix returns an (n_cells, 1) np.matrix; flatten both sums
# to plain 1-D arrays so the stored column is an ordinary numeric vector.
mito_counts = np.asarray(crypto_2[:, MT].X.sum(axis=1)).ravel()
all_counts = np.asarray(crypto_2.X.sum(axis=1)).ravel()

perc_mito = mito_counts / all_counts
crypto_2.obs['perc_mito'] = perc_mito

In [None]:
# QC overview: total counts against number of detected genes for every cell,
# with points coloured by the perc_mito column.
sc.pl.scatter(
    crypto_2,
    x='total_counts',
    y='n_genes_by_counts',
    color='perc_mito',
    title='Nr of transcripts vs Nr detected genes, coloured by mitocondrial content',
)

In [None]:
# Distribution of total counts per cell, drawn with 50 histogram bins.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
# replace it) -- switch once the seaborn version of this environment is confirmed.
sns.distplot(crypto_2.obs.total_counts, bins=50)

In [None]:
# Distribution of the number of detected genes per cell, drawn with 50 histogram bins.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
# replace it) -- switch once the seaborn version of this environment is confirmed.
sns.distplot(crypto_2.obs.n_genes_by_counts, bins=50)

In [None]:
# Quality-control thresholds consumed by the plotting and filtering cells below.
MIN_COUNTS, MAX_COUNTS = 3000, 15000   # accepted range of total counts per cell
MIN_GENES, MAX_GENES = 2000, 5000      # accepted range of detected genes per cell
MAX_MITO = 0.1                         # upper bound applied to the perc_mito column

In [None]:
# Same QC scatter, restricted to the cells whose total counts are below MAX_COUNTS.
below_max_counts = crypto_2.obs['total_counts'] < MAX_COUNTS
sc.pl.scatter(
    crypto_2[below_max_counts],
    x='total_counts',
    y='n_genes_by_counts',
    color='perc_mito',
    title='Nr of transcripts vs Nr detected genes, coloured by mitocondrial content',
)

In [None]:
# Same QC scatter, restricted to the cells with more than MIN_GENES detected genes.
above_min_genes = crypto_2.obs['n_genes_by_counts'] > MIN_GENES
sc.pl.scatter(
    crypto_2[above_min_genes],
    x='total_counts',
    y='n_genes_by_counts',
    color='perc_mito',
    title='Nr of transcripts vs Nr detected genes, coloured by mitocondrial content',
)

In [None]:
# Apply the cell-level thresholds one criterion per call, in the same order as before:
# upper/lower bound on counts, then lower/upper bound on detected genes.
for criterion in (
    {'max_counts': MAX_COUNTS},
    {'min_counts': MIN_COUNTS},
    {'min_genes': MIN_GENES},
    {'max_genes': MAX_GENES},
):
    sc.pp.filter_cells(crypto_2, **criterion)

# Drop genes that are detected in fewer than 10 of the remaining cells.
sc.pp.filter_genes(crypto_2, min_cells=10)

# Finally keep only the cells whose perc_mito value is below MAX_MITO; .copy() turns
# the resulting view into an independent object.
low_mito = crypto_2.obs['perc_mito'] < MAX_MITO
crypto_2 = crypto_2[low_mito].copy()

In [None]:
# Display the object summary after the threshold-based filtering.
crypto_2

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.ensemble import IsolationForest

# Multivariate outlier detection on four per-cell QC metrics.
# .copy() gives an independent frame, so adding the 'Category' column below does not
# write into a slice of crypto_2.obs (which triggers SettingWithCopyWarning).
df = crypto_2.obs[['n_genes_by_counts', 'total_counts',
                   'perc_mito', 'pct_counts_in_top_50_genes']].copy()

# Standardise every metric column-wise before PCA and the isolation forest.
df2 = scale(df, axis=0)

# Two principal components of the standardised metrics, used only for the plot.
pca = PCA(n_components=2)
Y = pca.fit_transform(df2)

# Isolation forest with a fixed seed; contamination=.1 sets the expected outlier share.
clf = IsolationForest(random_state=0, contamination=.1)
pred = clf.fit_predict(df2)

# fit_predict labels outliers as -1 and inliers as +1. Pinning the categories makes the
# label mapping explicit and keeps it valid even if only one of the two values occurs.
pred = pd.Categorical(pred, categories=[-1, 1]).rename_categories(['Outlier', 'Cell'])
df['Category'] = pred

# Plot from a single data frame with keyword arguments: positional x/y vectors are
# rejected by newer seaborn releases. hue_norm=(0, 100) was dropped because total counts
# are already filtered to be far above 100, which painted every point with the same
# colour; the default normalisation spans the observed range instead.
sns.scatterplot(data=df.assign(PC1=Y[:, 0], PC2=Y[:, 1]),
                x='PC1', y='PC2',
                hue='total_counts', size='n_genes_by_counts', style='Category',
                palette="Blues", sizes=(20, 200))

In [None]:
# Display the per-cell 'Outlier' / 'Cell' labels produced by the isolation forest.
pred

In [None]:
# Keep only the observations labelled 'Cell' by the outlier detection step.
# NOTE(review): running this cell a second time without recomputing `pred` would
# index the already-reduced object with a mask of the old length.
is_cell = pred == 'Cell'
crypto_2 = crypto_2[is_cell].copy()

In [None]:
# Display the object summary after removing the cells labelled as outliers.
crypto_2

In [None]:
# Doublet detection with Scrublet, seeded for reproducibility.
# The 'doublet_score' and 'predicted_doublet' columns read by the following cells are
# presumably written to crypto_2.obs by this call -- verify against the scanpy docs.
sc.external.pp.scrublet(
    crypto_2,
    expected_doublet_rate=0.06,
    threshold=0.2,
    random_state=12345,
)

In [None]:
# Distribution of the per-cell doublet scores.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
# replace it) -- switch once the seaborn version of this environment is confirmed.
sns.distplot(crypto_2.obs.doublet_score)

In [None]:
# Keep only the cells that were not flagged as doublets.
is_singlet = ~crypto_2.obs['predicted_doublet']
crypto_2 = crypto_2[is_singlet].copy()

In [None]:
# Per-cell normalisation (called with scanpy's default settings) followed by log1p.
# After each step the current matrix is snapshotted into its own layer:
# 'umi_tpm' after normalisation, 'umi_log' after the log transform.
for transform, layer_name in (
    (sc.pp.normalize_per_cell, 'umi_tpm'),
    (sc.pp.log1p, 'umi_log'),
):
    transform(crypto_2)
    crypto_2.layers[layer_name] = crypto_2.X.copy()

# Flag the 15000 most variable genes.
sc.pp.highly_variable_genes(crypto_2, n_top_genes=15000)

# Save the filtered, log-normalised object to disk before the matrix is standardised.
crypto_2.write('../../../Data/notebooks_data/crypto_2.filt.h5ad')

# Standardise gene expression and snapshot the result as the 'umi_gauss' layer.
sc.pp.scale(crypto_2)
crypto_2.layers['umi_gauss'] = crypto_2.X.copy()

# PCA with the ARPACK solver and a fixed seed.
sc.pp.pca(crypto_2, svd_solver='arpack', random_state=12345)

# NOTE(review): a bare expression is only rendered when it is the last statement of a
# cell, so this line shows nothing if everything here runs as a single cell.
crypto_2

# Variance explained by each principal component.
sc.pl.pca_variance_ratio(crypto_2)

In [None]:
# Use a smaller default figure size for the plots that follow.
plt.rcParams.update({'figure.figsize': (6, 6)})

# PCA embedding coloured by total counts and by SYCP1 (presumably a gene in var_names).
sc.pl.pca(crypto_2, color=['total_counts', 'SYCP1'])

# Neighbourhood graph built on 15 principal components, then a seeded 2-D UMAP.
sc.pp.neighbors(crypto_2, n_pcs=15)
sc.tl.umap(crypto_2, n_components=2, random_state=54321)

# UMAP embedding coloured by total counts.
sc.pl.umap(crypto_2, color=['total_counts'])

In [None]:
# UMAP embedding coloured by the perc_mito column.
sc.pl.umap(
    crypto_2,
    color=['perc_mito'],
)