In [12]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import scvi
import math
import matplotlib.pyplot as plt
import scipy.sparse
from glob import glob
from sklearn.preprocessing import OneHotEncoder
import random
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [13]:
# Scanpy session setup.
# Verbosity levels: 0 = errors, 1 = warnings, 2 = info, 3 = hints.
sc.settings.verbosity = 3
# Print the versions of scanpy and its dependencies for provenance.
sc.logging.print_header()
# White-background figures at 80 dpi.
sc.settings.set_figure_params(facecolor='white', dpi=80)

scanpy==1.9.1 anndata==0.8.0 umap==0.5.2 numpy==1.21.6 scipy==1.9.0 pandas==1.4.3 scikit-learn==1.0.2 statsmodels==0.13.2 python-igraph==0.9.9 pynndescent==0.5.6


In [14]:
# Load the iPSC pseudobulk table (tab-separated); the "gene" column becomes the row index.
pseudobulk_sum = pd.read_csv(
    "/project2/gilad/jpopp/ebQTL/data/benchmark_static_qtl_calling/ebqtl_ipsc/pseudobulk_tmm/ebqtl_ipsc.pseudobulk_tmm.tsv",
    index_col="gene",
    sep="\t",
)

In [15]:
# Display the loaded pseudobulk table (rows are genes, columns are "<individual>_IPSC" samples).
pseudobulk_sum

Unnamed: 0_level_0,18486_IPSC,18489_IPSC,18498_IPSC,18499_IPSC,18501_IPSC,18502_IPSC,18505_IPSC,18507_IPSC,18508_IPSC,18511_IPSC,...,19160_IPSC,19190_IPSC,19193_IPSC,19203_IPSC,19204_IPSC,19206_IPSC,19207_IPSC,19210_IPSC,19225_IPSC,19257_IPSC
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MIR1302-2HG,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
OR4F5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,50.0,14.0,7.0,0.0,5.0,20.0,4.0,4.0,9.0,0.0,...,1.0,0.0,64.0,0.0,26.0,1.0,49.0,40.0,16.0,0.0
AL627309.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC136616.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AC023491.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AC007325.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AC007325.4,153.0,47.0,31.0,28.0,71.0,328.0,8.0,34.0,286.0,4.0,...,13.0,0.0,302.0,0.0,239.0,3.0,143.0,258.0,52.0,12.0


In [16]:
# Load the per-sample summary (tab-separated); the first column is used as the row index.
# Later cells read its 'individual' and 'dropped' columns.
cell_counts_filtered = pd.read_csv(
    "/project2/gilad/jpopp/ebQTL/data/benchmark_static_qtl_calling/ebqtl_ipsc/pseudobulk_tmm/sample_summary.tsv",
    index_col=0,
    sep="\t",
)

## Subsample to 30 individuals

In [17]:
# Randomly pick which currently-retained individuals to drop so that exactly
# N_SUBSAMPLE individuals remain.
N_SUBSAMPLE = 30      # number of individuals to keep after subsampling
SUBSAMPLE_SEED = 0    # fixed seed so Restart & Run All selects the same individuals

all_individuals = list(cell_counts_filtered['individual'])

# Build one individual -> dropped lookup instead of re-filtering the whole frame
# for every individual. drop_duplicates keeps the first row per individual,
# which is the row the previous `.values[0]` lookup read.
_first_rows = cell_counts_filtered.drop_duplicates(subset='individual')
_dropped_by_individual = dict(zip(_first_rows['individual'], _first_rows['dropped']))
kept_individuals = [i for i in all_individuals if not _dropped_by_individual[i]]

# Derive the number of extra drops from the data rather than from a hard-coded
# total of 53 samples, so the cell stays correct if the input table changes.
n_new_drops = len(kept_individuals) - N_SUBSAMPLE
if n_new_drops < 0:
    raise ValueError(
        f"Cannot subsample to {N_SUBSAMPLE} individuals: "
        f"only {len(kept_individuals)} are retained in the input."
    )

# A dedicated, seeded generator makes the draw reproducible without touching
# the global `random` state used elsewhere.
new_drops = random.Random(SUBSAMPLE_SEED).sample(kept_individuals, n_new_drops)

In [18]:
# Copy the original summary and mark an individual as dropped if it was already
# dropped upstream OR was selected in `new_drops`.
sample_summary = cell_counts_filtered.copy()

# Vectorized replacement for the per-row lookup that re-filtered the whole frame
# once per individual. Assumes one row per individual in the summary table.
_newly_dropped = sample_summary['individual'].isin(new_drops)
_previously_dropped = sample_summary['dropped'].astype('bool')

# Assign positionally (as the original list assignment did) and keep a bool dtype.
sample_summary['dropped'] = (_newly_dropped | _previously_dropped).to_numpy()

In [19]:
# Individuals that are not flagged as dropped in `sample_summary`, in the order of
# `all_individuals`. The flag is read from the first matching row.
inds_kept = [
    ind
    for ind in all_individuals
    if not sample_summary.loc[sample_summary['individual'] == ind, 'dropped'].values[0]
]

# Pseudobulk column labels have the form "<individual>_IPSC".
cols_kept = list(map(lambda ind: str(ind) + "_IPSC", inds_kept))

In [20]:
# Keep all genes (rows) but only the columns of the retained individuals,
# in the order given by `cols_kept`.
pseudobulk_subsampled = pseudobulk_sum.loc[:, cols_kept]

# Display the subsampled table.
pseudobulk_subsampled

Unnamed: 0_level_0,18907_IPSC,18858_IPSC,19153_IPSC,19144_IPSC,18913_IPSC,19210_IPSC,19193_IPSC,19093_IPSC,19159_IPSC,19207_IPSC,...,18508_IPSC,18511_IPSC,18505_IPSC,18501_IPSC,18516_IPSC,18522_IPSC,19099_IPSC,19257_IPSC,19152_IPSC,18870_IPSC
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MIR1302-2HG,7.0,17.0,14.0,4.0,7.0,1.0,4.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR4F5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.1,284.0,259.0,142.0,238.0,140.0,40.0,64.0,52.0,30.0,49.0,...,9.0,0.0,4.0,5.0,2.0,1.0,0.0,0.0,0.0,1.0
AL627309.3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC136616.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AC023491.2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AC007325.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AC007325.4,1879.0,1343.0,1424.0,1345.0,472.0,258.0,302.0,206.0,115.0,143.0,...,286.0,4.0,8.0,71.0,66.0,17.0,3.0,12.0,4.0,1.0


In [21]:
# Write the subsampled pseudobulk table as TSV; the index column is labelled "gene".
pseudobulk_subsampled.to_csv(
    "/project2/gilad/jpopp/ebQTL/data/benchmark_static_qtl_calling/ebqtl_30sub/pseudobulk_tmm/ebqtl_30sub.pseudobulk_tmm.tsv",
    index_label="gene",
    sep="\t",
)

In [22]:
# Write the updated sample summary (with the new 'dropped' flags) as TSV
# into the 30-individual subsample directory.
sample_summary.to_csv(
    "/project2/gilad/jpopp/ebQTL/data/benchmark_static_qtl_calling/ebqtl_30sub/pseudobulk_tmm/sample_summary.tsv",
    sep="\t",
)