In [None]:
import pandas as pd
import os
import anndata as ad
import numpy as np
import scanpy as sc

In [None]:
# TMP: hard-coded, machine-specific locations (project checkout, shared data
# directory, and the RNA AnnData file relative to DATA_PATH).
PROJECT_PATH = '/home/fichtner/projects/footprintQTL'
DATA_PATH = '/omics/groups/OE0540/internal/projects/HCA_organoid_2/cemm_sabrina-20Jul2022/'
RNA_AD = 'outputs_allsamples/sabrina_allsamples_rna_final_after_atac.h5ad'

# The project's code directory has to be on sys.path before the helper import below.
import sys
sys.path.append(PROJECT_PATH + "/code")
from helpers.helpers import ct_format, get_anndata_coldata

# Reference donor set: unique 'donor_id' values taken from element 0 of what
# get_anndata_coldata returns for the RNA file; the table itself is dropped afterwards.
# NOTE(review): presumably element 0 is the per-cell metadata table — confirm against helpers.
cells_coldata = get_anndata_coldata(os.path.join(DATA_PATH, RNA_AD))[0]
dids_borgs = set(cells_coldata['donor_id'].unique().tolist())
del cells_coldata

In [None]:
# Donor IDs to exclude: these samples appear in QC but are missing from the
# other input files (reason unknown). Removed when `samples_set` is built.
excl = {'SAMEA2474458', 'SAMEA2555012'}

# Chromatin accessibility (phenotype)

In [None]:
# Load the chromatin-accessibility peak matrix (AnnData) and show its summary.
borgs_tile_mat = ad.read_h5ad(
    '../data/datasets/hca_brain-organoids_processed/chromatin_accessibility/peak-matrix_rna-qc-cells_norm-reads-in-tss.h5ad'
)
borgs_tile_mat

In [None]:
# Unique donor IDs present among the cells of the accessibility matrix.
dids_tiles = {donor for donor in borgs_tile_mat.obs['donor_id']}

In [None]:
# Sanity check: set sizes, then the donors unique to each side.
print(
    f'len dids_tiles: {len(dids_tiles)}',
    f'len dids_borgs: {len(dids_borgs)}',
    sep='\n',
)
print(dids_tiles.difference(dids_borgs))
print(dids_borgs.difference(dids_tiles))

## Pilot: Midbrain EN & DRD4

### Peak locations

In [None]:
# Pilot subset: cells labelled 'Midbrain EN' (rows) and peaks whose nearest
# gene is 'DRD4' (columns).
ph_pilot = borgs_tile_mat[
    borgs_tile_mat.obs['celltype'] == 'Midbrain EN',
    borgs_tile_mat.var['nearest_gene'] == 'DRD4',
]
ph_pilot

In [None]:
# Inspect the per-cell metadata (obs) of the pilot subset.
ph_pilot.obs

In [None]:
# Inspect the per-peak metadata (var) of the pilot subset.
ph_pilot.var

In [None]:
# Write the pilot peaks' name and coordinates as a TSV without the index column.
ph_pilot.var.loc[:, ['peak_name', 'chr', 'start', 'end']].to_csv(
    "../data/datasets/hca_brain-organoids_processed/covariates/peak_locations.tsv",
    sep='\t',
    index=False,
)

### Peak CA values

In [None]:
# Cell x peak DataFrame of the pilot subset, with columns relabelled from the
# var index to the corresponding 'peak_name'.
ph_pilot_df = (
    ph_pilot.to_df()
    .copy()
    .rename(columns=ph_pilot.var['peak_name'].to_dict())
)
ph_pilot_df

In [None]:
# Attach each cell's donor_id (matched on the cell index), then release the
# AnnData subset since only the DataFrame is used from here on.
ph_pilot_df = ph_pilot_df.join(ph_pilot.obs[['donor_id']], how='inner')

del ph_pilot

ph_pilot_df

In [None]:
# Average accessibility over the cells of each donor, then flip to a
# peak (rows) x donor (columns) layout with the row axis named 'peak_id'.
ph_pilot_df = (
    ph_pilot_df
    .groupby('donor_id')
    .mean()
    .transpose()
    .rename_axis('peak_id')
)
ph_pilot_df

In [None]:
# Donors retained for every exported table: all donors present in the pilot
# matrix minus the excluded ones. Sorted so that the donor order is
# deterministic and identical in every file written from `samples_set`
# (a plain set -> list conversion has no guaranteed order).
samples_set = sorted(set(ph_pilot_df.columns) - excl)

# After the transpose, donors are the COLUMNS (rows are peaks), so the donor
# subset has to be taken along the column axis; selecting rows by donor ID
# raises a KeyError.
ph_pilot_df = ph_pilot_df.loc[:, samples_set]
ph_pilot_df

In [None]:
# Export the donor-aggregated peak matrix, rounded to 6 decimals, as TSV.
ph_pilot_df.round(decimals=6).to_csv(
    "../data/datasets/hca_brain-organoids_processed/chromatin_accessibility/peak-matrix_rna-qc-cells_norm-reads-in-tss.tsv",
    sep='\t',
)

### CA PCs

In [None]:
# Wrap the donor (obs) x peak (var) matrix in an AnnData object so scanpy can
# run PCA on it; obs/var names are set explicitly from the frame's labels.
grouped_ad = ad.AnnData(ph_pilot_df.transpose())
grouped_ad.obs_names = list(ph_pilot_df.columns)
grouped_ad.var_names = list(ph_pilot_df.index)

# The DataFrame is no longer needed once the AnnData exists.
del ph_pilot_df

grouped_ad

In [None]:
# Run scanpy's PCA on the donor-level matrix with default settings; the
# component scores are read from grouped_ad.obsm['X_pca'] afterwards.
sc.pp.pca(grouped_ad)

In [None]:
# Chromatin-accessibility PCs used as covariates: keep at most the first 20.
# PCA can return fewer than 20 components when the matrix is small (few
# donors or few peaks), so the slice width and the labels are derived from
# the score array itself rather than assuming exactly 20 columns.
pca_scores = grouped_ad.obsm['X_pca']
n_ca_pcs = min(20, pca_scores.shape[1])

ca_pcs = pd.DataFrame(
    pca_scores[:, :n_ca_pcs],
    index=grouped_ad.obs_names,
    columns=[f'caPC_{i}' for i in range(1, n_ca_pcs + 1)],
)

# Covariate layout: one row per PC (row axis named 'id'), one column per donor.
ca_pcs = ca_pcs.T.rename_axis('id')
ca_pcs[samples_set].to_csv("../data/datasets/hca_brain-organoids_processed/covariates/ca_pcs.tsv", sep='\t')

del pca_scores
ca_pcs

In [None]:
# Release the PCA inputs/outputs; they have been written to disk.
del grouped_ad, ca_pcs

In [None]:
# Close the file handle associated with the AnnData object.
# NOTE(review): the matrix was read with ad.read_h5ad without `backed=`, and
# `borgs_tile_mat` is still used in later cells — confirm this call is needed.
borgs_tile_mat.file.close()

## Filter features

In [None]:
# Restrict the matrix to 'Midbrain EN' cells, keeping every peak.
borgs_men = borgs_tile_mat[
    borgs_tile_mat.obs['celltype'] == 'Midbrain EN',
    :,
]
borgs_men

In [None]:
# Make df and aggregate across cells of the same donor:
# cell x peak frame -> columns relabelled to 'peak_name' -> donor_id attached
# per cell -> mean per donor -> donor x peak frame with the row axis named 'donor'.
borgs_men_df = (
    borgs_men.to_df()
    .copy()
    .rename(columns=borgs_men.var['peak_name'].to_dict())
    .join(borgs_men.obs['donor_id'], how='inner')
    .groupby('donor_id')
    .mean()
    .rename_axis('donor')
)

# Keep only the retained donors. Donors are the row index of the aggregated
# frame, so subset the DataFrame rows; indexing the AnnData object with donor
# IDs would discard the aggregation (its obs names are cells, not donors).
borgs_men_df = borgs_men_df.loc[samples_set]

del borgs_men

borgs_men_df

In [None]:
# Number of non-sparse peaks

def _report_nonsparse_peaks(donor_by_peak, min_frac):
    """Print and return the number of peaks that are non-zero in more than
    `min_frac` of the rows of `donor_by_peak`.

    Parameters
    ----------
    donor_by_peak : pandas.DataFrame
        One column per peak; a peak counts as "present" in a row when its
        value is not 0.
    min_frac : float
        A peak is kept when its fraction of non-zero rows is strictly greater
        than this threshold.

    Returns
    -------
    int
        Number of peaks passing the threshold.
    """
    n_total = len(donor_by_peak.columns)
    n_kept = int(((donor_by_peak != 0).mean() > min_frac).sum())
    # Guard against an empty frame so the report never divides by zero.
    proportion = n_kept / n_total if n_total else float('nan')
    # One item per line: adjacent f-strings are concatenated without any
    # separator, so each piece carries its own newline.
    print(f'Threshold: {min_frac:.2f}\n'
          f'Total nr peaks: {n_total}\n'
          f'Filtered nr peaks: {n_kept}\n'
          f'Proportion: {proportion:.2f}')
    return n_kept


tot = len(borgs_men_df.columns)

thresh = 0.05
npeaks_005 = _report_nonsparse_peaks(borgs_men_df, thresh)

thresh = 0.10
npeaks_010 = _report_nonsparse_peaks(borgs_men_df, thresh)

In [None]:
# Release the donor-aggregated Midbrain EN frame; it is not used after this point.
del borgs_men_df

In [None]:
# Density of CA: fraction of non-zero entries in the Midbrain EN cell x peak matrix.

# `borgs_men` is deleted once the per-donor aggregation is done, so re-derive
# the Midbrain EN subset from the full matrix instead of relying on it.
men_X = borgs_tile_mat[borgs_tile_mat.obs['celltype'] == 'Midbrain EN', :].X

# Sparse matrices provide .count_nonzero(); dense arrays need the NumPy function.
if hasattr(men_X, 'count_nonzero'):
    non0 = men_X.count_nonzero()
else:
    non0 = np.count_nonzero(men_X)

# np.product was removed in NumPy 2.0; np.prod is the supported name.
all_ = np.prod(men_X.shape)
print(f'Density: {non0 / all_}')

del men_X

# Genotype

In [None]:
# Genotype table: first column becomes the index (named 'snp_id'), and every
# index label is prefixed with 'chr'.
gt = (
    pd.read_csv(
        '../data/datasets/hca_brain-organoids_processed/covariates/genotype.tsv',
        sep='\t',
        header=0,
        index_col=0,
    )
    .rename_axis('snp_id')
    .rename(index=lambda snp: 'chr' + str(snp))
)
gt

In [None]:
# Donor IDs available in the genotype table (its column labels).
dids_genotypes = {donor for donor in gt.columns.tolist()}

In [None]:
# Sanity check: set sizes, then the donors unique to each side.
print(
    f'len dids_genotypes: {len(dids_genotypes)}',
    f'len dids_borgs: {len(dids_borgs)}',
    sep='\n',
)
print(dids_genotypes.difference(dids_borgs))
print(dids_borgs.difference(dids_genotypes))

In [None]:
# Remember the SNP IDs of the genotype table (its index); used later to
# subset the SNP location table to the same SNPs.
snps_set = gt.index

In [None]:
# Keep only the retained donors (columns) and export, writing missing
# genotypes as the literal 'NaN'.
gt = gt.loc[:, samples_set]
gt.to_csv(
    '../data/datasets/hca_brain-organoids_processed/covariates/genotype_NA.tsv',
    sep='\t',
    na_rep='NaN',
)

# Covariates

## Population structure

In [None]:
# Genotype PCs: read with the first column as index, then transpose and name
# the row axis 'id' for the covariate file.
gt_pcs = pd.read_csv(
    "/omics/groups/OE0540/internal/projects/HCA_organoid_2/cemm_sabrina-20Jul2022/Genotypes/GT_PCs.tsv",
    sep="\t",
    index_col=0,
)
gt_pcs_matrixQTL = gt_pcs.transpose().rename_axis('id')

In [None]:
# Inspect the transposed genotype-PC table before export.
gt_pcs_matrixQTL

In [None]:
# Output directory for covariate files; create it (and any missing parents)
# if it is not there yet.
parent_dir = "/home/fichtner/projects/footprintQTL/data/datasets/hca_brain-organoids_processed/covariates/"

os.makedirs(parent_dir, exist_ok=True)

In [None]:
# Export the genotype PCs for the retained donors (columns) as TSV.
gt_pcs_matrixQTL.loc[:, samples_set].to_csv(
    "/home/fichtner/projects/footprintQTL/data/datasets/hca_brain-organoids_processed/covariates/genotype_pcs.tsv",
    sep='\t',
)

In [None]:
# Donor IDs available in the genotype-PC table (its column labels).
dids_cov = {donor for donor in gt_pcs_matrixQTL.columns.tolist()}

In [None]:
# Sanity check: set sizes, then the donors unique to each side.
print(
    f'len dids_cov: {len(dids_cov)}',
    f'len dids_borgs: {len(dids_borgs)}',
    sep='\n',
)
print(dids_cov.difference(dids_borgs))
print(dids_borgs.difference(dids_cov))

In [None]:
# Release both genotype-PC tables; the export is done.
del gt_pcs, gt_pcs_matrixQTL

## SNP locations

In [None]:
# SNP location table built from the VCF body.
# Only the first three columns (CHROM, POS, ID) are used, so read just those
# instead of naming every per-sample genotype column: this keeps the cell
# independent of how many samples the VCF contains and avoids loading the
# genotype fields at all.
snps = pd.read_csv(
    "../data/datasets/hca_brain-organoids_processed/covariates/genotype.vcf",
    sep="\t",
    comment="#",       # lines starting with '#' (VCF meta/header lines) are skipped
    header=None,
    usecols=[0, 1, 2],
    names=["CHROM", "POS", "ID"],
    index_col=False,
)

# Reorder/rename to (snp_id, chr, pos) and index the table by SNP ID.
snps = (
    snps[['ID', 'CHROM', 'POS']]
    .rename(columns={'ID': 'snp_id', 'CHROM': 'chr', 'POS': 'pos'})
    .set_index('snp_id')
)

In [None]:
# Inspect the SNP location table (indexed by snp_id, columns chr and pos).
snps

In [None]:
# Restrict to the SNPs of the genotype table (same labels, same order) and export.
snps = snps.loc[snps_set, :]
snps.to_csv(
    "../data/datasets/hca_brain-organoids_processed/covariates/snp_locations.tsv",
    sep='\t',
)

In [None]:
# Release the SNP location table; it has been written to disk.
del snps