In [25]:
import scvelo as scv
import loompy
import igraph as ig
from anndata import AnnData as ad
import pandas as pd
import re
import scanpy as sc 
import matplotlib.pyplot as plt 
import numpy as np 
import scipy as sp

# Load the AnnData object whose per-cell metadata (`adata.obs`) is aggregated in the cells below.
ADATA_PATH = "/dkfz/groups/OE0540/users/ruehle/rnavelocity/velocyto/scvelo_output/use/dynamical/filteredpoordiff_cpmtop2000DE_pca3harmony_celltypes.h5ad"
adata = sc.read_h5ad(ADATA_PATH)


In [6]:
# Mean aggregation of 'velocity_length' per (run, donor) for one day of interest (doi).
# Run this cell once per day by changing `doi`; the subset and the output path both follow it.

doi = "day2"
out_dir = f"/omics/groups/OE0540/internal/users/ruehle/rnavelocity/vqtls/data/velocyto/deterministic_length/top2000DE/input_{doi}"
obs_columns = ['donor', 'run', 'velocity_length']

# Per-timepoint metadata tables.
# NOTE(review): day0/day1/day3 are not read by this cell; they are kept only in case
# cells outside this view rely on them -- confirm and drop if unused.
day0 = adata[adata.obs['timepoint'] == 'day0', :].obs[obs_columns]
day1 = adata[adata.obs['timepoint'] == 'day1', :].obs[obs_columns]
day3 = adata[adata.obs['timepoint'] == 'day3', :].obs[obs_columns]

# Cells belonging to the day of interest (previously hard-coded to 'day2' regardless of `doi`).
doi_obs = adata[adata.obs['timepoint'] == doi, :].obs[obs_columns]

# Runs present on that day, de-duplicated in first-seen order (dict keys keep insertion order).
# The name `exp_day2` is kept because the covariate cell reads it.
exp_day2 = list(dict.fromkeys(doi_obs['run'].tolist()))

# Mean velocity length per (run, donor); groups whose mean is NaN are dropped
# (e.g. run/donor combinations that contain no cells).
grouped_df = doi_obs.groupby(["run", "donor"]).mean()
aggregated = grouped_df[grouped_df['velocity_length'].notna()]

# Sample names for the aggregated data: "<donor>-<run>" (the group index is (run, donor)).
sample_ids = [f"{donor}-{run}" for run, donor in aggregated.index]

# Build the phenotype table as a fresh frame instead of assigning into a filtered slice,
# which avoids pandas' SettingWithCopyWarning. The name `day2` is kept because later
# cells read `day2` for the sample ids.
day2 = pd.DataFrame({
    'feature_id': sample_ids,
    'velocity_length': aggregated['velocity_length'].to_numpy(),
})

# Written transposed: first row = sample ids, second row = mean velocity length.
# The file is overwritten (not appended) so re-running the cell does not duplicate rows.
day2.T.to_csv(f'{out_dir}/pheno_file_pseudotime_{doi}.txt', header=False, index=True, sep='\t')

In [7]:
# Covariate file: one row per sample, a "sample_id" column, and one 0/1 indicator column per run.

# Derive the run columns without mutating `exp_day2`, so re-running this cell cannot
# add a second "sample_id" column (the filter also tolerates a list that was already
# extended by an earlier execution of the old cell).
run_columns = [run for run in exp_day2 if run != "sample_id"]

# Sample ids are built as "<donor>-<run>", so a sample belongs to a run exactly when its id
# ends with "-<run>". A plain substring test could also match a run id that happens to
# occur inside a donor name.
covariates = pd.DataFrame(
    {
        run: [int(str(sample).endswith(f"-{run}")) for sample in sample_ids]
        for run in run_columns
    },
    index=sample_ids,
)
covariates.insert(0, "sample_id", sample_ids)

# Overwrite rather than append: appending would write a second header line on every re-run.
covariates.to_csv(r'/omics/groups/OE0540/internal/users/ruehle/rnavelocity/vqtls/data/velocyto/deterministic_length/top2000DE/input_day2/covariance_pseudotime_day2.txt', header=True, index=False, sep='\t')

# Feature annotation file: a single row describing the 'velocity_length' feature.
# NOTE(review): chromosome/start/end = 1 and gene id 'BLUB' look like placeholder values -- confirm.
feature_anno_file = pd.DataFrame(
    [['velocity_length', 1, 1, 1, 'BLUB', '+']],
    columns=['feature_id', 'chromosome', 'start', 'end', 'ensembl_gene_id', 'feature_strand'],
)

# The column names are written as the header line, so a fresh file has the same two lines
# as before. The file is overwritten rather than appended: this path is shared across days,
# and appending added a duplicate header + row on every run.
feature_anno_file.to_csv(r'/omics/groups/OE0540/internal/users/ruehle/rnavelocity/vqtls/data/velocyto/deterministic_length/feature_anno_file_pc_pseudotime.txt', header=True, index=False, sep='\t')

Unnamed: 0,0,1,2,3,4,5
0,feature_id,chromosome,start,end,ensembl_gene_id,feature_strand
1,velocity_length,1,1,1,BLUB,+


In [8]:
# Link file (sample mapping file): maps genotype sample names to the aggregated sample ids.

donormeta = pd.read_csv('/icgc/dkfzlsdf/analysis/B260/projects/HipSci/openAccess/endoderm_differentation/metadata/HipSci_donor_annotation.tsv', sep = ';', header = 0)

# (DonorLine, Sample2) pairs from the donor annotation. Non-string DonorLine values
# (e.g. missing entries) are skipped because they cannot be substring-tested.
donor_lines = [
    (line, genotype_sample)
    for line, genotype_sample in zip(donormeta['DonorLine'], donormeta['Sample2'])
    if isinstance(line, str)
]


def _genotype_sample_for(sample_id, default="zero"):
    """Return the Sample2 value of the last donor line whose name occurs in `sample_id`.

    Returns `default` when no DonorLine is a substring of `sample_id`.
    """
    match = default
    for line, genotype_sample in donor_lines:
        if line in sample_id:
            match = genotype_sample
    return match


# Columns are computed up front and assigned as whole columns. Writing to the row objects
# yielded by DataFrame.iterrows() is not guaranteed to modify the frame, so the previous
# loops could silently leave 'sample_map' at "zero" and 'donor' at "donor".
sample_id_text = day2['feature_id'].astype(str)
subdf = pd.DataFrame({
    'sample_id': day2['feature_id'],
    'sample_map': sample_id_text.map(_genotype_sample_for),
    # Sample ids are "<donor>-<run>": strip the trailing "-<run>" instead of assuming
    # the run id is exactly five characters long.
    'donor': sample_id_text.str.rsplit("-", n=1).str[0],
})

# Surface samples that kept the "zero" placeholder so they are not written unnoticed.
unmatched = subdf.loc[subdf['sample_map'] == "zero", 'sample_id'].tolist()
if unmatched:
    print(f"WARNING: no DonorLine match for {len(unmatched)} sample(s): {unmatched}")

final = subdf[['sample_map', 'sample_id']]

# Overwrite rather than append, so re-running the cell does not duplicate mapping rows.
final.to_csv(r'/omics/groups/OE0540/internal/users/ruehle/rnavelocity/vqtls/data/velocyto/deterministic_length/top2000DE/input_day2/sample_mapping_file_pseudotime_day2.txt', header=False, index=False, sep='\t')