In [1]:
%matplotlib inline

import pandas as pd
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('../../Code/')
from utils import read_dataset_log
from matplotlib import rcParams, font_manager
import matplotlib

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# rcParams['pdf.fonttype'] = 42
# rcParams['ps.fonttype'] = 42
# rcParams['font.family'] = 'sans-serif'
# rcParams['font.sans-serif'] = ['Arial']

In [2]:
# Load the 'cGEP_Name' log and index it by the raw cGEP identifier column.
cgep_info = read_dataset_log('cGEP_Name')
cgep_info.index = cgep_info['cGep_Name']

# Raw cGEP identifier -> short name, built before any rows are dropped.
rename_map = cgep_info['Short_Name'].to_dict()

# Keep rows whose 'Filter' cell is empty, restrict to the summary columns,
# and drop rows that are missing any of those values.
summary_cols = ['Short_Name', 'Shortest_Name', 'Long_Name', 'Class', 'Num_Datasets']
not_filtered = cgep_info['Filter'].isnull()
cgep_info = cgep_info.loc[not_filtered, summary_cols].dropna()

# Cast the dataset count to int, then order by class and count (both descending).
cgep_info = (
    cgep_info
    .assign(Num_Datasets=cgep_info['Num_Datasets'].astype(int))
    .sort_values(by=['Class', 'Num_Datasets'], ascending=[False, False])
)

In [3]:
# Row 0 of the 'cGEP Paths' log supplies the file locations used in this notebook.
cgep_paths = read_dataset_log('cGEP Paths').loc[0, :]

# Read the tab-separated spectra scores and transpose them.
# NOTE(review): presumably rows are genes and columns are cGEPs after the
# transpose -- confirm against the file layout.
spectra_scores = pd.read_csv(cgep_paths['scores_spectra'], sep='\t', index_col=0).T

# For every column, join the three highest-scoring row labels into one string.
topgenes = pd.Series({
    gep: ', '.join(spectra_scores[gep].sort_values(ascending=False).index[:3])
    for gep in spectra_scores.columns
})

# Attach the strings to the summary table; .loc raises KeyError if a cGEP in
# cgep_info has no column in the spectra table.
cgep_info['Top Genes'] = topgenes.loc[cgep_info.index]

In [4]:
# Datasets whose columns are turned into presence flags.
dataset_cols = ['TBRU', 'UK-Covid', 'COMBAT', 'HIV-Vaccine', 'Pan-Tissue', 'Pan-Cancer', 'AMP-RA']

# A non-null cell in the clustering table is treated as "found in that dataset".
clustering = pd.read_csv(cgep_paths['clustering_fn'], sep='\t', index_col=0)
dataset_presence = clustering[dataset_cols].notnull()

# Left join on the index: every cgep_info row is kept, and rows with no match
# in the clustering table get NaN in the dataset columns.
cgep_info = cgep_info.merge(dataset_presence, how='left', left_index=True, right_index=True)

In [6]:
# Relabel the columns positionally with (group, label) tuples: six descriptive
# columns under 'GEP Info', then seven dataset flags under 'Found in Dataset'.
# The assignment relies on cgep_info having exactly these 13 columns in this order.
# NOTE(review): 'Long_Name' keeps its underscore, unlike the sibling labels --
# confirm whether that is intentional for the exported table.
gep_info_labels = ['Short Name', 'Abbreviated Name', 'Long_Name', 'GEP Class',
                   '# Datasets Found', 'Top Genes']
found_in_labels = ['TBRU', 'UK-Covid', 'COMBAT', 'HIV-Vaccine',
                   'Pan-Tissue', 'Pan-Cancer', 'AMP-RA']
cgep_info.columns = (
    [('GEP Info', label) for label in gep_info_labels]
    + [('Found in Dataset', label) for label in found_in_labels]
)

In [7]:
# Build a two-level MultiIndex header from the (group, label) column tuples.
column_tuples = list(cgep_info.columns)
cgep_info.columns = pd.MultiIndex.from_tuples(column_tuples)

In [8]:
# Preview the first five rows (head's default) of the relabelled table.
cgep_info.head(n=5)

Unnamed: 0_level_0,GEP Info,GEP Info,GEP Info,GEP Info,GEP Info,GEP Info,Found in Dataset,Found in Dataset,Found in Dataset,Found in Dataset,Found in Dataset,Found in Dataset,Found in Dataset
Unnamed: 0_level_1,Short Name,Abbreviated Name,Long_Name,GEP Class,# Datasets Found,Top Genes,TBRU,UK-Covid,COMBAT,HIV-Vaccine,Pan-Tissue,Pan-Cancer,AMP-RA
cGep_Name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
CD8_EM,CD8-EM,CD8-EM,CD8 Effector Memory,Lineage,7,"CMC1, GZMK, AB_CD244",True,True,True,True,True,True,True
MAIT,MAIT,MAIT,MAIT,Lineage,7,"SLC4A10, AB_TCR-V-7.2, KLRB1",True,True,True,True,True,True,True
Treg,Treg,Treg,Treg,Lineage,7,"FOXP3, AB_CD25, RTKN2",True,True,True,True,True,True,True
CD4-Naive_TBRU,CD4-Naive,CD4-Naive,CD4 Naive,Lineage,6,"CCR7, SELL, LEF1",True,True,False,True,True,True,True
KLRC2/GNLY_gdT,TEMRA,TEMRA,TEMRA,Lineage,6,"KIR2DL3, KLRC2, TYROBP",False,True,True,True,True,True,True


In [9]:
# Output path of the Excel summary table, relative to the notebook's working directory.
fn = '../../Tables/Table_S1_GEP_Summary.xlsx'

In [10]:
# Replace the index with 1-based row numbers (1..number of rows).
n_rows = len(cgep_info)
cgep_info.index = np.arange(n_rows) + 1

In [11]:
# Write the table to the Excel file at `fn`, including the index column.
cgep_info.to_excel(excel_writer=fn, index=True)