### Notes
1. Pulled the top 5 terms for each of the post-transplant gene sets.
2. Calculated the overlap fraction for each term; if a term was not in the top five of a given gene set, its overlap was set to 0.

In [1]:
import glob
import os
import datetime
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
import random
import seaborn as sns
import time
import numpy as np

### Output Directory

In [3]:
# Record today's date for this run and echo it (ISO format, YYYY-MM-DD).
date = datetime.date.today()
print(date.isoformat())

2022-12-16


In [2]:
# Directory holding the Enrichr result tables; the combined matrix is also
# written here. The trailing slash matters: later cells build paths by plain
# string concatenation (f'{output_dir}...').
# Plain string literal: there are no placeholders, so no f-prefix is needed.
output_dir = '/proj/dllab/jfoster/serody_project/results/Fig2_mILC2_mILC2-GFP_scMultiome_analysis_RNA_Integration/enrichrData/'
output_dir

'/proj/dllab/jfoster/serody_project/results/Fig2_mILC2_mILC2-GFP_scMultiome_analysis_RNA_Integration/enrichrData/'

### Read in files

In [10]:
# Collect the Enrichr result tables from the output directory.
# The combined matrix this notebook saves later ("ILC2_enrichr_data.txt")
# lands in the same directory and also matches *.txt, so it is excluded here;
# otherwise a re-run would read the notebook's own output as an input table.
# Sorting makes the file order (and therefore the cluster order) reproducible,
# since glob returns matches in arbitrary order.
enrichr_files = sorted(
    path
    for path in glob.glob(f'{output_dir}*.txt')
    if os.path.basename(path) != 'ILC2_enrichr_data.txt'
)


In [11]:
# Derive a cluster label from each file name: take the text after the last
# underscore of the base name and keep what precedes the first dot.
clust_num = [
    os.path.basename(path).rsplit("_", 1)[-1].partition(".")[0]
    for path in enrichr_files
]

# Prefix every extracted suffix with "cluster" to form the final label.
clusterNames = ["cluster" + suffix for suffix in clust_num]


In [12]:
# Load each Enrichr result table (tab-separated) into its own DataFrame.
enrichr_data = [pd.read_csv(file, sep='\t') for file in enrichr_files]

# Annotate each table with its cluster label. clusterNames was derived from
# enrichr_files in the same order, so zip pairs each table with its own label.
enrichr_data = [df.assign(cluster=cluster) for df, cluster in zip(enrichr_data, clusterNames)]

# Keep the first five rows of each table (the top five terms, assuming the
# Enrichr export is already ranked -- TODO confirm) and only the columns used
# downstream. .copy() makes each piece an independent frame, not a slice.
enrichr_data = [
    df.head(5)[['Term', 'Overlap', 'Adjusted P-value', 'cluster']].copy()
    for df in enrichr_data
]

# Convert the "k/n" Overlap strings into the fraction k / n.
overlap_pct = [
    df['Overlap'].apply(lambda x: int(x.split("/")[0]) / int(x.split("/")[1]))
    for df in enrichr_data
]

# assign() builds new frames instead of writing a column into a sliced frame,
# which avoids pandas' SettingWithCopyWarning.
enrichr_data = [df.assign(Overlap=overlap) for df, overlap in zip(enrichr_data, overlap_pct)]

# Stack the per-cluster tables into one long table with a fresh 0..n-1 index.
enrichr_data = pd.concat(enrichr_data, ignore_index=True)

# Independent copy of the long table, kept for appending the overlap values
# to the final matrix after enrichr_data has been reshaped.
overlap_data = enrichr_data.copy()


In [13]:
# Display the long-format top-five table (Term, Overlap fraction, Adjusted P-value, cluster).
overlap_data

Unnamed: 0,Term,Overlap,Adjusted P-value,cluster
0,NK cells,0.089686,1.031673e-18,clusterPost1
1,thymocyte SP CD4+,0.103448,2.068119e-07,clusterPost1
2,T-cells foxP3+,0.133929,7.18507e-07,clusterPost1
3,follicular B-cells,0.058043,4.342972e-06,clusterPost1
4,lymph nodes,0.133333,1.155095e-05,clusterPost1
5,NK cells,0.089686,1.031673e-18,clusterPost2
6,thymocyte SP CD4+,0.103448,2.068119e-07,clusterPost2
7,T-cells foxP3+,0.133929,7.18507e-07,clusterPost2
8,follicular B-cells,0.058043,4.342972e-06,clusterPost2
9,lymph nodes,0.133333,1.155095e-05,clusterPost2


In [14]:
# Reshape long -> wide (one row per cluster, one column per Term) and turn the
# adjusted p-values into -log10 scores:
#   * a term missing from a cluster's top-five list is NaN after the pivot and
#     is filled with p = 1, i.e. a score of 0;
#   * negating log10(1) yields -0.0, which is normalised back to 0.
enrichr_data = (
    enrichr_data
    .pivot(index='cluster', columns='Term', values='Adjusted P-value')
    .fillna(1)
    .pipe(np.log10)
    .mul(-1)
    .replace(-0.0, 0)
)


In [15]:
# Reshape wide -> long: one row per (cluster, Term) pair, with the score in
# the 'value' column. 'cluster' is moved out of the index so it can serve as
# the identifier column.
enrichr_data = enrichr_data.reset_index().melt(
    id_vars='cluster',
    value_vars=list(enrichr_data.columns),
)

In [16]:
# Add the overlap fraction to the matrix.
# A left merge on (cluster, Term) replaces the nested row-by-row iterrows scan:
# every row of enrichr_data is kept in its original order, and (cluster, Term)
# pairs absent from overlap_data (terms outside that cluster's top five) come
# back as NaN and are set to 0.0.
# Any 'overlap' column left over from a previous run of this cell is dropped
# first so the merge does not produce overlap_x / overlap_y columns.
enrichr_data = enrichr_data.drop(columns='overlap', errors='ignore').merge(
    overlap_data[['cluster', 'Term', 'Overlap']].rename(columns={'Overlap': 'overlap'}),
    on=['cluster', 'Term'],
    how='left',
)
enrichr_data['overlap'] = enrichr_data['overlap'].fillna(0.0)


In [17]:
# Display the final long-format matrix (cluster, Term, value, overlap).
enrichr_data

Unnamed: 0,cluster,Term,value,overlap
0,clusterPost1,NK cells,17.986458,0.089686
1,clusterPost2,NK cells,17.986458,0.089686
2,clusterPost3,NK cells,20.02711,0.095665
3,clusterPost1,T-cells CD8+,0.0,0.0
4,clusterPost2,T-cells CD8+,0.0,0.0
5,clusterPost3,T-cells CD8+,5.226038,0.10241
6,clusterPost1,T-cells foxP3+,6.143569,0.133929
7,clusterPost2,T-cells foxP3+,6.143569,0.133929
8,clusterPost3,T-cells foxP3+,9.417725,0.169643
9,clusterPost1,follicular B-cells,5.362213,0.058043


In [130]:
# Write the final matrix into the output directory as a tab-separated file,
# leaving out the DataFrame's row index.
enrichr_data.to_csv(
    f"{output_dir}ILC2_enrichr_data.txt",
    sep="\t",
    index=False,
)