# Analysis of pcHi-C data from human CD4+ T cells and ILC3 cells

This notebook walks through the analysis of pcHi-C data generated by Mikhail Spivakov's group. In addition to the ILC3 data, we analyze CD4+ alpha-beta T cells as a positive control sample.

## Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from decimal import Decimal
import matplotlib.ticker as ticker


In [2]:
class ChicagoData(object):
    """Load, format and filter a CHiCAGO interaction table.

    The input is a tab-separated file with a header row. The code reads the
    columns ``baitChr``, ``baitStart``, ``baitEnd``, ``baitID``, ``baitName``,
    ``oeChr``, ``oeStart``, ``oeEnd``, ``oeID``, ``oeName`` and ``dist``, plus
    the optional score column named by ``score_col``.

    Attributes set during construction:
        input_df: The raw table exactly as read from disk.
        df: The formatted and filtered interaction table.
        bait_ID, oe_ID: Unique bait / other-end IDs (before filtering).
        bait_interval_ID, oe_interval_ID: Unique ``chr:start-end`` strings for
            baits / other ends (before filtering).
        pir_df: Unique other-end (PIR) intervals remaining after filtering.
        bait_df: Unique bait intervals remaining after filtering.
        unique_features: ``pir_df`` and ``bait_df`` stacked with the common
            columns ``Chr``, ``Start``, ``Stop``, ``ID``.
    """
    def __init__(self,
                 filename: str,
                 drop_off_target_bait: bool = True,
                 drop_off_target_oe: bool = True,
                 drop_trans_chrom: bool = True,
                 score_col: str = None,
                 score_val: int = 5,
                 remove_p2p: bool = True
                 ):
        """Read, format and filter the CHiCAGO file.

        Args:
            filename (str): Input CHiCAGO txt file (tab separated, with header).
            drop_off_target_bait (bool, optional): Drop rows whose ``baitName``
                is ``"off_target"``. Defaults to True.
            drop_off_target_oe (bool, optional): Drop rows whose ``oeName`` is
                ``"off_target"``. Defaults to True.
            drop_trans_chrom (bool, optional): Drop trans-chromosomal
                interactions (``dist`` of ``"."``, 0 or missing, or bait and
                other end on different chromosomes). Defaults to True.
            score_col (str, optional): Name of the score column to threshold.
                No score filtering is done when this is None. Defaults to None.
            score_val (int, optional): Minimum value (inclusive) required in
                ``score_col``. Defaults to 5.
            remove_p2p (bool, optional): Drop promoter-to-promoter
                interactions. Defaults to True.
        """
        self.filename = filename
        self.drop_off_target_bait = drop_off_target_bait
        self.drop_off_target_oe = drop_off_target_oe
        self.drop_trans_chrom = drop_trans_chrom
        self.score_col = score_col
        self.score_val = score_val
        self.remove_p2p = remove_p2p

        # The steps below depend on each other and must run in this order.
        self._read_file_()
        self._format_file_()
        self._filter_file_()
        self._get_PIR_df_()
        self._get_bait_df_()
        self._get_combined_df_()

    def _read_file_(self):
        """Read the original file into ``self.input_df``."""
        self.input_df = pd.read_csv(self.filename, sep="\t", header=0, low_memory=False)

    def _format_file_(self):
        """Add chromosome prefixes and interval / interaction ID columns."""
        # Work on a copy so the raw input stays untouched
        df = self.input_df.copy()

        # Prefix the chromosome names with "chr"
        df["baitChr"] = "chr" + df["baitChr"].apply(str)
        df["oeChr"] = "chr" + df["oeChr"].apply(str)

        # Interval IDs have the form chr:start-end
        df["oe_interval_ID"] = df["oeChr"] + ":" + \
            df["oeStart"].apply(str) + "-" + \
            df["oeEnd"].apply(str)

        df["bait_interval_ID"] = df["baitChr"] + ":" + \
            df["baitStart"].apply(str) + "-" + \
            df["baitEnd"].apply(str)

        # Interaction IDs have the form <bait interval>_<other-end interval>
        df["interaction_ID"] = df["bait_interval_ID"] + "_" + df["oe_interval_ID"]

        # Record the unique IDs before any filtering; the bait intervals are
        # needed later to recognise promoter-to-promoter interactions.
        self.bait_ID = df["baitID"].unique()
        self.oe_ID = df["oeID"].unique()
        self.bait_interval_ID = df["bait_interval_ID"].unique()
        self.oe_interval_ID = df["oe_interval_ID"].unique()

        self.df = df

    def _filter_file_(self):
        """Apply the requested filters to the formatted table."""
        df = self.df

        # Drop the off target baits (the result must be assigned back,
        # otherwise the filter has no effect)
        if self.drop_off_target_bait:
            df = df[df["baitName"] != "off_target"]

        # Drop the off target other ends
        if self.drop_off_target_oe:
            df = df[df["oeName"] != "off_target"]

        # Drop the trans chromosomal interactions
        if self.drop_trans_chrom:
            # "." marks rows without a distance value
            df = df[df["dist"] != "."]
            df = df[df["dist"].apply(float) != 0]
            df = df.dropna(subset=["dist"])
            df = df[df["baitChr"] == df["oeChr"]]

        # Keep rows whose score reaches the threshold
        if self.score_col:
            df = df[df[self.score_col] >= self.score_val]

        # Drop promoter to promoter interactions: keep other ends without a
        # name and discard other ends that are themselves bait intervals
        if self.remove_p2p:
            df = df[df["oeName"] == "."]
            df = df[~df["oe_interval_ID"].isin(self.bait_interval_ID)]

        # Store an independent frame so columns can be added to it later
        # without writing into a slice of the formatted table
        self.df = df.copy()

    def _get_PIR_df_(self):
        """Build ``self.pir_df``: the unique other-end (PIR) intervals."""
        pir_cols = ["oeChr", "oeStart", "oeEnd", "oe_interval_ID"]
        self.pir_df = self.df[pir_cols].drop_duplicates(
            subset=["oeChr", "oeStart", "oeEnd"], keep="first")

    def _get_bait_df_(self):
        """Build ``self.bait_df``: the unique bait intervals."""
        bait_cols = ["baitChr", "baitStart", "baitEnd", "bait_interval_ID"]
        self.bait_df = self.df[bait_cols].drop_duplicates(
            subset=["baitChr", "baitStart", "baitEnd"], keep="first")

    def _get_combined_df_(self):
        """Build ``self.unique_features``: PIR rows followed by bait rows."""
        shared_cols = ["Chr", "Start", "Stop", "ID"]

        pirs = self.pir_df.copy()
        baits = self.bait_df.copy()

        # Give both frames the same column names so they stack cleanly
        pirs.columns = shared_cols
        baits.columns = shared_cols

        self.unique_features = pd.concat([pirs, baits])


In [3]:
def counts_dict(input_filename: str):
    """Load a five-column, tab-separated counts file as an ID -> count lookup.

    The file is read without a header row; its columns are taken to be
    chr, start, stop, ID and counts, in that order.

    Args:
        input_filename (str): Input filename

    Returns:
        dict: Dictionary mapping Feature IDs to counts
    """
    column_names = ["chr", "start", "stop", "ID", "counts"]
    table = pd.read_csv(input_filename, sep="\t", header=None, names=column_names)

    # Index the counts by feature ID and hand them back as a plain dict
    return table.set_index("ID")["counts"].to_dict()

In [84]:
class GeneExpressionAnalysis(object):
    """Relate a per-gene feature count column to mean gene expression.

    On construction the input table is filtered, the Spearman correlation
    between ``feature_col`` and ``mean_col`` is computed, and a long-format
    table of feature count vs. mean expression is written to
    ``output_filename + ".tsv"``.

    Attributes set during construction:
        df: Filtered copy of the input table with an extra
            ``GeneName_MeanExpression`` helper column.
        corr, pval: Spearman correlation coefficient and p-value.
        pir_count_v_mean: Table with columns ``feature_col``, ``Gene_Name``
            and ``Mean_Gene_Expression``, ordered by feature count.
    """
    def __init__(self,
                 df: pd.DataFrame,
                 feature_col: str,
                 mean_col: str,
                 title: str = "",
                 xlabel: str = "",
                 log: bool = True,
                 output_filename: str = "",
                 non_zero: bool = True,
                 drop_na: bool = True
                 ):
        """Gene Expression Analysis of pcHiC associated feature counts

        Args:
            df (pd.DataFrame): Gene table; must contain a ``GeneName`` column
                plus ``feature_col`` and ``mean_col``. It is not modified.
            feature_col (str): Column holding the per-gene feature count.
            mean_col (str): Column holding the mean gene expression.
            title (str): Stored on the instance; not used by the code in
                this class (the plotting call is commented out).
            xlabel (str): Stored on the instance; not used by the code in
                this class.
            log (bool): Stored on the instance; not used by the code in
                this class.
            output_filename (str): Output path without extension; ``".tsv"``
                is appended when the result table is written.
            non_zero (bool): Keep only rows whose ``mean_col`` is > 0.
            drop_na (bool): Drop rows with missing ``feature_col`` or
                ``mean_col`` values.
        """
        self.df = df
        self.feature_col = feature_col
        self.mean_col = mean_col
        self.log = log
        self.output_filename = output_filename
        self.nonzero = non_zero
        self.dropna = drop_na
        self.xlabel = xlabel
        self.title = title

        # Filter the dataframe based on any specific criteria
        self._filter_df_()

        # Calculate the spearman correlation of the mean gene expression and the feature counts column
        self._calculate_spearman_()

        self._get_PIR_count_v_mean_()

        #self.plot_expression_analysis()

    def _filter_df_(self):
        """Filter the table and add the ``GeneName_MeanExpression`` column."""
        working = self.df

        if self.nonzero:
            working = working[working[self.mean_col] > 0]

        if self.dropna:
            working = working[working[self.feature_col].notna()]
            working = working[working[self.mean_col].notna()]

        # Copy before adding a column: otherwise the new column is written
        # into a slice, or into the caller's own frame when no filter ran.
        working = working.copy()

        # Pack gene name and mean expression into one string so both travel
        # together through the groupby/explode in _get_PIR_count_v_mean_
        working["GeneName_MeanExpression"] = working["GeneName"] \
            + " " + working[self.mean_col].apply(str)

        self.df = working

    def _calculate_spearman_(self):
        """Store the Spearman correlation and p-value as ``corr`` / ``pval``."""
        self.corr, self.pval = spearmanr(self.df[self.feature_col], self.df[self.mean_col])

    def _get_PIR_count_v_mean_(self):
        """Create the dataframe of the number of features overlapping PIRs by mean gene expression

        First you groupby the feature counts column. Then you find all of the mean gene expression values
        associated with the number of features overlapping a PIR. The output is a dataframe that can be plotted
        and is also written to ``output_filename + ".tsv"``.
        """
        # One row per (feature count, gene) pair, ordered by feature count
        grouped = self.df.groupby([self.feature_col])["GeneName_MeanExpression"].apply(list)
        self.pir_count_v_mean = pd.DataFrame(grouped).reset_index().explode("GeneName_MeanExpression")

        self.pir_count_v_mean[self.feature_col] = self.pir_count_v_mean[self.feature_col].apply(int)

        # Split on the LAST space only: the expression value never contains a
        # space, but a gene name might, and splitting from the left would then
        # yield more than two columns.
        self.pir_count_v_mean[["Gene_Name", "Mean_Gene_Expression"]] = \
            self.pir_count_v_mean["GeneName_MeanExpression"].str.rsplit(" ", n=1, expand=True)

        self.pir_count_v_mean.drop("GeneName_MeanExpression", axis=1, inplace=True)

        self.pir_count_v_mean["Mean_Gene_Expression"] = self.pir_count_v_mean["Mean_Gene_Expression"].apply(float)

        self.pir_count_v_mean.to_csv(self.output_filename + ".tsv", sep="\t", index=False)



## Getting PIRs From CHiCAGO output file

I wrote a Python class to work with the CHiCAGO output text file. It filters out specific types of interactions, such as promoter-to-promoter or trans-chromosomal interactions, and exposes the remaining PIRs.

Example:

```python
input_file = "../data/CHICAGO/hg38/inputs/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt"

ILC3_data = ChicagoData(input_file)

ILC3_data = ChicagoData(input_file, 
                        drop_off_target_bait=True, 
                        drop_off_target_oe=False, 
                        drop_trans_chrom=True,
                        score_col="merged_score",
                        score_val=5,
                        remove_p2p=True)
                        
ILC3_data.pir_df
```

In [5]:
# Path to the gzipped CHiCAGO output table for the ILC sample
# (relative to the notebook's working directory)
input_file = "../data/CHICAGO/hg38/inputs/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt.gz"

In [6]:
# Load the ILC table: keep cis interactions with merged_score >= 5 and
# remove promoter-to-promoter interactions
ILC3_data = ChicagoData(
    filename=input_file,
    drop_off_target_bait=True,
    drop_off_target_oe=False,
    drop_trans_chrom=True,
    score_col="merged_score",
    score_val=5,
    remove_p2p=True,
)

In [7]:
# Show the unique other-end (PIR) intervals that remain after filtering
ILC3_data.pir_df

Unnamed: 0,oeChr,oeStart,oeEnd,oe_interval_ID
0,chr1,1,11159,chr1:1-11159
1,chr1,916863,923071,chr1:916863-923071
4,chr1,939433,944716,chr1:939433-944716
5,chr1,972542,978821,chr1:972542-978821
8,chr1,989911,995125,chr1:989911-995125
...,...,...,...,...
84672,chrY,13457330,13463420,chrY:13457330-13463420
84673,chrY,13468452,13473885,chrY:13468452-13473885
84674,chrY,13473886,13478002,chrY:13473886-13478002
84675,chrY,13481458,13487154,chrY:13481458-13487154


## Processing Files

In [8]:
# Print the working directory; the relative "../data" paths used below are resolved against it
pwd

'/Users/caz3so/workspaces/tacazares/pchic/notebooks'

In [9]:
# Gzipped CHiCAGO tables used in this notebook.
# ILC sample (file name carries the "ABC" tag)
ilc3_file = "../data/CHICAGO/hg38/inputs/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt.gz"
# CD4 sample, file without the "ABC" tag
cd4_file = "../data/CHICAGO/hg38/inputs/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm.txt.gz"
# CD4 sample, file with the "ABC" tag
cd4_ABC = "../data/CHICAGO/hg38/inputs/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt.gz"

In [10]:
# Directory for the PIR BED files (used by the commented-out export cells below)
out_dir = "../data/CHICAGO/hg38/PIR"

In [11]:
# Every sample is loaded with the same filtering options
chicago_filter_opts = dict(
    drop_off_target_bait=True,
    drop_off_target_oe=False,
    drop_trans_chrom=True,
    score_col="merged_score",
    score_val=5,
    remove_p2p=True,
)

# Import ILC3 data
ilc3 = ChicagoData(ilc3_file, **chicago_filter_opts)

# Import cd4 file CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm.txt
cd4 = ChicagoData(cd4_file, **chicago_filter_opts)

# Import cd4_abc file CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt
cd4_abc = ChicagoData(cd4_ABC, **chicago_filter_opts)

### Write PIR to files

In [12]:
#ilc3.pir_df.to_csv(os.path.join(out_dir, "ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm_PIR.bed.gz"), sep="\t", header=False, index=False)
#cd4.pir_df.to_csv(os.path.join(out_dir, "CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm_PIR.bed.gz"), sep="\t", header=False, index=False)
#cd4_abc.pir_df.to_csv(os.path.join(out_dir, "CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm_PIR.bed.gz"), sep="\t", header=False, index=False)

In [13]:
#ilc3.unique_features.to_csv(os.path.join(out_dir, "ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm_PIRwPromoters.bed.gz"), sep="\t", header=False, index=False)
#cd4.unique_features.to_csv(os.path.join(out_dir, "CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm_PIRwPromoters.bed.gz"), sep="\t", header=False, index=False)
#cd4_abc.unique_features.to_csv(os.path.join(out_dir, "CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm_PIRwPromoters.bed.gz"), sep="\t", header=False, index=False)

## Expression analysis

### Intersect PIR with genetic features

I used `bedtools intersect -c` to count the number of features that overlapped PIR intervals. 

```bash
for PIR_BED in /Users/caz3so/workspaces/tacazares/pchic/data/CHICAGO/hg38/PIR/CD4*bed.gz;
    do
        bedtools intersect -a ${PIR_BED} -b ../data/ATAC/CD4_ATAC_peaks.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapATAC.bed
        bedtools intersect -a ${PIR_BED} -b ../data/CHIP/S008H1H1.ERX547940.H3K27ac.bwa.GRCh38.20150527.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapH3K27ac.bed
        bedtools intersect -a ${PIR_BED} -b ../data/CHIP/S008H1H1.ERX547958.H3K4me3.bwa.GRCh38.20150527.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapH3K4me3.bed
        bedtools intersect -a ${PIR_BED} -b ../data/RE/CD4_RE.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapRE.bed
    done

for PIR_BED in /Users/caz3so/workspaces/tacazares/pchic/data/CHICAGO/hg38/PIR/ILC*bed.gz;
    do
        bedtools intersect -a ${PIR_BED} -b ../data/ATAC/ILC3_ATAC_peaks.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapATAC.bed
        bedtools intersect -a ${PIR_BED} -b ../data/CHIP/ILC3_H3K27ac_peaks.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapH3K27ac.bed
        bedtools intersect -a ${PIR_BED} -b ../data/CHIP/ILC3_H3K4me3_peaks.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapH3K4me3.bed
        bedtools intersect -a ${PIR_BED} -b ../data/RE/ILC3_RE.bed -c > ../data/PIR_overlap/`basename ${PIR_BED} .bed.gz`_overlapRE.bed
    done
    
```

### Import feature counts as a dictionary

This part of the code will import the feature counts as a dictionary that can be used to map counts to features.

In [14]:
# Each call reads one bedtools overlap file and returns a dict that maps a
# PIR interval ID to the number of overlapping features.
# NOTE(review): the bedtools loop shown in this notebook names its outputs
# "<PIR file basename>_overlap<feature>.bed"; the paths below additionally
# contain "_filtered_PIR" -- confirm these files exist under this naming.

# ILC3: H3K27ac, H3K4me3, ATAC and RE overlap counts
ILC3_bins_H3K27ac_dict = counts_dict("../data/PIR_overlap/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapH3K27ac.bed")
ILC3_bins_H3K4me3_dict = counts_dict("../data/PIR_overlap/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapH3K4me3.bed")
ILC3_bins_ATAC_dict = counts_dict("../data/PIR_overlap/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapATAC.bed")
ILC3_bins_RE_dict = counts_dict("../data/PIR_overlap/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapRE.bed")

# CD4 (ABC file): H3K27ac, H3K4me3, ATAC and RE overlap counts
CD4_ABC_filtered_H3K27ac_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapH3K27ac.bed")
CD4_ABC_filtered_H3K4me3_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapH3K4me3.bed")
CD4_ABC_filtered_PIR_ATAC_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapATAC.bed")
CD4_ABC_filtered_PIR_RE_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm_filtered_PIR_overlapRE.bed")

# CD4 (file without the ABC tag): H3K27ac, H3K4me3, ATAC and RE overlap counts
CD4_filtered_H3K27ac_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm_filtered_PIR_overlapH3K27ac.bed")
CD4_filtered_H3K4me3_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm_filtered_PIR_overlapH3K4me3.bed")
CD4_filtered_ATAC_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm_filtered_PIR_overlapATAC.bed")
CD4_filtered_RE_dict = counts_dict("../data/PIR_overlap/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_peakm_filtered_PIR_overlapRE.bed")

### Import gene counts

This section imports the gene expression data and computes the mean expression per group. The metadata for the files used can be found in [`../data/RNA/meta/20200707_CD4_ILC3_meta.txt`](../data/RNA/meta/20200707_CD4_ILC3_meta.txt).

In [15]:
# Load the tab-separated gene count table (first row is the header)
GENE_COUNTS = pd.read_csv(
    "../data/RNA/counts/20200706_hILC3_CD4_GeneCounts.tsv",
    sep="\t",
    header=0,
)

# Sample columns that make up each group
ilc3_sample_cols = ["SRX5797708", "SRX5797709", "SRX5797712", "SRX5797713", "SRX5797716", "SRX5797717"]
cd4_sample_cols = ["SRR4290846", "SRR4290852", "SRR4290853", "SRR4290854"]

# Row-wise mean across the samples of each group
GENE_COUNTS["ILC3_mean"] = GENE_COUNTS[ilc3_sample_cols].mean(axis=1)
GENE_COUNTS["CD4_mean"] = GENE_COUNTS[cd4_sample_cols].mean(axis=1)

### Map counts to interactions

This section will map the number of features that overlap the PIRs. A column is created that is specific for each feature. 

In [16]:
# For every sample, pair each new count column with the lookup that was built
# from the matching overlap file; the column is filled by looking up each
# interaction's other-end interval ID.
count_lookups = (
    # ILC
    (ilc3, {
        "H3K27ac_counts": ILC3_bins_H3K27ac_dict,
        "H3K4me3_counts": ILC3_bins_H3K4me3_dict,
        "ATAC_counts": ILC3_bins_ATAC_dict,
        "RE_counts": ILC3_bins_RE_dict,
    }),
    # CD4
    (cd4, {
        "H3K27ac_counts": CD4_filtered_H3K27ac_dict,
        "H3K4me3_counts": CD4_filtered_H3K4me3_dict,
        "ATAC_counts": CD4_filtered_ATAC_dict,
        "RE_counts": CD4_filtered_RE_dict,
    }),
    # CD4 ABC
    (cd4_abc, {
        "H3K27ac_counts": CD4_ABC_filtered_H3K27ac_dict,
        "H3K4me3_counts": CD4_ABC_filtered_H3K4me3_dict,
        "ATAC_counts": CD4_ABC_filtered_PIR_ATAC_dict,
        "RE_counts": CD4_ABC_filtered_PIR_RE_dict,
    }),
)

for chicago, lookups in count_lookups:
    for count_col, lookup in lookups.items():
        chicago.df[count_col] = chicago.df["oe_interval_ID"].map(lookup)


### Map Feature Counts to Genes

We will then map the different feature counts to the genes that we are interested in. We will create new columns in the gene expression dataframe that contain this information. 

* Enhancer counts are the number of PIR interactions recorded for the gene's bait (rows grouped by `baitName`). This uses the groupby `count` function.
* The H3K27ac, H3K4me3, ATAC, and RE features are summed up across PIRs. 

In [17]:
# (column prefix, enhancer-count column name, ChicagoData object) per sample.
# The CD4 enhancer column keeps its "counts" spelling because the expression
# analysis cells further down reference it by that exact name.
gene_feature_samples = (
    ("ILC3", "ILC3_enhancer_count_bins", ilc3),
    ("CD4", "CD4_enhancer_counts_bins", cd4),
    ("CD4_ABC", "CD4_ABC_enhancer_count_bins", cd4_abc),
)

# Feature count columns that are summed across each bait's interactions
summed_features = ("H3K27ac", "H3K4me3", "ATAC", "RE")

for prefix, enhancer_col, chicago in gene_feature_samples:
    # Group once per sample and select the column BEFORE aggregating, so
    # only the needed column is summed (not every column of the table,
    # which includes string columns).
    by_bait = chicago.df.groupby("baitName")

    # Number of interactions recorded for each bait name
    GENE_COUNTS[enhancer_col] = GENE_COUNTS["GeneName"].map(by_bait["baitChr"].count())

    # Feature overlap counts summed over each bait's interactions
    for feature in summed_features:
        GENE_COUNTS[f"{prefix}_{feature}_count_bins"] = GENE_COUNTS["GeneName"].map(
            by_bait[f"{feature}_counts"].sum()
        )


In [18]:
# Show the gene table with the newly mapped per-sample count columns
GENE_COUNTS

Unnamed: 0,GeneName,SRX5797708,SRX5797709,SRX5797712,SRX5797713,SRX5797716,SRX5797717,SRR4290846,SRR4290852,SRR4290853,...,CD4_enhancer_counts_bins,CD4_H3K27ac_count_bins,CD4_H3K4me3_count_bins,CD4_ATAC_count_bins,CD4_RE_count_bins,CD4_ABC_enhancer_count_bins,CD4_ABC_H3K27ac_count_bins,CD4_ABC_H3K4me3_count_bins,CD4_ABC_ATAC_count_bins,CD4_ABC_RE_count_bins
0,A1BG,876,508,575,276,222,387,774,777,474,...,,,,,,2.0,0.0,1.0,3.0,3.0
1,A1CF,1,8,5,6,4,7,36,31,38,...,3.0,0.0,0.0,0.0,0.0,5.0,0.0,2.0,6.0,6.0
2,A2M,20,50,44,66,0,21,1122,638,172,...,,,,,,2.0,1.0,0.0,2.0,2.0
3,A2ML1,37,84,76,102,49,65,80,90,78,...,2.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,4.0,4.0
4,A2MP1,0,21,1,4,0,3,273,168,11,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22454,ZXDC,232,354,129,156,485,264,885,772,622,...,,,,,,,,,,
22455,ZYG11A,71,179,89,141,76,148,306,275,232,...,,,,,,2.0,1.0,1.0,7.0,6.0
22456,ZYG11B,65,908,577,719,84,440,2231,1996,1426,...,2.0,0.0,1.0,2.0,2.0,5.0,2.0,2.0,13.0,11.0
22457,ZYX,513,703,180,452,196,501,1645,1788,1678,...,,,,,,6.0,0.0,0.0,10.0,10.0


### Plot Gene Expression Analysis

#### ILC3

In [87]:
# ILC3 mean expression vs. ATAC counts summed over each gene's PIRs
ILC3_expression_atac = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="ILC3_ATAC_count_bins",
    mean_col="ILC3_mean",
    output_filename="../data/expression_PIR_analysis/ILC3_mean_expression_PIRwATAC",
)

In [88]:
# ILC3 mean expression vs. H3K27ac counts summed over each gene's PIRs
ILC3_expression_acety = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="ILC3_H3K27ac_count_bins",
    mean_col="ILC3_mean",
    output_filename="../data/expression_PIR_analysis/ILC3_mean_expression_PIRwH3K27ac",
)

In [89]:
# ILC3 mean expression vs. H3K4me3 counts summed over each gene's PIRs
ILC3_expression_methyl = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="ILC3_H3K4me3_count_bins",
    mean_col="ILC3_mean",
    output_filename="../data/expression_PIR_analysis/ILC3_mean_expression_PIRwH3K4me3",
)

In [90]:
# ILC3 mean expression vs. the number of PIR interactions per gene.
# Uses the enhancer-count column (not the ATAC column), matching the
# "PIRwEnhancer" output name and the CD4 / CD4 ABC enhancer analyses.
ILC3_expression_enhancer = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="ILC3_enhancer_count_bins",
    mean_col="ILC3_mean",
    output_filename="../data/expression_PIR_analysis/ILC3_mean_expression_PIRwEnhancer",
)

#### CD4

In [91]:
# CD4 mean expression vs. ATAC counts summed over each gene's PIRs
cd4_expression_atac = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_ATAC_count_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_mean_expression_PIRwATAC",
)

In [92]:
# CD4 mean expression vs. H3K27ac counts summed over each gene's PIRs
cd4_expression_acety = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_H3K27ac_count_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_mean_expression_PIRwH3K27ac",
)

In [93]:
# CD4 mean expression vs. H3K4me3 counts summed over each gene's PIRs
cd4_expression_methyl = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_H3K4me3_count_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_mean_expression_PIRwH3K4me3",
)

In [94]:
# CD4 mean expression vs. the number of PIR interactions per gene
cd4_expression_enhancer = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_enhancer_counts_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_mean_expression_PIRwEnhancer",
)

#### CD4 ABC

In [95]:
# CD4 ABC: mean CD4 expression vs. ATAC counts summed over each gene's PIRs
cd4_abc_expression_atac = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_ABC_ATAC_count_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_ABC_mean_expression_PIRwATAC",
)

In [96]:
# CD4 ABC: mean CD4 expression vs. H3K27ac counts summed over each gene's PIRs
cd4_abc_expression_acety = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_ABC_H3K27ac_count_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_ABC_mean_expression_PIRwH3K27ac",
)

In [97]:
# CD4 ABC: mean CD4 expression vs. H3K4me3 counts summed over each gene's PIRs
cd4_abc_expression_methyl = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_ABC_H3K4me3_count_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_ABC_mean_expression_PIRwH3K4me3",
)

In [99]:
# CD4 ABC: mean CD4 expression vs. the number of PIR interactions per gene
cd4_abc_expression_enhancer = GeneExpressionAnalysis(
    df=GENE_COUNTS,
    feature_col="CD4_ABC_enhancer_count_bins",
    mean_col="CD4_mean",
    output_filename="../data/expression_PIR_analysis/CD4_ABC_mean_expression_PIRwEnhancer",
)