# Analysis of pcHi-C data from human CD4+ T cells and ILC3 cells

This notebook will walk through the analysis of pcHi-C data generated by Mikhail Spivakov's group. We will also analyze CD4+ alpha-beta T-cells in addition to the ILC3 data as a positive control sample.

In [306]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [372]:
class ChicagoData(object):
    """Import CHICACO data
    """
    def __init__(self,
                 filename: str,
                 drop_off_target_bait: bool = True,
                 drop_off_target_oe: bool = True,
                 drop_trans_chrom: bool = True,
                 score_col: str = None,
                 score_val: int = 5,
                 remove_p2p: bool= True
                 ):
        """Initialize the object

        Args:
            filename (str): Input CHICAGO txt file
            drop_off_target (bool, optional): Drop off target interactions. Defaults to =True.
            drop_trans_chrom (bool, optional): Drop transchromosomal interactions. Defaults to =True.
        """
        # Set filename to the input filename
        self.filename = filename
        # Set whether to drop off target baits
        self.drop_off_target_bait = drop_off_target_bait
        # Set whether to drop off target oe
        self.drop_off_target_oe = drop_off_target_oe
        # Set whether to drop transchromosomal interactions
        self.drop_trans_chrom = drop_trans_chrom
        # Score column name for filtering
        self.score_col = score_col
        # Score value threshold
        self.score_val = score_val
        # Remove promoter to promoter interactions
        self.remove_p2p = remove_p2p
        
        # Read file into DF
        self._read_file_()
        
        # Format the DF
        self._format_file_()
        
        # Filter the formatted DF
        self._filter_file_()
        
        # Get the PIR df
        self._get_PIR_df_()
        
        # Get the bait df
        self._get_bait_df_()
        
        # Get the combined df
        self._get_combined_df_()
        
    def _read_file_(self):
        """Read in original file
        """
        # Read in original file and save
        self.input_df =  pd.read_csv(self.filename, sep="\t", header=0, low_memory=False)
    
    def _format_file_(self):
        """Format CHICAGO file
        """
        # Create a copy of the raw input to be manipulated
        df = self.input_df.copy()
        
        # Format the chromosome names
        df["baitChr"] = "chr" + df["baitChr"].apply(str)
        df["oeChr"] = "chr" + df["oeChr"].apply(str)
        
        df["oe_interval_ID"] = df["oeChr"] + ":" + \
                   df["oeStart"].apply(str) + "-" + \
                   df["oeEnd"].apply(str)

        df["bait_interval_ID"] = df["baitChr"] + ":" + \
                   df["baitStart"].apply(str) + "-" + \
                   df["baitEnd"].apply(str)

        df["interaction_ID"] = df["bait_interval_ID"] + "_" + df["oe_interval_ID"].apply(str)
        
        # Set the variable to the formatted df
        self.bait_ID = df["baitID"].unique()
        
        self.bait_interval_ID = df["bait_interval_ID"].unique()
        
        self.oe_interval_ID = df["oe_interval_ID"].unique()

        self.df = df

    def _filter_file_(self):
        """Filter the formatted CHICAGO results
        """
        # Drop the off target baits
        if self.drop_off_target_bait:
            self.df[self.df["baitName"] != "off_target"]

        # Drop the off target OE names
        if self.drop_off_target_oe:
            self.df[self.df["oeName"] != "off_target"]

        # Drop the trans chromosomal interactions
        if self.drop_trans_chrom:
            self.df = self.df[self.df["dist"] != "."]
                        
            self.df = self.df[self.df["dist"].apply(float) != 0]

            self.df = self.df.dropna(subset=["dist"])

            self.df = self.df[self.df["baitChr"] == self.df["oeChr"]]

        # Filter the specific score column by a specific value
        if self.score_col:
            self.df = self.df[self.df[self.score_col] >= self.score_val]
            
        # Drop promoter to promoter interactions
        if self.remove_p2p:
            self.df = self.df[self.df.oeName == "."]
        
    def _get_PIR_df_(self):
        """Get a DF of all PIR interactions
        """
        self.pir_df = self.df[["oeChr", "oeStart", "oeEnd", "oe_interval_ID"]].drop_duplicates(subset=["oeChr", "oeStart", "oeEnd"], keep="first")
        
    def _get_bait_df_(self):
        """Get a DF of baits
        """
        self.bait_df = self.df[["baitChr", "baitStart", "baitEnd", "bait_interval_ID"]].drop_duplicates(subset=["baitChr", "baitStart", "baitEnd"], keep="first")
        
    def _get_combined_df_(self):
        """Get a comined DF
        """
        tmp_df = self.pir_df.copy()
        tmp_df2 = self.bait_df.copy()

        tmp_df.columns = ["Chr", "Start", "Stop", "ID"]
        tmp_df2.columns = ["Chr", "Start", "Stop", "ID"]

        self.unique_features = pd.concat([tmp_df, tmp_df2])

## Getting PIRs From CHiCAGO output file

I wrote a python class object to work with the CHiCAGO output text file. This piece of code will perform filtering of specific types of interactions, like promoter-to-promoter, or trans-chromosomal interactions. 

Example:

```python
input_file = "/Users/caz3so/scratch/20220606_spivakov_pchic_reanalysis/TransferXL-089FGscZhgKG8/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt"

ILC3_data = ChicagoData(input_file)

ILC3_data.df(input_file, dropna=True, drop_off_target=True, drop_p2p=False, drop_trans_chrom=True)

ILC3_data.pir_df
```

In [373]:
# Input Chicago File
input_file = "/Users/caz3so/scratch/20220606_spivakov_pchic_reanalysis/TransferXL-089FGscZhgKG8/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt"

In [374]:
ILC3_data = ChicagoData(input_file, 
                        drop_off_target_bait=True, 
                        drop_off_target_oe=False, 
                        drop_trans_chrom=True,
                        score_col="merged_score",
                        score_val=5,
                        remove_p2p=True)

In [375]:
ILC3_data.pir_df[ILC3_data.pir_df["ID"].isin(ILC3_data.bait_intervals)]

KeyError: 'Promoter_ID'

In [350]:
bait_df = ILC3_data.df[ILC3_data.df["oeID"].isin(bait_list)]

In [351]:
bait_list

array([     5,    152,    169, ..., 577467, 577469, 578920])

In [352]:
promoter_df = ILC3_data.df[ILC3_data.df["Promoter_ID"].isin(promoter_list)]

In [353]:
ILC3_data.df[ILC3_data.df["Promoter_ID"].isin(promoter_list)]

Unnamed: 0,baitChr,baitStart,baitEnd,baitID,baitName,oeChr,oeStart,oeEnd,oeID,oeName,dist,ABC.Score,score,merged_score,TargetGene,remove_line,ID,Promoter_ID
147,chr1,3795548,3796784,794,LRRC47,chr1,3623587,3628735,754,.,-170005,0.000000,5.136701,5.136701,LRRC47,False,chr1:3795548-3796784_chr1:3623587-3628735,chr1_3623587_3628735
315,chr1,9651196,9651808,1993,PIK3CD,chr1,9651809,9657236,1994,.,3020.5,0.112504,0.000000,5.000000,PIK3CD,False,chr1:9651196-9651808_chr1:9651809-9657236,chr1_9651809_9657236
620,chr1,16612691,16613430,3511,NBPF1,chr1,16719862,16725284,3532,.,109512.5,0.000000,6.171719,6.171719,,False,chr1:16612691-16613430_chr1:16719862-16725284,chr1_16719862_16725284
631,chr1,16757701,16762788,3539,LOC102724562,chr1,16762789,16767906,3540,.,5103,0.031433,0.000000,5.000000,LOC102724562,False,chr1:16757701-16762788_chr1:16762789-16767906,chr1_16762789_16767906
747,chr1,21782427,21784115,4605,USP48,chr1,22023469,22028652,4661,.,242789.5,0.000000,7.434248,7.434248,USP48,False,chr1:21782427-21784115_chr1:22023469-22028652,chr1_22023469_22028652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83811,chrX,63350917,63351798,557438,SPIN4,chrX,63556449,63561756,557477,.,207745,0.000000,7.947093,7.947093,SPIN4,False,chrX:63350917-63351798_chrX:63556449-63561756,chrX_63556449_63561756
84144,chrX,102599489,102599831,564971,ARMCX5,chrX,102649138,102651496,564983,.,50657,0.000000,5.479244,5.479244,ARMCX5,False,chrX:102599489-102599831_chrX:102649138-102651496,chrX_102649138_102651496
84432,chrX,135342948,135344099,571392,ZNF75D,chrX,135417323,135421949,571408,.,76112.5,0.000000,18.360635,18.360635,ZNF75D,False,chrX:135342948-135344099_chrX:135417323-135421949,chrX_135417323_135421949
84447,chrX,135520266,135522228,571429,INTS6L,chrX,135417323,135421949,571408,.,-101611,0.000000,6.384263,6.384263,INTS6L,False,chrX:135520266-135522228_chrX:135417323-135421949,chrX_135417323_135421949


In [337]:
ILC3_data.input_df[ILC3_data.input_df["Promoter_ID"] == "chr1_3623587_3628735"]

Unnamed: 0,baitChr,baitStart,baitEnd,baitID,baitName,oeChr,oeStart,oeEnd,oeID,oeName,dist,ABC.Score,score,merged_score,TargetGene,remove_line,Promoter_ID
147,1,3795548,3796784,794,LRRC47,1,3623587,3628735,754,.,-170005,0.0,5.136701,5.136701,LRRC47,False,chr1_3623587_3628735
