# ChicagoData detailed walkthrough

## Imports and Functions

In [1]:
import pandas as pd

In [9]:
class ChicagoData(object):
    """Import CHICACO data
    """
    def __init__(self,
                 filename: str,
                 dropna: bool = True,
                 drop_off_target: bool = True,
                 drop_trans_chrom: bool = True,
                 score_col: str = None,
                 score_val: int = 5
                 ):
        """Initialize the object

        Args:
            filename (str): Input CHICAGO txt file
            dropna (bool, optional): Drop the interactions with NA. Defaults to =True.
            drop_off_target (bool, optional): Drop off target interactions. Defaults to =True.
            drop_trans_chrom (bool, optional): Drop transchromosomal interactions. Defaults to =True.
        """
        # Set filename to the input filename
        self.filename = filename
        # Set whether to drop na interactions
        self.dropna = dropna
        # Set whether to drop off target
        self.drop_off_target = drop_off_target
        # Set whether to drop transchromosomal interactions
        self.drop_trans_chrom = drop_trans_chrom
        # Score column name for filtering
        self.score_col = score_col
        # Score value threshold
        self.score_val = score_val
        
        # Read file into DF
        self._read_file_()
        
        # Format the DF
        self._format_file_()
        
        # Filter the formatted DF
        self._filter_file_()

        # Get the PIR df
        self._get_PIR_df_()
        
    def _read_file_(self):
        """Read in original file
        """
        # Read in original file and save
        self.input_df =  pd.read_csv(self.filename, sep="\t", header=0, low_memory=False)
    
    def _format_file_(self):
        """Format CHICAGO file
        """
        # Create a copy of the raw input to be manipulated
        df = self.input_df.copy()
        
        # Format the chromosome names
        df["baitChr"] = "chr" + df["baitChr"].apply(str)
        df["oeChr"] = "chr" + df["oeChr"].apply(str)
        
        # Create an ID column that can be used to track the intervals
        df["ID"] = df["baitChr"] + ":" + \
                   df["baitStart"].apply(str) + "-" + \
                   df["baitEnd"].apply(str) + "_" + \
                   df["oeChr"] + ":" + \
                   df["oeStart"].apply(str) + "-" + \
                   df["oeEnd"].apply(str)

        # Set the variable to the formatted df
        self.df = df
    
    def _filter_file_(self):
        """Filter the formatted CHICAGO results
        """
        # There should not be any NA values in this column, but we will double check
        if self.dropna:
            self.df.dropna(subset=["dist"], inplace=True)

        # Drop the off target baits
        if self.drop_off_target_bait:
            self.df[self.df["baitName"] != "off_target"]

        # Drop the off target OE names
        if self.drop_off_target_oe:
            self.df[self.df["oeName"] != "off_target"]

        # Drop the trans chromosomal interactions
        if self.drop_trans_chrom:
            self.df = self.df[self.df["baitChr"] == self.df["oeChr"]]

        # Filter the specific score column by a specific value
        if self.score_col:
            self.df = self.df[self.df[self.score_col] >= self.score_val]
            
    def _get_PIR_df_(self):
        """Get a DF of all PIR interactions
        """
        self.pir_df = self.df[["oeChr", "oeStart", "oeEnd", "OE_width"]].drop_duplicates(subset=["oeChr", "oeStart", "oeEnd"], keep="first")

    def write_PIR_bed(self, output_filename): 
        """Write PIRs from the filtered CHICAGO results to a bed file
        """       
        self.pir_df.to_csv(output_filename, sep="\t", header=False, index=False)


## Walkthrough

We will use the ILC_5kb file as the basis for this example. This file is representative of the data files that we will be working with when we get CHiCAGO data. 

In [17]:
test_input = "/Users/caz3so/scratch/20220606_spivakov_pchic_reanalysis/TransferXL-089FGscZhgKG8/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm.txt"

###  self._format_file_()
This first method will import the file and do some initial parsing.        

In [25]:
df = pd.read_table(test_input, sep="\t")

  df = pd.read_table(test_input, sep="\t")


#### Column Description
* `baitChr`: The chromosome that the bait is located on
* `baitStart`: The start position of the bait
* `baitEnd`: The end position of the bait
* `baitID`: The bait ID
* `baitName`: The bait name
* `oeChr`: The other end chromosome
* `oeStart`: The other end start position
* `oeEnd`: The other end End positions
* `oeID`: The other end ID
* `oeName`: The other end name
* `dist`: distance between bait and other end
* `ABC.Score`: ABC model scores
* `score`: Chicago score
* `merged_score`: Composite score for Chicago and ABC models
* `TargetGene`: The targeted gene
* `remove_line`: Ignore

In [26]:
df.columns

Index(['baitChr', 'baitStart', 'baitEnd', 'baitID', 'baitName', 'oeChr',
       'oeStart', 'oeEnd', 'oeID', 'oeName', 'dist', 'ABC.Score', 'score',
       'merged_score', 'TargetGene', 'remove_line'],
      dtype='object')

In [27]:
df.head()

Unnamed: 0,baitChr,baitStart,baitEnd,baitID,baitName,oeChr,oeStart,oeEnd,oeID,oeName,dist,ABC.Score,score,merged_score,TargetGene,remove_line
0,1,28115,29860,5,WASH7P,1,1,11159,1,.,-23407.5,0.073499,0.0,5.0,WASH7P,False
1,1,924192,925743,152,SAMD11,1,916863,923071,149,.,-5000.5,0.041623,0.0,5.0,SAMD11,False
2,1,924192,925743,152,SAMD11,1,916863,923071,149,.,-5000.5,0.049836,0.0,5.0,SAMD11,False
3,1,924192,925743,152,SAMD11,1,929679,932076,155,SAMD11,5910.0,0.049944,0.0,5.0,SAMD11,False
4,1,924192,925743,152,SAMD11,1,939433,944716,159,.,17107.0,0.033046,0.0,5.0,SAMD11,False


We start by reformatting the columns. The chromosome string is added to the `baitChr` and `oeChr` columns. 

In [28]:
# Format the chromosome names
df["baitChr"] = "chr" + df["baitChr"].apply(str)
df["oeChr"] = "chr" + df["oeChr"].apply(str)


In [29]:
df["oeChr"][0]

'chr1'

A column is then created to track each interaction pair. 

In [30]:
# Create an ID column that can be used to track the intervals
df["ID"] = df["baitChr"] + ":" + \
            df["baitStart"].apply(str) + "-" + \
            df["baitEnd"].apply(str) + "_" + \
            df["oeChr"] + ":" + \
            df["oeStart"].apply(str) + "-" + \
            df["oeEnd"].apply(str)


In [31]:
df["ID"][0]

'chr1:28115-29860_chr1:1-11159'

### self._filter_file_()

This method will perform the filtering of the file based on the input settings to the class object.

In [34]:
df.shape

(84682, 17)

In [45]:
df.dropna(subset=["dist"]).shape

(84682, 17)

In [38]:
df[df["oeName"] != "off_target"].shape

(84675, 17)

In [37]:
df[df["baitName"] != "off_target"].shape

(84682, 17)

In [40]:
df[df["baitChr"] == df["oeChr"]].shape

(84273, 17)

In [44]:
df[df["merged_score"] >= 5].shape

(84682, 17)

If we want drop promoter to promoter interactions we must remove interactions that have names in both the `baitName` and the `oeName` columns. 

In [54]:
df[df.oeName == "."].shape

(82420, 17)

In [55]:
df[df.oeName != "."].shape

(2262, 17)

In [None]:
# There should not be any NA values in this column, but we will double check
if self.dropna:
    self.df.dropna(subset=["dist"], inplace=True)

# Drop the off target baits
if self.drop_off_target_bait:
    self.df[self.df["baitName"] != "off_target"]

if self.drop_off_target_oe:
    self.df[self.df["oeName"] != "off_target"]

if self.drop_trans_chrom:
    self.df = self.df[self.df["baitChr"] == self.df["oeChr"]]

if self.score_col:
    self.df = self.df[self.df[self.score_col] >= 5]
