In [9]:
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import spearmanr
import pybedtools
import os

In [63]:
class ModExpressionAnalysis(object):
    """Relate feature counts at PIRs of an interaction table to gene expression.

    Constructing an instance runs the whole pipeline: the tab-separated
    interaction table is read, the unique PIR intervals are extracted, each
    feature BED file is intersected with those intervals, the per-interaction
    counts are summed per bait and attached to the gene expression table, and
    finally a long-format table of count vs. expression is produced.

    Attributes set by the pipeline:
        df: The interaction table, plus an ``ABC_label`` column and one count
            column per feature tag.
        pir_df / PIR_bt: Unique PIR intervals as a DataFrame / BedTool.
        gene_counts: Gene expression table with ``<tag>_count``, ``ABC_count``,
            ``CHiCAGO_count`` and ``GeneName_MeanExpression`` columns.
        pir_count_v_mean: Count vs. expression table (only set when at least
            one feature is supplied).
    """
    def __init__(self,
                 filename: str,
                 features_to_count: dict = None,
                 gene_expression: str = "",
                 nonzero_expression: bool = True,
                 dropna_expression: bool = True,
                 output_dir: str = "",
                 output_basename: str = ""
                 ):
        """Store the configuration and run the full analysis pipeline.

        Args:
            filename (str): Tab-separated interaction table with a header row.
                The columns read by this class are ``ABC.Score``, ``oeChr``,
                ``oeStart``, ``oeEnd``, ``oe_interval_ID`` and ``baitName``.
            features_to_count (dict, optional): Maps a feature file path
                (anything ``pybedtools.BedTool`` accepts) to the tag used as
                the column name for its counts. Defaults to no features.
            gene_expression (str, optional): Tab-separated file with a header
                row and two columns, read as ``GeneName`` and ``Expression``.
            nonzero_expression (bool, optional): Keep only genes whose
                expression is > 0. Defaults to True.
            dropna_expression (bool, optional): Drop gene rows containing any
                missing value. Defaults to True.
            output_dir (str, optional): Directory under which the
                ``PIR_intersection`` folder is written.
            output_basename (str, optional): Prefix of the intersection file
                names.
        """
        # Set filename to the input filename
        self.filename = filename
        # Map feature counts to PIR if provided. A fresh dict is created per
        # instance so that no mutable default is shared between objects.
        self.features_to_count = {} if features_to_count is None else features_to_count
        # Path of the gene expression matrix
        self.gene_expression = gene_expression
        # Only keep non-zero expression
        self.nonzero_expression = nonzero_expression
        # Drop na values
        self.dropna_expression = dropna_expression
        # Set the output directory
        self.output_dir = output_dir
        # Set the output basename
        self.basename = output_basename

        # The steps below depend on each other and must run in this order.
        self._read_file_()
        self._get_PIR_df_()
        self._get_feature_counts_()
        self._import_gene_counts_()
        self._map_feature_counts_to_genes_()
        self._map_ABC_counts_to_genes_()
        self._filter_expression_()
        self._get_PIR_count_v_mean_()

    def _read_file_(self):
        """Read the interaction table into ``self.df`` and label each row.

        Adds an ``ABC_label`` column derived from ``ABC.Score``.
        """
        self.df = pd.read_csv(self.filename, sep="\t", header=0, low_memory=False)

        # NOTE(review): rows whose ABC.Score equals 0 are labelled "ABC" and
        # every other row (including missing scores) "CHiCAGO" -- confirm this
        # is the intended direction of the label.
        self.df["ABC_label"] = self.df["ABC.Score"].apply(lambda x: "ABC" if x == 0 else "CHiCAGO")

    def _get_PIR_df_(self):
        """Build the table of unique PIR intervals and its BedTool.

        Rows are de-duplicated on the three coordinate columns; the first
        ``oe_interval_ID`` seen for a coordinate triple is the one kept.
        """
        pir_columns = ["oeChr", "oeStart", "oeEnd", "oe_interval_ID"]
        self.pir_df = self.df[pir_columns].drop_duplicates(subset=["oeChr", "oeStart", "oeEnd"], keep="first")

        self.PIR_bt = pybedtools.BedTool.from_dataframe(self.pir_df)

    def _get_feature_counts_(self):
        """Count features overlapping each PIR and map the counts onto ``self.df``.

        For every ``file -> tag`` entry a column named ``tag`` is added to
        ``self.df`` and the sorted overlap intervals are written to
        ``<output_dir>/PIR_intersection/<basename>_PIR_intersect_<tag>.bed``.
        """
        if not self.features_to_count:
            return

        output_intersection_dir = os.path.join(self.output_dir, "PIR_intersection")
        # to_csv does not create missing directories, so make sure it exists.
        os.makedirs(output_intersection_dir, exist_ok=True)

        for file, tag in self.features_to_count.items():
            print(f"Importing {file} : Column will be saved as {tag}")

            output_intersection_fname = os.path.join(output_intersection_dir, f"{self.basename}_PIR_intersect_{tag}.bed")

            # Import and convert the features to a BedTool
            feature_bt = pybedtools.BedTool(file)

            # Intersect with c=True to get a per-PIR overlap count
            feature_counts_df = self.PIR_bt.intersect(feature_bt, c=True).to_dataframe()

            # Intersect without options to get the overlapping intervals, sorted
            feature_intersection_df = self.PIR_bt.intersect(feature_bt).sort().to_dataframe()

            # In the resulting frame the interval ID is read from "name" and
            # the appended count from "score".
            counts_dict = pd.Series(feature_counts_df["score"].values, index=feature_counts_df["name"]).to_dict()

            # Map the counts back to the interaction table
            self.df[tag] = self.df["oe_interval_ID"].map(counts_dict)

            feature_intersection_df[["chrom", "start", "end"]].to_csv(output_intersection_fname, sep="\t", index=False, header=False)

    def _import_gene_counts_(self):
        """Read the gene expression file into ``self.gene_counts``.

        The file's own header row is discarded and the columns are renamed to
        ``GeneName`` and ``Expression``.
        """
        self.gene_counts = pd.read_csv(self.gene_expression, sep="\t", header=0, names=["GeneName", "Expression"])

    def _map_feature_counts_to_genes_(self):
        """Add a ``<tag>_count`` column: the per-bait sum of each feature count.

        Genes whose name does not occur in ``baitName`` get a missing value.
        """
        for tag in self.features_to_count.values():
            # Select the column before summing so unrelated (possibly
            # non-numeric) columns are never aggregated.
            per_bait_total = self.df.groupby("baitName")[tag].sum()
            self.gene_counts[f"{tag}_count"] = self.gene_counts["GeneName"].map(per_bait_total)

    def _map_ABC_counts_to_genes_(self):
        """Split each gene's count into ``ABC_count`` and ``CHiCAGO_count``.

        ``ABC_count`` is the per-bait sum over rows labelled "ABC";
        ``CHiCAGO_count`` is ``<tag>_count`` minus ``ABC_count``.
        """
        # The label filter does not depend on the tag, so compute it once.
        abc_rows = self.df[self.df["ABC_label"] == "ABC"]

        # NOTE(review): both output columns are overwritten on every iteration,
        # so with several features only the last tag's values remain.
        # NOTE(review): genes without any "ABC" row get a missing ABC_count
        # (not 0) and are removed later when dropna_expression is True --
        # confirm that is intended.
        for tag in self.features_to_count.values():
            abc_per_bait = abc_rows.groupby("baitName")[tag].sum()

            self.gene_counts["ABC_count"] = self.gene_counts["GeneName"].map(abc_per_bait)

            self.gene_counts["CHiCAGO_count"] = self.gene_counts[f"{tag}_count"] - self.gene_counts["ABC_count"]

    def _filter_expression_(self):
        """Filter ``self.gene_counts`` and add the combined name/expression label.

        Applies the non-zero and drop-na filters when enabled, then adds
        ``GeneName_MeanExpression`` as ``"<GeneName> <Expression>"``.
        """
        gene_counts = self.gene_counts

        if self.nonzero_expression:
            gene_counts = gene_counts[gene_counts["Expression"] > 0]

        if self.dropna_expression:
            gene_counts = gene_counts.dropna()

        # Work on an explicit copy so the new column is not assigned to a
        # filtered view of the original frame.
        gene_counts = gene_counts.copy()
        gene_counts["GeneName_MeanExpression"] = gene_counts["GeneName"] \
            + " " + gene_counts["Expression"].apply(str)

        self.gene_counts = gene_counts

    def _get_PIR_count_v_mean_(self):
        """Create the table of feature count vs. gene expression.

        The gene table is grouped by the ``<tag>_count`` column and each group
        is expanded back to one row per gene, giving the columns
        ``<tag>_count`` (int), ``Gene_Name`` and ``Mean_Gene_Expression``
        (float). Rows are ordered by count and the index holds the group
        position, so it repeats within a group. The result is stored in
        ``self.pir_count_v_mean``; with several features only the last tag's
        table is kept, and with none the attribute is not set.
        """
        for tag in self.features_to_count.values():
            col_name = f"{tag}_count"

            # One row per distinct count holding the list of gene labels,
            # then one row per gene again.
            labels_by_count = self.gene_counts.groupby(col_name)["GeneName_MeanExpression"].apply(list)
            pir_count_v_mean = labels_by_count.reset_index().explode("GeneName_MeanExpression")

            pir_count_v_mean[col_name] = pir_count_v_mean[col_name].apply(int)

            # Split on the LAST space only: the expression value never contains
            # one, while a gene name might, and exactly two columns are needed.
            pir_count_v_mean[["Gene_Name", "Mean_Gene_Expression"]] = pir_count_v_mean["GeneName_MeanExpression"].str.rsplit(" ", n=1, expand=True)

            pir_count_v_mean = pir_count_v_mean.drop("GeneName_MeanExpression", axis=1)

            pir_count_v_mean["Mean_Gene_Expression"] = pir_count_v_mean["Mean_Gene_Expression"].apply(float)

            self.pir_count_v_mean = pir_count_v_mean
                        

In [64]:
# Interaction table for the ILC3 dataset; passed as `filename` to
# ModExpressionAnalysis, which reads it as a tab-separated file.
ilc3_file = "../data/outputs/modified_chicago/ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm_modified.tsv"

# Interaction table for the CD4 dataset, used the same way.
cd4_ABC = "../data/outputs/modified_chicago/CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm_modified.tsv"

In [65]:
# Feature files to intersect with the PIRs, as {BED file path: tag}; the tag
# ("RE") becomes the name of the count column.
# NOTE(review): these are absolute, machine-specific paths while the
# interaction tables use "../data/..." -- consider making them relative to a
# configurable data directory so the notebook runs elsewhere.
ilc3_file_dict = {"/Users/caz3so/workspaces/tacazares/pchic/data/peaks/RE/ILC3_RE.bed": "RE"}

cd4_file_dict = {"/Users/caz3so/workspaces/tacazares/pchic/data/peaks/RE/CD4_RE.bed": "RE"}

In [66]:
# Gene expression tables (tab-separated, read as GeneName / Expression columns),
# one per dataset.
# NOTE(review): absolute, machine-specific paths -- consider deriving them
# from a configurable data directory.
ilc_gene_expression = "/Users/caz3so/workspaces/tacazares/pchic/data/RNA/ILC3_mean_expression.tsv"
cd4_gene_expression = "/Users/caz3so/workspaces/tacazares/pchic/data/RNA/CD4_mean_expression.tsv"

In [67]:
# Base output directory; the intersection BED files are written to a
# "PIR_intersection" folder beneath it.
out_dir = "../data/outputs"

In [68]:
# ILC3: constructing the object runs the complete analysis for this dataset
# and prints one "Importing ..." line per feature file.
ilc3 = ModExpressionAnalysis(
    ilc3_file,
    features_to_count=ilc3_file_dict,
    gene_expression=ilc_gene_expression,
    output_dir=out_dir,
    output_basename="ILC_5kb_within_newbmap_CHiCAGO_ABC_peakm",
)

# CD4: same pipeline, run on the CD4 interaction table, features and expression.
cd4 = ModExpressionAnalysis(
    cd4_ABC,
    features_to_count=cd4_file_dict,
    gene_expression=cd4_gene_expression,
    output_dir=out_dir,
    output_basename="CD4_1M_50K_5kb_within_newbmap_CHiCAGO_ABC_peakm",
)

Importing /Users/caz3so/workspaces/tacazares/pchic/data/peaks/RE/ILC3_RE.bed : Column will be saved as RE
Importing /Users/caz3so/workspaces/tacazares/pchic/data/peaks/RE/CD4_RE.bed : Column will be saved as RE


In [70]:
ilc3.gene_counts

Unnamed: 0,GeneName,Expression,RE_count,GeneName_MeanExpression
0,A1BG,474.000000,2.0,A1BG 474.0
1,A1CF,5.166667,0.0,A1CF 5.166666666666667
3,A2ML1,68.833333,1.0,A2ML1 68.83333333333333
4,A2MP1,4.833333,0.0,A2MP1 4.833333333333333
5,A4GALT,118.833333,2.0,A4GALT 118.83333333333331
...,...,...,...,...
22444,ZSWIM6,176.500000,2.0,ZSWIM6 176.5
22449,ZW10,563.500000,3.0,ZW10 563.5
22452,ZXDA,21.666667,0.0,ZXDA 21.666666666666668
22456,ZYG11B,465.500000,2.0,ZYG11B 465.5
