# BIRCH - Binding Site Search

### Stages of core algorithm

1. Peak search 
2. Generate theoretical binding sites
3. Match peaks to list
4. Filter list using rules

In [1]:
import pandas as pd
import numpy as np 
from scipy.signal import find_peaks 
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from os import listdir
from itertools import *

## Config Parameters

In [2]:
# Tolerance, expressed in daltons (Da).
tolerance = 0.5

# Translation table turning ASCII digits into Unicode subscript digits,
# for formatting chemistry strings (e.g. "H2O".translate(sub)).
sub = {
    ord(digit): ord(subscript)
    for digit, subscript in zip("0123456789", "₀₁₂₃₄₅₆₇₈₉")
}

## Import Data

In [3]:
input_data_path = "..//Data//"
compound_list_path = input_data_path + "Compound Constraints//"
spectra_path = input_data_path + "Deconvoluted Spectra//"

#read in data
#os.listdir returns entries in arbitrary order, so the listings are sorted to
#make the positional file selection below reproducible across machines and runs
spectra_files = sorted(listdir(spectra_path))
#first spectrum file (sorted by name) is loaded as the bound spectrum, second as the unbound one
bound_df = pd.read_excel(spectra_path+spectra_files[0])
unbound_df = pd.read_excel(spectra_path+spectra_files[1])

#second file (sorted by name) in the constraints folder is used as the compound table
compounds_files = sorted(listdir(compound_list_path))
compounds_df = pd.read_excel(compound_list_path+compounds_files[1])

#echo the selected file names so the recorded output shows what was loaded
print("bound: ", spectra_files[0])
print("compounds: ", compounds_files[1])

bound:  Ubiquitin_plusC_1in100_000001.xlsx
compounds:  Compounds_CisOxTrans.xlsx


## Create Binding List

Purpose: Create a theoretical list of the possible combinations of drug bindings.

Define chemical combinations as trees where:
<ol>
    <li> The root node is the primary reactant </li>
    <li> The nodes on the 1st level are the permutations of the secondary reactant as given by the min/max numbers </li>
    <li> The nodes on each level after that are the permutations of the other reactants as given by the min/max numbers </li>
</ol>

In [18]:
#create forest of trees
#columns of the compound table that are stored on every node
attributes_list = ["Mass", "Min", "Max"]

#split into the 3 different types of reactants
primary_reactant_df = compounds_df[compounds_df["Primary"] == "Yes"]
secondary_reactant_df = compounds_df[compounds_df["Secondary"] == "Yes"]
other_reactants_df = compounds_df[(compounds_df["Primary"] != "Yes") & (compounds_df["Secondary"] != "Yes")]

#fail early with a clear message instead of an IndexError on .values[0] below
if primary_reactant_df.empty:
    raise ValueError('compound table has no row with Primary == "Yes"')

#the number of trees in the forest corresponds to the availability (max-min+1) of the primary reactant
number_of_trees = primary_reactant_df["Max"] - primary_reactant_df["Min"] + 1
#only the first primary-reactant row is used
tree_count = int(number_of_trees.values[0])

#start from an empty forest: pre-filling it with placeholder lists and then
#appending the roots left 2*tree_count entries, half of them empty
forest = []

#root nodes - primary reactant
#each tree is a dict holding its root, keyed by "<count><name>" (e.g. "1X", "2X", ...)
#NOTE(review): every root stores the same unscaled attribute rows; confirm whether
#the mass should be multiplied by the count encoded in the key
primary_name = primary_reactant_df["Name"].values[0]
for tree_idx in range(1, tree_count + 1):
    root_id = str(tree_idx) + primary_name
    forest.append({root_id: primary_reactant_df[attributes_list]})

#secondary reactant
for tree_idx in range(1, )