## Label distribution (kdeplot)
___

In [None]:
import pandas as pd
import numpy as np

# Input CSV files, one per species. The order has to match LABELS, because
# the plotting cell pairs the loaded frames with LABELS positionally.
PATHS = [
    'data/preprocessing/maoa_human_property_ic50.csv',
    'data/preprocessing/maoa_rat_property_ic50.csv',
]

LABELS = ['Human', 'Rat'] # for plot legends

# Load every file into its own DataFrame, keeping the order of PATHS.
dfs = list(map(pd.read_csv, PATHS))

In [None]:
# Turn each raw ChEMBL table in `dfs` into a table with a pIC50 column 'y'
# (pIC50 = -log10 of the IC50 expressed in mol/L) and drop extreme outliers.
NM_PER_M = 10**9        # 1 M = 1e9 nM
OUTLIER_N_STD = 10      # rows further than this many std devs from the mean are dropped

dfs_preprocessed = []
for df in dfs:
    # Some preprocessing for ChEMBL data
    keep = df['Standard Relation'] == "'='"     # only exact IC50 values
    keep &= df['Standard Units'] == "nM"        # only values in nM
    # log10 is only finite for strictly positive values; a single 0 would turn
    # the mean into inf and silently disable the outlier filter below.
    # Missing values compare False here and are dropped as well.
    keep &= df['Standard Value'] > 0

    # Work on an explicit copy so the column assignments below never write
    # into a slice of the original frame.
    df = df.loc[keep].copy()

    df['Standard Value'] = df['Standard Value'] / NM_PER_M     # convert nM -> M
    df['y'] = -np.log10(df['Standard Value'])

    # Remove extreme outliers (> 10*std)
    mu = df['y'].mean()
    std = df['y'].std()
    df['is_outlier'] = (
        (df['y'] > mu + OUTLIER_N_STD * std)
        | (df['y'] < mu - OUTLIER_N_STD * std)
    )
    df = df[~df['is_outlier']]    # remove outliers

    dfs_preprocessed.append(df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

X_AXIS_LABEL = 'pIC50'
PLOT_TITLE = 'Distribution of MAO-A pIC50'

sns.set_style('white')
sns.set_context('talk')

# Draw one filled density curve per preprocessed dataset on the same axes;
# the matching entry of LABELS becomes its legend entry.
for frame, legend_name in zip(dfs_preprocessed, LABELS):
    sns.kdeplot(frame['y'], label=legend_name, fill=True)

plt.title(PLOT_TITLE)
plt.xlabel(X_AXIS_LABEL)
# The y axis is intentionally left bare: no label and no ticks.
plt.ylabel('')
plt.yticks([])
plt.legend()

## Units distribution in ChEMBL data (barplot)
___

In [None]:
import pandas as pd
import pathlib

# CSV whose 'Standard Units' column is summarised in the following cells.
PATH = 'data/preprocessing/liver_human_liver_property_t1.2.csv'
df = pd.read_csv(filepath_or_buffer=PATH)

In [None]:
# Count how often each unit occurs and fold rare units into a single 'Other'
# bucket, producing an ordered {unit: count} mapping (largest count first).
# Note: value_counts() skips missing values by default, so rows without a
# unit are not part of `total`.
MIN_UNIT_FRACTION = 0.01    # units below this share of the total go into 'Other'

unit_counts = df['Standard Units'].value_counts().to_dict()
total = sum(unit_counts.values())

# if unit corresponds to less than 1% of total, group it into 'Other'
units = {
    unit: count
    for unit, count in unit_counts.items()
    if count / total >= MIN_UNIT_FRACTION
}
grouped = total - sum(units.values())
if grouped > 0:
    # Only add the bucket when something was actually grouped (avoids an
    # empty bar), and add to - rather than overwrite - a unit that is
    # literally called 'Other' in the data.
    units['Other'] = units.get('Other', 0) + grouped

units = dict(sorted(units.items(), key=lambda item: item[1], reverse=True))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): the title says MAO-A, but the file loaded for this section
# (see PATH) looks like a liver dataset - confirm the title is intended.
PLOT_TITLE = 'Human MAO-A'

plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")
sns.set_context("talk")
sns.barplot(data=units, palette="viridis")

# Write each count just above its bar; bars are addressed by their position
# in `units`, and the label sits 3 data units above the bar top.
for bar_position, count in enumerate(units.values()):
    plt.text(
        x=bar_position,
        y=count + 3,
        s=str(count),
        color='black',
        ha='center',
        fontsize=14,
    )
plt.title(PLOT_TITLE)

## Min Tanimoto distance distribution (histplot)
___

In [None]:
import pandas as pd

# PATH_1 -> df1 (rat file), PATH_2 -> df2 (human file).
PATH_1 = 'data/preprocessing/maoa_rat_property_ic50.csv'
PATH_2 = 'data/preprocessing/maoa_human_property_ic50.csv'

df1, df2 = (pd.read_csv(csv_path) for csv_path in (PATH_1, PATH_2))

In [None]:
from src.data.tanimoto import TanimotoCalculator
from src.data.featurizer import EcfpFeaturizer

# df2 supplies the SMILES handed to the calculator at construction time,
# df1 supplies the SMILES passed to run_batch.
reference_smiles = df2['smiles'].tolist()
query_smiles = df1['smiles'].tolist()

tc = TanimotoCalculator(
    featurizer=EcfpFeaturizer(n_bits=1024),
    smiles_list=reference_smiles,
    return_closest_smiles=True,
)
results = tc.run_batch(query_smiles)

# minimal tanimoto distance between each of the molecules in df1 and the whole df2 dataset
# (as described by the original author; the calculator itself is defined in src.data.tanimoto)
distances = results['min_distance']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

PLOT_TITLE = 'MAO-A rat'
X_AXIS_LABEL = 'min. Tanimoto dist to human data'

sns.set_style("white")
sns.set_context("talk")

# Histogram of `distances`; histplot returns the Axes it drew on, which is
# then decorated directly.
ax = sns.histplot(x=distances)

ax.set_title(PLOT_TITLE)
ax.set_xlabel(X_AXIS_LABEL)

# The y axis is intentionally left bare: no label and no ticks.
ax.set_ylabel('')
ax.set_yticks([])