# Had to clear all output for file size reasons.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import subprocess
import os
import random
import math
import time
import plotly.express as px

import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdmolops
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.ML.Cluster import Butina

# shut off warnings
from rdkit import RDLogger                                                                                                                                                               
RDLogger.DisableLog('rdApp.*')  

# Global matplotlib text settings, applied to every figure created after this cell.
import matplotlib
from matplotlib import rc  # NOTE(review): `rc` is not referenced by name in the visible cells (calls go through `matplotlib.rc`) - confirm before removing
# Keyword arguments for the 'font' rc group: set the font size to 8.
font = {'size'   : 8}
matplotlib.rc('font', **font)

# change font
# The sans-serif list is set to Arial and the active family is switched to sans-serif.
matplotlib.rcParams['font.sans-serif'] = "Arial"
matplotlib.rcParams['font.family'] = "sans-serif"

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score

# Part 1: Define functions to make PCA & tSNE

In [None]:
def pca_tsne_mols(fp_list, fp_labels, colors_for_vis, num_components = 2048):
    """Plot a 2-component PCA and a 2-D t-SNE embedding of fingerprints.

    Two scatter plots are drawn (PCA first, t-SNE second) and shown with a
    single ``plt.show()`` call. The fraction of variance captured by the
    first two principal components is printed.

    Args:
        fp_list: Sequence of fingerprints, one per molecule. It is converted
            with ``np.asarray`` and handed to scikit-learn, so it must form
            a 2-D (n_samples, n_features) array.
        fp_labels: One label per fingerprint, in the same order as
            ``fp_list``; used as the ``hue`` of both scatter plots.
        colors_for_vis: Palette passed to seaborn for both plots.
        num_components: Number of principal components kept before t-SNE.
            An integer larger than ``min(n_samples, n_features)`` is clamped
            to that bound, because PCA rejects larger values. Non-integer
            values are passed to ``PCA`` unchanged.

    Returns:
        DataFrame with the t-SNE coordinates in columns ``X`` and ``Y`` and
        the labels in column ``label``.
    """
    # Convert once so both PCA fits reuse the same array instead of
    # re-converting the fingerprint list on every fit.
    fp_array = np.asarray(fp_list)

    # PCA first
    pca_2d = PCA(n_components=2)
    pca_coords = pca_2d.fit_transform(fp_array)

    print('variance explained by pc1+pc2: ' + str(np.sum(pca_2d.explained_variance_ratio_)))

    crds_df = pd.DataFrame(pca_coords, columns=["PC_1", "PC_2"])
    crds_df['label'] = fp_labels

    plt.figure(figsize=(5,5), dpi = 300)
    sns.scatterplot(data=crds_df, x="PC_1", y="PC_2", hue="label", alpha = 0.7, s = 10, palette=colors_for_vis)

    # TSNE next
    # Reduce dimensionality with PCA before t-SNE. The requested component
    # count cannot exceed min(n_samples, n_features), so clamp an integer
    # request; small inputs would otherwise make PCA raise.
    n_reduce = num_components
    if isinstance(n_reduce, (int, np.integer)) and fp_array.ndim == 2:
        n_reduce = min(n_reduce, min(fp_array.shape))

    pca_reduce = PCA(n_components=n_reduce)
    reduced = pca_reduce.fit_transform(fp_array)

    crds_embedded = TSNE(n_components=2).fit_transform(reduced)

    tsne_df = pd.DataFrame(crds_embedded, columns=["X", "Y"])
    tsne_df['label'] = fp_labels

    plt.figure(figsize=(5,5), dpi = 300)
    sns.scatterplot(data=tsne_df, x="X", y="Y", hue="label", alpha = 0.7, s = 10, palette=colors_for_vis)
    # Legend is placed outside the axes, to the upper right of the t-SNE plot.
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.show()

    return tsne_df

# Part 2: Read in and process all possible datasets for comparisons

In [None]:
def _parse_and_fingerprint(smiles, *columns):
    """Parse SMILES strings and fingerprint the ones RDKit accepts.

    Args:
        smiles: Iterable of SMILES strings.
        *columns: Optional iterables aligned with ``smiles`` (names,
            classes, ...) that are filtered together with it.

    Returns:
        A list of lists ``[smiles, mols, fingerprints, *columns]``. Every
        inner list has one entry per molecule that parsed successfully, in
        input order, so all of them stay index-aligned.
    """
    kept = [[] for _ in range(3 + len(columns))]
    for smi, *extras in zip(smiles, *columns):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparsable SMILES: drop the whole row so that SMILES, names
            # and fingerprints cannot drift out of step.
            continue
        row = (smi, mol, Chem.RDKFingerprint(mol), *extras)
        for bucket, value in zip(kept, row):
            bucket.append(value)
    return kept


def _is_smiles_string(column):
    """Return a boolean mask that is True where ``column`` holds a string."""
    return column.map(lambda value: isinstance(value, str)).astype(bool)


# take in ians hits spreadsheet (Excel) - - ONLY ORIGINAL HITS NOT BROAD800K
df = pd.read_excel('../data/TrainingDataRound1_wValidation.xlsx')
# Missing SMILES cells are not strings; drop them before parsing.
df = df[_is_smiles_string(df['SMILES'])]
print('num drh drugs: ' + str(len(df)))
df = df[['SMILES', 'Name']]

# drh_smiles / drh_mols / drh_fps / drh_names only contain molecules that
# RDKit could parse, so they are index-aligned with each other.
drh_smiles, drh_mols, drh_fps, drh_names = _parse_and_fingerprint(df['SMILES'], df['Name'])

# get RDKit fingerprints (Chem.RDKFingerprint) for abx
abx = pd.read_csv('../data/curated_set_of_566_abx.csv')
abx = abx[_is_smiles_string(abx['Smiles'])]
abx_smiles, mols, abx_fps, abx_names, abx_classes = _parse_and_fingerprint(
    abx['Smiles'], abx['Name'], abx['Class-Annotated']
)

# read in all broad - takes a while due to all 800K
broad = pd.read_csv('../data/broad800k.csv') # the clean one of just smiles
broad_smiles, broad_mols, broad_fps = _parse_and_fingerprint(
    broad.loc[_is_smiles_string(broad['smiles']), 'smiles']
)
broad

In [None]:
# Stack the three fingerprint sets in a fixed order: Broad, known antibiotics, DRH.
fp_list = [*broad_fps, *abx_fps, *drh_fps]

# One label per fingerprint, repeated to match the order used for fp_list.
fp_labels = (
    ['Broad Institute 800K'] * len(broad_fps)
    + ['Known Antibiotics'] * len(abx_fps)
    + ['DRH'] * len(drh_fps)
)

# Palette handed to the plotting helper.
# NOTE(review): presumably matched to the labels in order of first appearance - confirm.
colors_for_vis = ['lightgray', 'green', 'steelblue']
tsne_df = pca_tsne_mols(fp_list, fp_labels, colors_for_vis)

# Part 3: Get additional information for interactive display

In [None]:
# Display names, in the same dataset order used for the fingerprints.
# Broad compounds are labelled with their SMILES string.
name_list = [*broad_smiles, *abx_names, *drh_names]

# SMILES strings in the same order.
smis_list = [*broad_smiles, *abx_smiles, *drh_smiles]

# Attach both columns to the t-SNE frame itself (tsne_df is modified in place).
tsne_df['Name'] = name_list
tsne_df['SMILES'] = smis_list

# Attach the annotated antibiotic class by SMILES. Duplicated SMILES are
# removed from the lookup table before the left merge, and every missing
# value in the merged frame is shown as 'N/A'.
abx = abx[['Smiles', 'Class-Annotated']].drop_duplicates('Smiles')
testdf = tsne_df.merge(abx, left_on = 'SMILES', right_on = 'Smiles', how = 'left').fillna('N/A')

In [None]:
# Build the interactive scatter plot; every listed column is shown on hover.
hover_labels = ['X', 'Y', 'label', 'Name', 'SMILES', 'Class-Annotated']
hover_bools = [True] * len(hover_labels)
hover_spec = {column: shown for column, shown in zip(hover_labels, hover_bools)}

fig = px.scatter(
    testdf,
    x = 'X',
    y = 'Y',
    opacity = 0.75,
    color = testdf['label'],
    hover_data = hover_spec,
    color_discrete_sequence = colors_for_vis,
)
# Fully transparent plot and paper backgrounds.
fig.update_layout(
    plot_bgcolor = 'rgba(0, 0, 0, 0)',
    paper_bgcolor = 'rgba(0, 0, 0, 0)',
)
fig.show()

In [None]:
# and save it
csv_path = '../out/drh_vs_broad800k_tsne_data_plus_annotated_mechanism_plus_val_ML_mols.csv'
png_path = '../figure_panels/S2A_drh_vs_broad800k_tsne_data_plus_annotated_mechanism_plus_val_ML_mols.png'

# Create the output folders first so a fresh checkout does not fail on a
# missing directory after the long t-SNE computation.
for out_path in (csv_path, png_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

testdf.to_csv(csv_path, index = False)
fig.write_image(png_path) # had to conda install -c conda-forge python-kaleido
testdf

In [None]:
# NOTE: my edits changing the colors and the display order were not saved in this notebook.
# To reproduce them: make a dummy column called dummy_label where 0 = broad800k, 1 = drh, 2 = abx, using a list comprehension with chained conditionals (https://stackoverflow.com/questions/9987483/elif-in-list-comprehension-conditionals),
# sort on that column, and then display testdf as above.