### Import Modules

In [1]:
# Jupyter Display
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Widen the notebook container to 85% of the browser window.
display(HTML("<style>.container {width:85% !important;} </style>"))

# I/O tools
import os,sys
import gzip
import csv

# Standard Python Tools
import operator

# Custom functions
from utils import map_target_identifiers, flatten_list, view_target_dist

# Data handling modules
import numpy as np
import pandas as pd

# Sklearn Modules
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# Chemical Handling Modules
from rdkit import Chem
from rdkit.Chem import AllChem

# Visualization Modules
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns

### Default Values

In [2]:
# Python Pandas print options.
# Fully-qualified option names are used on purpose: abbreviated keys are resolved by
# pattern matching and raise OptionError as soon as they match more than one option
# (e.g. 'precision' can match both display.precision and styler.format.precision
# in recent pandas releases).
pd.set_option('display.width', 1000)
pd.set_option('display.precision', 3)
pd.set_option('display.max_rows', 15)
pd.set_option('display.large_repr', 'truncate')
pd.set_option('display.max_colwidth', 40)
pd.set_option('display.colheader_justify', 'left')


# Default Directories: input files are looked up in the current working directory.
BASE_DIR = os.getcwd()

# Default Files
CANCER_CPDS_F = os.path.join(BASE_DIR, 'cancer_compounds.sample.csv')
CHEMBL_MOLS_F = os.path.join(BASE_DIR, 'chembl_21_binding_molecules.csv.gz')
CHEMBL_TARGS_F = os.path.join(BASE_DIR, 'chembl_21_binding_targets.csv.gz')

### Primary Functions

In [3]:
def gen_compound_bitstring(smile, radius=2, nBits=1024):
    """Convert a SMILES string into a Morgan-fingerprint bit-string.

    Parameters
    ----------
    smile : str
        SMILES representation of the compound.
    radius : int
        Morgan fingerprint radius passed to RDKit.
        NOTE(review): 2 is assumed here; it must match the radius used to build the
        pre-computed training fingerprints -- confirm against the script that
        generated the ChEMBL molecule file.
    nBits : int
        Length of the folded fingerprint. Must equal the length of the training
        bit-vectors so the classifier sees the same number of features.

    Returns
    -------
    str or None
        String of '0'/'1' characters of length ``nBits``, or None when RDKit cannot
        parse ``smile``.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
        # MolFromSmiles signals an unparsable SMILES by returning None; pass that on
        # so the caller can skip the compound.
        return None
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    bstr = fp.ToBitString()
    return bstr

In [4]:
#### THIS FUNCTION DOES NOT NEED TO BE ALTERED  ####
def predict_top_targets_for_compounds(cpd_f, classifier, targID_to_targName, ntop=10, verbose=True):
    """Predict the top N targets for every compound in a CSV file and write them to disk.

    Parameters
    ----------
    cpd_f : str
        Path to a CSV file with a header row followed by rows of
        (compound id, SMILES, drug name).
    classifier : fitted classifier
        Object exposing ``predict_proba`` and ``classes_``.
    targID_to_targName : mapping
        Maps each class label in ``classifier.classes_`` to a printable target name.
    ntop : int
        Number of highest-probability targets written per compound.
    verbose : bool
        When True, print the top targets of each compound as they are computed.

    Returns
    -------
    str
        Path of the results file, written next to ``cpd_f`` as
        ``<base name>.predictions.csv``.
    """
    base_dir = os.path.dirname(cpd_f)
    base_name = os.path.basename(cpd_f).split('.csv')[0]
    res_ofn = os.path.join(base_dir, base_name+'.predictions.csv')
    header = ['Zinc_ID', 'Targ_ID', 'Targ_Class', 'Probability_Estimate', 'Drug_Name', 'Smile']

    # Initiate csv reader and writer. newline='' is what the csv module documents for
    # file objects handed to csv.reader / csv.writer (avoids blank rows on Windows).
    with open(cpd_f, 'r', newline='') as fi, open(res_ofn, 'w', newline='') as fo:
        reader = csv.reader(fi)
        next(reader, None)  # skip the header row (reader.next() only exists in Python 2)
        writer = csv.writer(fo)
        writer.writerow(header)

        # Iterate through cpd_f, generate compound bit-vectors
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of failing to unpack
                continue
            cpid, smile, drugname = row
            bstr = gen_compound_bitstring(smile)
            if bstr is None:
                continue
            if isinstance(bstr, str):
                # np.frombuffer needs a bytes-like object under Python 3
                bstr = bstr.encode('ascii')
            # ASCII '0'/'1' are bytes 48/49, so subtracting 48 yields an array of 0s and 1s
            temp = np.frombuffer(bstr, 'i1') - 48
            bitVec = temp.reshape(1, -1)

            # Generate array of probabilities for bitVec belonging to each target class
            probabilities = classifier.predict_proba(bitVec)
            target_probs = zip(classifier.classes_, probabilities[0])

            # Sort by most relevant targets, and write top N targets associated with bitVec to file
            top_targs = sorted(target_probs, key=operator.itemgetter(1), reverse=True)[:ntop]
            if verbose:
                toprint = [(targID_to_targName[targID], prob) for targID, prob in top_targs]
                print('{}: '.format(cpid), toprint)
                print('')
            for targID, prob in top_targs:
                writer.writerow([cpid, targID, targID_to_targName[targID], prob, drugname, smile])
    return res_ofn

In [5]:
def gen_naive_bayes_classifier(training_data, training_classes, Bernoulli=True):
    """Create and fit a Naive Bayes classifier.

    Parameters
    ----------
    training_data : array-like
        One feature vector per training example (here: compound bit-vectors).
    training_classes : array-like
        Class label for each row of ``training_data``.
    Bernoulli : bool
        True selects ``BernoulliNB`` (multivariate Bernoulli model, the natural fit
        for binary bit-vectors); False selects ``MultinomialNB``.

    Returns
    -------
    The fitted classifier instance.
    """
    if Bernoulli:
        print('Initiating Bernoulli Naive Bayesian Classifier')
        classifier = BernoulliNB()
    else:
        print('Initiating Multinomial Naive Bayesian Classifier')
        classifier = MultinomialNB()
    # Fitting is identical for both models, so it is done once after the selection.
    print('\tFitting classifier to training data...')
    classifier.fit(training_data, training_classes)
    print('\tClassifier training complete!')
    print('')
    return classifier

In [20]:
#### THIS FUNCTION DOES NOT NEED TO BE ALTERED  ####
def format_training_data_for_naive_bayes(chembl_mol_f, chembl_targ_f):
    """Build the (x, y) training examples for the Naive Bayes classifier.

    Parameters
    ----------
    chembl_mol_f : str
        Path to a gzipped CSV with a header row followed by rows of
        (compound id, SMILES, fingerprint), the fingerprint being a string of
        '0'/'1' characters.
    chembl_targ_f : str
        Path to a gzipped CSV with a header row followed by rows of
        (target id, uniprot id, ':'-separated compound ids, target description).

    Returns
    -------
    (xvals, yvals) : tuple of tuples
        ``xvals[i]`` is the numpy bit-vector of a compound and ``yvals[i]`` the target
        id it is associated with. A compound listed under several targets yields one
        example per target.

    Raises
    ------
    ValueError
        If no compound listed in the target file has a fingerprint in the molecule file.
    """
    bayes_XY_fields = []
    cpid_to_bitVec = {} # cpid = compound chembl_ID

    print('Formatting training data for use with Naive Bayesian Classifier...')
    # Both files are opened up front (a missing target file fails before the molecule
    # file is parsed) and are closed by the context manager even when parsing raises.
    with gzip.open(chembl_mol_f, 'rt') as mol_fi, gzip.open(chembl_targ_f, 'rt') as targ_fi:
        # Iterate through mol_file, map each cpid to its corresponding bit-vector
        mol_reader = csv.reader(mol_fi)
        next(mol_reader) # circumvent file header
        print('\tTransforming cpd smiles to bit-vectors from file: {}'.format(os.path.basename(chembl_mol_f)))
        for cpid, smi, fp in mol_reader:
            # ASCII '0'/'1' are bytes 48/49, so subtracting 48 yields an array of 0s and 1s
            bitVec = np.frombuffer(fp.encode(), 'i1') - 48
            cpid_to_bitVec[cpid] = bitVec
        print('\t\tGenerated {} compound bit-vectors'.format(len(cpid_to_bitVec)))

        # Iterate through targ_file, get all compounds (bitVector) associated with each target, format for bayes learning.
        targ_reader = csv.reader(targ_fi)
        next(targ_reader) # circumvent file header
        print("\tMapping target classes to compound bit-vectors from file: {}".format(os.path.basename(chembl_targ_f)))
        for targID, uniprotID, cpd_assocs, targ_desc in targ_reader:
            for cpid in cpd_assocs.split(':'):
                # Compounds without a fingerprint in the molecule file are silently skipped
                if cpid in cpid_to_bitVec:
                    bayes_XY_fields.append((cpid_to_bitVec[cpid], targID))

    if not bayes_XY_fields:
        # zip(*[]) would fail with an opaque "not enough values to unpack" ValueError
        raise ValueError('No training examples could be generated: no compound in {} has a '
                         'fingerprint in {}'.format(chembl_targ_f, chembl_mol_f))

    # Return xvalues (bit-Vectors for each compound), and yvalues (class label of target each compound is associated with)
    xvals, yvals = zip(*bayes_XY_fields)
    print('\t\tGenerated {} total training examples.'.format(len(xvals)))
    print('')
    return xvals, yvals

In [7]:
def predict_targets(cancer_cpds_f, chembl_mol_f, chembl_targ_f, ntop=5, Bernoulli=True, verbose=False):
    """Train a Naive Bayes classifier on ChEMBL data, then predict and plot targets for a compound file.

    Parameters
    ----------
    cancer_cpds_f : str
        CSV file of compounds to predict targets for.
    chembl_mol_f : str
        Gzipped CSV of ChEMBL compounds with their fingerprints (training x-values).
    chembl_targ_f : str
        Gzipped CSV of ChEMBL targets with their associated compounds (training y-values).
    ntop : int
        Number of top-ranked targets kept per compound.
    Bernoulli : bool
        True trains a Bernoulli model, False a Multinomial model.
    verbose : bool
        Forwarded to the prediction step to print per-compound results.

    Returns
    -------
    str
        Path of the predictions file written by the prediction step.
    """
    chid_to_targName = map_target_identifiers(chembl_targ_f)
    training_data, training_classes = format_training_data_for_naive_bayes(chembl_mol_f, chembl_targ_f)
    classifier = gen_naive_bayes_classifier(training_data, training_classes, Bernoulli=Bernoulli)
    preds_f = predict_top_targets_for_compounds(cancer_cpds_f, classifier, chid_to_targName, ntop=ntop, verbose=verbose)
    # NOTE(review): view_target_dist lives in utils and is assumed to take the
    # predictions file path -- confirm against its definition.
    view_target_dist(preds_f, figsize=(18,12))
    return preds_f

Fix the above functions so that our handler function below will work.

**Note_1:** Not all of the functions need to be adjusted. Those that work as is have a comment above them stating that the function does not need to be altered.

**Note_2:** You do not have to alter the functions in Jupyter; it is just a convenient environment for testing. The actual script.py has a command-line interface so that it is easier for you to run the script on multiple files. Those command-line arguments (argparse) will not work in Jupyter.

**Note_3:** The Jupyter notebook keeps every variable it creates in memory unless the variable is explicitly deleted. Thus, if you define a variable and later rename it in the same cell, the original variable will STILL be there. This can cause problems if you forget to change all instances of the original name later in your script. The easiest way to avoid this is to restart the kernel, which flushes the memory. However, you will then have to re-run every cell.

**Note_4:** Ask questions!

In [None]:
# Run the full pipeline: Bernoulli model, top 5 targets per compound, per-compound results printed.
preds_f = predict_targets(CANCER_CPDS_F, CHEMBL_MOLS_F, CHEMBL_TARGS_F, ntop=5, Bernoulli=True, verbose=True)

Mapping target chembl_IDs to human-readable Names...
	Mapped 2240 chembl_IDs

Formatting training data for use with Naive Bayesian Classifier...
	Transforming cpd smiles to bit-vectors from file: chembl_21_binding_molecules.csv.gz
		Generated 334291 compound bit-vectors
	Mapping target classes to compound bit-vectors from file: chembl_21_binding_targets.csv.gz
		Generated 1372818 total training examples.

Initiating Bernoulli Naive Bayesian Classifier
	Fitting classifier to training data...
