From 680b14d3fcdc85d9f67ba9b8d2831196c7649fd9 Mon Sep 17 00:00:00 2001 From: Seth Axen Date: Sat, 4 Nov 2017 03:19:57 -0700 Subject: [PATCH] Add documentation for fingerprints --- doc/source/usage/fingerprints/algebra.rst | 0 doc/source/usage/fingerprints/comparison.rst | 58 ++++++ doc/source/usage/fingerprints/fprints.rst | 195 +++++++++++++++++++ doc/source/usage/fingerprints/index.rst | 12 ++ doc/source/usage/fingerprints/storage.rst | 109 ++++++++++- 5 files changed, 373 insertions(+), 1 deletion(-) delete mode 100644 doc/source/usage/fingerprints/algebra.rst create mode 100644 doc/source/usage/fingerprints/fprints.rst diff --git a/doc/source/usage/fingerprints/algebra.rst b/doc/source/usage/fingerprints/algebra.rst deleted file mode 100644 index e69de29..0000000 diff --git a/doc/source/usage/fingerprints/comparison.rst b/doc/source/usage/fingerprints/comparison.rst index e69de29..3cb8800 100644 --- a/doc/source/usage/fingerprints/comparison.rst +++ b/doc/source/usage/fingerprints/comparison.rst @@ -0,0 +1,58 @@ +Fingerprint Comparison +====================== + +The `e3fp.fingerprint.metrics` sub-package provides several useful methods for +batch comparison of fingerprints in various representations. + +Fingerprint Metrics +------------------- + +These metrics operate directly on pairs of :py:class:`.Fingerprint` and +:py:class:`.FingerprintDatabase` objects or on a combination of each. If +only a single variable is specified, self-comparison is performed. The +implemented methods are common functions for fingerprint similarity in the +literature. + +.. todo:: + + Document examples + +Array Metrics +------------- + +To efficiently compare fingerprint databases above, we provide comparison +metrics that can operate directly on the internal sparse matrix representation +without the need to "densify it". We describe these here, as they have several +additional features. 
+ +The array metrics implemented in `e3fp.fingerprint.metrics.array_metrics` are +implemented such that they may take any combination of dense and sparse inputs. +Additionally, they are designed to function as +`scikit-learn-compatible kernels <https://scikit-learn.org/stable/modules/svm.html#svm-kernels>`_ +for machine learning tasks. For example, one might perform an analysis using a +support vector machine (SVM) and Tanimoto kernel. + +.. code:: python + + >>> from sklearn.svm import SVC + >>> from e3fp.fingerprint.metrics.array_metrics import tanimoto + >>> clf = SVC(kernel=tanimoto) + >>> clf.fit(X, y) + ... + >>> clf.predict(test) + ... + +Most common fingerprint comparison metrics only apply to binary fingerprints. +We include several that operate equally well on count- and float-based +fingerprints. For example, to our knowledge, we provide the only open source +implementation of Soergel similarity, the analog to the Tanimoto coefficient +for non-binary fingerprints that can efficiently operate on sparse inputs. + +.. code:: python + + >>> from e3fp.fingerprint.metrics.array_metrics import soergel + >>> clf = SVC(kernel=soergel) + >>> clf.fit(X, y) + ... + >>> clf.predict(test) + ... diff --git a/doc/source/usage/fingerprints/fprints.rst b/doc/source/usage/fingerprints/fprints.rst new file mode 100644 index 0000000..07cb2c6 --- /dev/null +++ b/doc/source/usage/fingerprints/fprints.rst @@ -0,0 +1,195 @@ +Fingerprints +============ + +The simplest interface for molecular fingerprints is through three classes in +`e3fp.fingerprint.fprint`: + +:py:class:`.Fingerprint` + a fingerprint with "on" bits + +:py:class:`.CountFingerprint` + a fingerprint with counts for each "on" bit + +:py:class:`.FloatFingerprint` + a fingerprint with float values for each "on" bit, generated for example by + averaging conformer fingerprints. + +In addition to storing "on" indices and, for the latter two, corresponding +values, they store fingerprint properties, such as name, level, and any +arbitrary property.
They also provide simple interfaces for fingerprint +conversion, some basic processing, and comparison. + +.. note:: Many of these operations are more efficient when operating on a + :py:class:`.FingerprintDatabase`. See :ref:`Fingerprint Storage` for more + information. + +In the below examples, we will focus on :py:class:`.Fingerprint` and +:py:class:`.CountFingerprint`. First, we execute the necessary imports. + +.. testsetup:: + + import numpy as np + np.random.seed(0) + +.. doctest:: + + >>> from e3fp.fingerprint.fprint import Fingerprint, CountFingerprint + >>> import numpy as np + +.. seealso:: + + :ref:`Fingerprint Storage`, :ref:`Fingerprint Comparison` + +Creation and Conversion +----------------------- + +Here we create a bit-fingerprint with random "on" indices. + + >>> bits = 2**32 + >>> indices = np.sort(np.random.randint(0, bits, 30)) + >>> indices + array([ 243580376, 305097549, ..., 3975407269, 4138900056]) + >>> fp1 = Fingerprint(indices, bits=bits, level=0) + >>> fp1 + Fingerprint(indices=array([243580376, ..., 4138900056]), level=0, bits=4294967296, name=None) + +This fingerprint is extremely sparse. + + >>> fp1.bit_count + 30 + >>> fp1.density + 6.984919309616089e-09 + +We can therefore "fold" the fingerprint through a series of bitwise "OR" +operations on halves of the sparse vector until it is of a specified length, +with minimal collision of bits. + + >>> fp_folded = fp1.fold(1024) + >>> fp_folded + Fingerprint(indices=array([9, 70, ..., 845, 849]), level=0, bits=1024, name=None) + >>> fp_folded.bit_count + 29 + >>> fp_folded.density + 0.0283203125 + +A :py:class:`.CountFingerprint` may be created by also providing a dictionary +matching indices with nonzero counts to the counts.
+ + >>> indices2 = np.sort(np.random.randint(0, bits, 60)) + >>> counts = dict(zip(indices2, np.random.randint(1, 10, indices2.size))) + >>> counts + {80701568: 8, 580757632: 7, ..., 800291326: 5, 4057322111: 7} + >>> cfp1 = CountFingerprint(counts=counts, bits=bits, level=0) + >>> cfp1 + CountFingerprint(counts={80701568: 8, 580757632: 7, ..., 3342157822: 2, 4057322111: 7}, level=0, bits=4294967296, name=None) + +Unlike folding a bit fingerprint, by default, folding a count fingerprint +performs a "SUM" operation on colliding counts. + + >>> cfp1.bit_count + 60 + >>> cfp_folded = cfp1.fold(1024) + >>> cfp_folded + CountFingerprint(counts={128: 15, 257: 4, ..., 1022: 2, 639: 7}, level=0, bits=1024, name=None) + >>> cfp_folded.bit_count + 57 + +It is trivial to interconvert the fingerprints. + + >>> cfp_folded2 = CountFingerprint.from_fingerprint(fp_folded) + >>> cfp_folded2 + CountFingerprint(counts={9: 1, 87: 1, ..., 629: 1, 763: 1}, level=0, bits=1024, name=None) + >>> cfp_folded2.indices[:5] + array([ 9, 70, 72, 87, 174]) + >>> fp_folded.indices[:5] + array([ 9, 70, 72, 87, 174]) + +RDKit Morgan fingerprints (analogous to ECFP) may easily be converted to a +:py:class:`.Fingerprint`. + + >>> from rdkit import Chem + >>> from rdkit.Chem import AllChem + >>> mol = Chem.MolFromSmiles('Cc1ccccc1') + >>> mfp = AllChem.GetMorganFingerprintAsBitVect(mol, 2) + >>> mfp + <rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x...> + >>> Fingerprint.from_rdkit(mfp) + Fingerprint(indices=array([389, 1055, ..., 1873, 1920]), level=-1, bits=2048, name=None) + +Likewise, :py:class:`.Fingerprint` can be easily converted to a NumPy ndarray or +SciPy sparse matrix.
+ + >>> fp_folded.to_vector() + <1x1024 sparse matrix of type '<... 'numpy.bool_'>' + ...with 29 stored elements in Compressed Sparse Row format> + >>> fp_folded.to_vector(sparse=False) + array([False, False, False, ..., False, False, False], dtype=bool) + >>> np.where(fp_folded.to_vector(sparse=False))[0] + array([ 9, 70, 72, 87, ...]) + >>> cfp_folded.to_vector(sparse=False) + array([0, 0, 0, ..., 0, 2, 0], dtype=uint16) + >>> cfp_folded.to_vector(sparse=False).sum() + 252 + +Algebra +------- + +Basic algebraic functions may be performed on fingerprints. If either +fingerprint is a bit fingerprint, all algebraic functions are bit-wise. +The following bit-wise operations are supported: + +Equality + >>> fp1 = Fingerprint([0, 1, 6, 8, 12], bits=16) + >>> fp2 = Fingerprint([1, 2, 4, 8, 11, 12], bits=16) + >>> fp1 == fp2 + False + >>> fp1_copy = Fingerprint.from_fingerprint(fp1) + >>> fp1 == fp1_copy + True + >>> fp1_copy.level = 5 + >>> fp1 == fp1_copy + False + +Union/OR + >>> fp1 + fp2 + Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None) + >>> fp1 | fp2 + Fingerprint(indices=array([0, 1, 2, 4, 6, 8, 11, 12]), level=-1, bits=16, name=None) + +Intersection/AND + >>> fp1 & fp2 + Fingerprint(indices=array([1, 8, 12]), level=-1, bits=16, name=None) + +Difference/AND NOT + >>> fp1 - fp2 + Fingerprint(indices=array([0, 6]), level=-1, bits=16, name=None) + >>> fp2 - fp1 + Fingerprint(indices=array([2, 4, 11]), level=-1, bits=16, name=None) + +XOR + >>> fp1 ^ fp2 + Fingerprint(indices=array([0, 2, 4, 6, 11]), level=-1, bits=16, name=None) + +With count or float fingerprints, bit-wise operations are still possible, but +algebraic operations are applied to counts.
+ + >>> fp1 = CountFingerprint(counts={0: 3, 1: 2, 5: 1, 9: 3}, bits=16) + >>> fp2 = CountFingerprint(counts={1: 2, 5: 2, 7: 3, 10: 7}, bits=16) + >>> fp1 + fp2 + CountFingerprint(counts={0: 3, 1: 4, 5: 3, 7: 3, 9: 3, 10: 7}, level=-1, bits=16, name=None) + >>> fp1 - fp2 + CountFingerprint(counts={0: 3, 1: 0, 5: -1, 7: -3, 9: 3, 10: -7}, level=-1, bits=16, name=None) + >>> fp1 * 3 + CountFingerprint(counts={0: 9, 1: 6, 5: 3, 9: 9}, level=-1, bits=16, name=None) + >>> fp1 / 2 + FloatFingerprint(counts={0: 1.5, 1: 1.0, 5: 0.5, 9: 1.5}, level=-1, bits=16, name=None) + +Finally, fingerprints may be batch added and averaged, producing either a count +or float fingerprint when sensible. + + >>> from e3fp.fingerprint.fprint import add, mean + >>> fps = [Fingerprint(np.random.randint(0, 32, 8), bits=32) for i in range(100)] + >>> add(fps) + CountFingerprint(counts={0: 23, 1: 23, ..., 30: 20, 31: 14}, level=-1, bits=32, name=None) + >>> mean(fps) + FloatFingerprint(counts={0: 0.23, 1: 0.23, ..., 30: 0.2, 31: 0.14}, level=-1, bits=32, name=None) diff --git a/doc/source/usage/fingerprints/index.rst b/doc/source/usage/fingerprints/index.rst index e69de29..53dc10f 100644 --- a/doc/source/usage/fingerprints/index.rst +++ b/doc/source/usage/fingerprints/index.rst @@ -0,0 +1,12 @@ +Using Fingerprints +================== + +While molecular fingerprints are widely used, few packages provide simple +interfaces for working with them and interfacing with machine learning +packages. E3FP provides a number of general utility classes and methods for +doing precisely this. + +.. 
toctree:: + fprints + storage + comparison diff --git a/doc/source/usage/fingerprints/storage.rst b/doc/source/usage/fingerprints/storage.rst index cd4cafe..8a13b39 100644 --- a/doc/source/usage/fingerprints/storage.rst +++ b/doc/source/usage/fingerprints/storage.rst @@ -1,2 +1,109 @@ Fingerprint Storage -=================== \ No newline at end of file +=================== + +The most efficient way to store and interact with fingerprints is through the +`e3fp.fingerprint.db.FingerprintDatabase` class. This class wraps a matrix with +sparse rows (`scipy.sparse.csr_matrix`), where each row is a fingerprint. This +enables rapid I/O of the database while also minimizing the memory footprint. +Accessing the underlying sparse representation with the +:py:attr:`.FingerprintDatabase.array` attribute is convenient for machine learning +purposes, while the database class itself provides several useful functions. + +.. note:: + + We strongly recommend upgrading to at least SciPy v1.0.0 when working with + large fingerprint databases, as old versions are much slower and have + several bugs for database loading. + + +Database I/O and Indexing +------------------------- + +See the full `e3fp.fingerprint.db.FingerprintDatabase` documentation for a +description of basic database usage, attributes, and methods. Below, several +additional use cases are documented. + +Batch Database Operations +------------------------- + +Due to the sparse representation of the underlying data structure, an +unfolded database (a database with unfolded fingerprints) does not use +significantly more disk space than a database with folded fingerprints. However, +it is usually necessary to fold fingerprints for machine learning tasks. The +:py:class:`.FingerprintDatabase` does this very quickly. + +.. testsetup:: + + import numpy as np + np.random.seed(3) + +..
doctest:: + + >>> from e3fp.fingerprint.db import FingerprintDatabase + >>> from e3fp.fingerprint.fprint import Fingerprint + >>> import numpy as np + >>> db = FingerprintDatabase(fp_type=Fingerprint, name="TestDB") + >>> print(db) + FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: None, fp_num: 0] + >>> on_inds = [np.random.uniform(0, 2**32, size=30) for i in range(5)] + >>> fps = [Fingerprint(x, bits=2**32) for x in on_inds] + >>> db.add_fingerprints(fps) + >>> print(db) + FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5] + >>> db.get_density() + 6.984919309616089e-09 + >>> fold_db = db.fold(1024) + >>> print(fold_db) + FingerprintDatabase[name: TestDB, fp_type: Fingerprint, level: -1, bits: 1024, fp_num: 5] + >>> fold_db.get_density() + 0.0287109375 + +A database can be converted to a different fingerprint type: + + >>> from e3fp.fingerprint.fprint import CountFingerprint + >>> count_db = db.as_type(CountFingerprint) + >>> print(count_db) + FingerprintDatabase[name: TestDB, fp_type: CountFingerprint, level: -1, bits: 4294967296, fp_num: 5] + >>> count_db[0] + CountFingerprint(counts={2977004690: 1, ..., 3041471738: 1}, level=-1, bits=4294967296, name=None) + +The `e3fp.fingerprint.db.concat` method allows efficient joining of multiple +databases. + + >>> from e3fp.fingerprint.db import concat + >>> dbs = [] + >>> for i in range(10): + ... db = FingerprintDatabase(fp_type=Fingerprint) + ... on_inds = [np.random.uniform(0, 1024, size=30) for j in range(5)] + ... fps = [Fingerprint(x, bits=2**32, name="Mol{}".format(i)) for x in on_inds] + ... db.add_fingerprints(fps) + ... 
dbs.append(db) + >>> dbs[0][0] + Fingerprint(indices=array([94, 97, ..., 988, 994]), level=-1, bits=4294967296, name=Mol0) + >>> print(dbs[0]) + FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 5] + >>> merge_db = concat(dbs) + >>> print(merge_db) + FingerprintDatabase[name: None, fp_type: Fingerprint, level: -1, bits: 4294967296, fp_num: 50] + +Database Comparison +------------------- + +Two databases may be compared using various metrics in +`e3fp.fingerprint.metrics`. Additionally, all fingerprints in a database may be +compared to each other simply by providing only a single database. +See :ref:`Fingerprint Comparison` for more details. + +Performing Machine Learning on the Database +------------------------------------------- + +The underlying sparse matrix may be passed directly to machine learning tools +in any package that is compatible with SciPy sparse matrices, such as +`scikit-learn <https://scikit-learn.org/stable/>`_. + + >>> from sklearn.naive_bayes import BernoulliNB + >>> clf = BernoulliNB() + >>> clf.fit(db.array, ypred) # doctest: +SKIP + BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) + >>> clf.predict(db2.array) # doctest: +SKIP + ...