Merge pull request #4 from hammerlab/seq-align

Integrate seq-align into topeology
hammerlab · Feb 5, 2016 · 598b8f8 · 598b8f8
2 parents 3f4b140 + da5e992
commit 598b8f8
Show file tree

Hide file tree

Showing 9 changed files with 515 additions and 101 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -11,7 +11,10 @@ before_install:
   - export PATH=/home/travis/miniconda2/bin:$PATH
 install:
   - conda update --yes conda
-  - conda install --yes python=$TRAVIS_PYTHON_VERSION numpy scipy nose pandas matplotlib scikit-bio
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION numpy scipy nose pandas matplotlib scikit-bio libgfortran
+  - git clone --recursive https://github.com/noporpoise/seq-align
+  - (cd seq-align && make)
+  - export SEQ_ALIGN_PATH=$PWD/seq-align
   - pip install .
   - pip install coveralls
   - pip install pypandoc

diff --git a/README.md b/README.md
@@ -35,12 +35,20 @@ Output looks like:
 
 ## Installation
 
-You can install topeology using [pip](https://pip.pypa.io/en/latest/quickstart.html):
+You can install topeology using [pip]:
 
 ```sh
 pip install topeology
 ```
 
+Currently, topeology use [seq-align] to quickly compare sequences, wrapped in a C extension. It will be
+installed if [seq-align] is installed; otherwise, topeology reverts to using another scorer.
+
+To install topeology with this extension:
+- Follow [seq-align]'s installation instructions, and then set `SEQ_ALIGN_PATH` to the installation
+  directory.
+- Install topeology via [pip]. If topeology is already installed, run `pip install topeology --upgrade --no-deps --force-reinstall`.
+
 ## Methodology
 
 Topeology uses Smith-Waterman alignment to align each neoepitope with each IEDB epitope of the
@@ -49,5 +57,8 @@ amino acid are considered.
 
 This software uses the following libraries for Smith-Waterman alignment:
 
-- https://github.com/noporpoise/seq-align
-- https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library
+- [seq-align]
+- [Complete-Striped-Smith-Waterman-Library](https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library)
+
+[seq-align]: https://github.com/noporpoise/seq-align
+[pip]: https://pip.pypa.io/en/latest/quickstart.html
diff --git a/pmbec_align.c b/pmbec_align.c
@@ -0,0 +1,286 @@
+#include <Python.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <ctype.h>
+#include "string_buffer/string_buffer.h"
+#include "smith_waterman.h"
+#include <zlib.h>
+#include "alignment_macros.h"
+#include "alignment_scoring.h"
+#include "alignment_scoring_load.h"
+
+// Copied from seq-align source
+void add_mutations(scoring_t* scoring, const char *str, int *scores,
+                   char use_match_mismatch) {
+  size_t i, j, len = strlen(str);
+  char a, b;
+  int score;
+
+  for (i = 0; i < len; i++) {
+    a = scoring->case_sensitive ? str[i] : tolower(str[i]);
+    for (j = 0; j < len; j++) {
+      b = scoring->case_sensitive ? str[j] : tolower(str[j]);
+      score = ARR_LOOKUP(scores, len, i, j);
+
+      scoring_add_mutation(scoring, a, b, score);
+    }
+  }
+
+  scoring->use_match_mismatch = use_match_mismatch;
+}
+
+static char AMINO_ACIDS[] = "ARNDCQEGHILKMFPSTWYVBZX*";
+static scoring_t scoring;
+static bool scoring_initialized = false;
+
+static char *
+string_from_pyobject(PyObject *obj) {
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_AsUTF8(obj);
+#else
+  return PyBytes_AsString(obj);
+#endif
+}
+
+// Copied from http://stackoverflow.com/a/13942236
+static PyObject *
+make_list(double array[], size_t size) {
+  PyObject *l = PyList_New(size);
+  size_t i;
+  for (i = 0; i != size; ++i) {
+    PyList_SET_ITEM(l, i, PyFloat_FromDouble(array[i]));
+  }
+  return l;
+}
+
+static bool is_amino_acids(char *seq) {
+  size_t len_seq = strlen(seq);
+  int i;
+  for (i = 0; i < len_seq; i++) {
+    // Is this letter is an amino acid?
+    if (memchr(AMINO_ACIDS, toupper(seq[i]), sizeof(AMINO_ACIDS)) == NULL) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+int pmbec_score_int(scoring_t *scoring, char *seq_a, char *seq_b) {
+  sw_aligner_t *sw = smith_waterman_new();
+  alignment_t *result = alignment_create(256);
+
+  smith_waterman_align(seq_a, seq_b, scoring, sw);
+
+  int max_score = INT_MIN;
+  while(smith_waterman_fetch(sw, result)) {
+    if(result->score > max_score) {
+      max_score = result->score;
+    }
+  }
+
+  smith_waterman_free(sw);
+  alignment_free(result);
+
+  return max_score;
+}
+
+static PyObject *
+pmbec_norm_score(PyObject *self, PyObject *args) {
+  if (!scoring_initialized) {
+    PyErr_SetString(PyExc_ValueError, "pmbec_init must be called before aligning sequences");
+    return NULL;
+  }
+
+  char *seq_a, *seq_b;
+  if (!PyArg_ParseTuple(args, "ss", &seq_a, &seq_b))
+    return NULL;
+
+  if (strlen(seq_a) != strlen(seq_b)) {
+    PyErr_SetString(PyExc_ValueError, "Sequence lengths must be equal");
+    return NULL;
+  }
+
+  if (!is_amino_acids(seq_a) || !is_amino_acids(seq_b)) {
+    PyErr_SetString(PyExc_ValueError, "Sequences must only comprise amino acids");
+    return NULL;
+  }
+
+  int val = pmbec_score_int(&scoring, seq_a, seq_b);
+
+  if (val < 0) {
+    return Py_None;
+  }
+
+  float val_norm = ((float) val / 100.0) / (float) strlen(seq_a);
+  return PyFloat_FromDouble(val_norm);
+}
+
+static PyObject *
+pmbec_score(PyObject *self, PyObject *args) {
+  if (!scoring_initialized) {
+    PyErr_SetString(PyExc_ValueError, "pmbec_init must be called before aligning sequences");
+    return NULL;
+  }
+
+  char *seq_a, *seq_b;
+  if (!PyArg_ParseTuple(args, "ss", &seq_a, &seq_b))
+    return NULL;
+
+  if (!is_amino_acids(seq_a) || !is_amino_acids(seq_b)) {
+    PyErr_SetString(PyExc_ValueError, "Sequences must only comprise amino acids");
+    return NULL;
+  }
+
+  int val = pmbec_score_int(&scoring, seq_a, seq_b);
+
+  if (val < 0) {
+    return Py_None;
+  }
+
+  return PyFloat_FromDouble((float) val / 100.0);
+}
+
+static PyObject *
+pmbec_score_multiple(PyObject *self, PyObject *args) {
+  PyObject *array_seq_a, *array_seq_b;
+  char **seq_a_values, **seq_b_values;
+  double *output;
+  int num_seqs_a, num_seqs_b;
+  int i;
+
+  if (!PyArg_ParseTuple(args, "O!O!", &PyList_Type, &array_seq_a,
+                        &PyList_Type, &array_seq_b)) {
+    return NULL;
+  }
+
+  num_seqs_a = PyList_Size(array_seq_a);
+  num_seqs_b = PyList_Size(array_seq_b);
+  if (num_seqs_a != num_seqs_b) {
+    PyErr_SetString(PyExc_ValueError, "...");
+    return NULL;
+  }
+
+  seq_a_values = calloc(num_seqs_a, sizeof(char*));
+  seq_b_values = calloc(num_seqs_b, sizeof(char*));
+  for (i = 0; i < num_seqs_a; i++) {
+    seq_a_values[i] = (char*) string_from_pyobject(PyList_GetItem(array_seq_a, i));
+    seq_b_values[i] = (char*) string_from_pyobject(PyList_GetItem(array_seq_b, i));
+  }
+
+  output = calloc(num_seqs_b, sizeof(double));
+  for (i = 0; i < num_seqs_a; i++) {
+    output[i] = pmbec_score_int(&scoring, seq_a_values[i], seq_b_values[i]) / 100.0;
+  }
+
+  free(seq_a_values);
+  free(seq_b_values);
+
+  PyObject* output_list = make_list(output, num_seqs_a);
+
+  free(output);
+
+  return output_list;
+}
+
+static PyObject *
+pmbec_init(PyObject *self, PyObject *args) {
+  PyObject *array;
+  int *pmbec_values;
+  int pmbec_count, gap_extend, i;
+
+  if (!PyArg_ParseTuple(args, "iO!", &gap_extend, &PyList_Type, &array)) {
+    return NULL;
+  }
+
+  // Accept only a positive value for the gap extension
+  if (gap_extend < 0) {
+    PyErr_SetString(PyExc_ValueError, "Gap extension penalty must be >= 0");
+    return NULL;
+  }
+
+  pmbec_count = PyList_Size(array);
+  if (pmbec_count != 576) {
+    PyErr_SetString(PyExc_ValueError, "Substitution matrices must be of length 576");
+    return NULL;
+  }
+
+  pmbec_values = calloc(pmbec_count, sizeof(int));
+  for (i = 0; i < pmbec_count; i++) {
+    pmbec_values[i] = (int) PyLong_AsLong(PyList_GetItem(array, i));
+  }
+
+  // Default match/mismatch scores; ignorable because matrix covers all scores
+  int match = -100;
+  int mismatch = -100;
+
+  // Don't ever use the defaults specified above, since the matrix covers all scores
+  char use_match_mismatch = 0;
+
+  // Gap penalty = gap_open + gap_extend * length of gap, so we'll just rely on gap_extend
+  // without any opening-specific penalties
+  gap_extend = gap_extend * -1; // seq-align expects negative gap penalties
+  int gap_open = 0;
+
+  // No special treatment for gaps at the start
+  char no_start_gap_penalty = 0;
+  char no_end_gap_penalty = 0;
+
+  // No forcing of gaplessness in either peptide
+  char no_gaps_in_a = 0;
+  char no_gaps_in_b = 0;
+
+  // Substitutions are allowed
+  char no_mismatches = 0;
+
+  char case_sensitive = 0;
+
+  scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
+               no_start_gap_penalty, no_end_gap_penalty,
+               no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive);
+
+  add_mutations(&scoring, AMINO_ACIDS, pmbec_values, use_match_mismatch);
+
+  scoring_initialized = true;
+
+  free(pmbec_values);
+
+  return Py_None;
+}
+
+static PyMethodDef PmbecAlignMethods[] = {
+  {"pmbec_init", pmbec_init, METH_VARARGS,
+   "Initialize Smith-Waterman alignment scoring using the PMBEC matrix."},
+  {"pmbec_score", pmbec_score, METH_VARARGS,
+   "Calculate the Smith-Waterman alignment score of two sequences using the PMBEC matrix."},
+  {"pmbec_score_multiple", pmbec_score_multiple, METH_VARARGS,
+   "Calculate the Smith-Waterman alignment score of two sequence lists using the PMBEC matrix."},
+  {"pmbec_norm_score", pmbec_norm_score, METH_VARARGS,
+   "Calculate the length-normalized Smith-Waterman alignment score of two sequences using the PMBEC matrix."},
+  {NULL, NULL, 0, NULL}
+};
+
+#if PY_MAJOR_VERSION >= 3
+static struct PyModuleDef PmbecAlign = {
+  PyModuleDef_HEAD_INIT,
+  "pmbecalign",
+  "",
+  -1,
+  PmbecAlignMethods
+};
+#endif
+
+#if PY_MAJOR_VERSION >= 3
+PyObject *
+PyInit_pmbecalign(void) {
+  return PyModule_Create(&PmbecAlign);
+}
+#else
+void
+initpmbecalign(void) {
+  Py_InitModule("pmbecalign", PmbecAlignMethods);
+}
+#endif
diff --git a/pylintrc b/pylintrc
@@ -1,4 +1,6 @@
 [TYPECHECK]
-# Without ignoring this, we get errors like:
+# Without ignoring numpy, we get errors like:
 # E:249,20: Module 'numpy' has no 'nan' member (no-member)
-ignored-modules = numpy
+# Without ignoring pmbecalign, we get errors like:
+# E:158, 8: No name 'pmbec_init' in module 'pmbecalign' (no-name-in-module)
+ignored-modules = numpy,pmbecalign
diff --git a/setup.py b/setup.py
@@ -14,8 +14,11 @@
 
 from __future__ import print_function
 import os
+from os import environ, path
+import warnings
 
-from setuptools import setup, find_packages
+from setuptools import setup, find_packages, Extension
+from distutils.core import Extension
 
 readme_dir = os.path.dirname(__file__)
 readme_filename = os.path.join(readme_dir, 'README.md')
@@ -33,10 +36,33 @@
     print(
         "Conversion of long_description from MD to reStructuredText failed...")
 
+seq_align_path = environ.get('SEQ_ALIGN_PATH')
+extensions = []
+if seq_align_path:
+    seq_align_dirs = [path.join(seq_align_path, 'src'),
+                      path.join(seq_align_path, 'libs'),
+                      path.join(seq_align_path, 'libs/bit_array'),
+                      path.join(seq_align_path, 'libs/string_buffer')]
+    pmbec_align = Extension('pmbecalign',
+                            include_dirs=seq_align_dirs,
+                            library_dirs=seq_align_dirs,
+                            libraries=[
+                                'align',
+                                'strbuf',
+                                'bitarr',
+                                'pthread',
+                                'z'
+                            ],
+                            sources=['pmbec_align.c'])
+    extensions.append(pmbec_align)
+else:
+    warnings.warn("seq-align is not installed and/or SEQ_ALIGN_PATH is not set, "
+                  "so fast Smith-Waterman alignment is currently disabled.")
+
 if __name__ == '__main__':
     setup(
         name='topeology',
-        version="0.0.1",
+        version="0.0.2",
         description="Compare epitope homology",
         author="Tavi Nathanson",
         author_email="tavi {dot} nathanson {at} gmail {dot} com",
@@ -67,5 +93,6 @@
             'pylint >=1.4.4',
         ],
         long_description=readme,
+        ext_modules=extensions,
         packages=find_packages(exclude="test")
     )