Skip to content

Commit

Permalink
Merge pull request #4 from hammerlab/seq-align
Browse files Browse the repository at this point in the history
Integrate seq-align into topeology
  • Loading branch information
tavinathanson committed Feb 5, 2016
2 parents 3f4b140 + da5e992 commit 598b8f8
Show file tree
Hide file tree
Showing 9 changed files with 515 additions and 101 deletions.
5 changes: 4 additions & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ before_install:
- export PATH=/home/travis/miniconda2/bin:$PATH
install:
- conda update --yes conda
- conda install --yes python=$TRAVIS_PYTHON_VERSION numpy scipy nose pandas matplotlib scikit-bio
- conda install --yes python=$TRAVIS_PYTHON_VERSION numpy scipy nose pandas matplotlib scikit-bio libgfortran
- git clone --recursive https://github.com/noporpoise/seq-align
- (cd seq-align && make)
- export SEQ_ALIGN_PATH=$PWD/seq-align
- pip install .
- pip install coveralls
- pip install pypandoc
Expand Down
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,20 @@ Output looks like:

## Installation

You can install topeology using [pip](https://pip.pypa.io/en/latest/quickstart.html):
You can install topeology using [pip]:

```sh
pip install topeology
```

Currently, topeology use [seq-align] to quickly compare sequences, wrapped in a C extension. It will be
installed if [seq-align] is installed; otherwise, topeology reverts to using another scorer.

To install topeology with this extension:
- Follow [seq-align]'s installation instructions, and then set `SEQ_ALIGN_PATH` to the installation
directory.
- Install topeology via [pip]. If topeology is already installed, run `pip install topeology --upgrade --no-deps --force-reinstall`.

## Methodology

Topeology uses Smith-Waterman alignment to align each neoepitope with each IEDB epitope of the
Expand All @@ -49,5 +57,8 @@ amino acid are considered.

This software uses the following libraries for Smith-Waterman alignment:

- https://github.com/noporpoise/seq-align
- https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library
- [seq-align]
- [Complete-Striped-Smith-Waterman-Library](https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library)

[seq-align]: https://github.com/noporpoise/seq-align
[pip]: https://pip.pypa.io/en/latest/quickstart.html
286 changes: 286 additions & 0 deletions pmbec_align.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
#include <Python.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <ctype.h>
#include "string_buffer/string_buffer.h"
#include "smith_waterman.h"
#include <zlib.h>
#include "alignment_macros.h"
#include "alignment_scoring.h"
#include "alignment_scoring_load.h"

// Copied from seq-align source
void add_mutations(scoring_t* scoring, const char *str, int *scores,
char use_match_mismatch) {
size_t i, j, len = strlen(str);
char a, b;
int score;

for (i = 0; i < len; i++) {
a = scoring->case_sensitive ? str[i] : tolower(str[i]);
for (j = 0; j < len; j++) {
b = scoring->case_sensitive ? str[j] : tolower(str[j]);
score = ARR_LOOKUP(scores, len, i, j);

scoring_add_mutation(scoring, a, b, score);
}
}

scoring->use_match_mismatch = use_match_mismatch;
}

static char AMINO_ACIDS[] = "ARNDCQEGHILKMFPSTWYVBZX*";
static scoring_t scoring;
static bool scoring_initialized = false;

static char *
string_from_pyobject(PyObject *obj) {
#if PY_MAJOR_VERSION >= 3
return PyUnicode_AsUTF8(obj);
#else
return PyBytes_AsString(obj);
#endif
}

// Copied from http://stackoverflow.com/a/13942236
static PyObject *
make_list(double array[], size_t size) {
PyObject *l = PyList_New(size);
size_t i;
for (i = 0; i != size; ++i) {
PyList_SET_ITEM(l, i, PyFloat_FromDouble(array[i]));
}
return l;
}

static bool is_amino_acids(char *seq) {
size_t len_seq = strlen(seq);
int i;
for (i = 0; i < len_seq; i++) {
// Is this letter is an amino acid?
if (memchr(AMINO_ACIDS, toupper(seq[i]), sizeof(AMINO_ACIDS)) == NULL) {
return false;
}
}

return true;
}

int pmbec_score_int(scoring_t *scoring, char *seq_a, char *seq_b) {
sw_aligner_t *sw = smith_waterman_new();
alignment_t *result = alignment_create(256);

smith_waterman_align(seq_a, seq_b, scoring, sw);

int max_score = INT_MIN;
while(smith_waterman_fetch(sw, result)) {
if(result->score > max_score) {
max_score = result->score;
}
}

smith_waterman_free(sw);
alignment_free(result);

return max_score;
}

static PyObject *
pmbec_norm_score(PyObject *self, PyObject *args) {
if (!scoring_initialized) {
PyErr_SetString(PyExc_ValueError, "pmbec_init must be called before aligning sequences");
return NULL;
}

char *seq_a, *seq_b;
if (!PyArg_ParseTuple(args, "ss", &seq_a, &seq_b))
return NULL;

if (strlen(seq_a) != strlen(seq_b)) {
PyErr_SetString(PyExc_ValueError, "Sequence lengths must be equal");
return NULL;
}

if (!is_amino_acids(seq_a) || !is_amino_acids(seq_b)) {
PyErr_SetString(PyExc_ValueError, "Sequences must only comprise amino acids");
return NULL;
}

int val = pmbec_score_int(&scoring, seq_a, seq_b);

if (val < 0) {
return Py_None;
}

float val_norm = ((float) val / 100.0) / (float) strlen(seq_a);
return PyFloat_FromDouble(val_norm);
}

static PyObject *
pmbec_score(PyObject *self, PyObject *args) {
if (!scoring_initialized) {
PyErr_SetString(PyExc_ValueError, "pmbec_init must be called before aligning sequences");
return NULL;
}

char *seq_a, *seq_b;
if (!PyArg_ParseTuple(args, "ss", &seq_a, &seq_b))
return NULL;

if (!is_amino_acids(seq_a) || !is_amino_acids(seq_b)) {
PyErr_SetString(PyExc_ValueError, "Sequences must only comprise amino acids");
return NULL;
}

int val = pmbec_score_int(&scoring, seq_a, seq_b);

if (val < 0) {
return Py_None;
}

return PyFloat_FromDouble((float) val / 100.0);
}

static PyObject *
pmbec_score_multiple(PyObject *self, PyObject *args) {
PyObject *array_seq_a, *array_seq_b;
char **seq_a_values, **seq_b_values;
double *output;
int num_seqs_a, num_seqs_b;
int i;

if (!PyArg_ParseTuple(args, "O!O!", &PyList_Type, &array_seq_a,
&PyList_Type, &array_seq_b)) {
return NULL;
}

num_seqs_a = PyList_Size(array_seq_a);
num_seqs_b = PyList_Size(array_seq_b);
if (num_seqs_a != num_seqs_b) {
PyErr_SetString(PyExc_ValueError, "...");
return NULL;
}

seq_a_values = calloc(num_seqs_a, sizeof(char*));
seq_b_values = calloc(num_seqs_b, sizeof(char*));
for (i = 0; i < num_seqs_a; i++) {
seq_a_values[i] = (char*) string_from_pyobject(PyList_GetItem(array_seq_a, i));
seq_b_values[i] = (char*) string_from_pyobject(PyList_GetItem(array_seq_b, i));
}

output = calloc(num_seqs_b, sizeof(double));
for (i = 0; i < num_seqs_a; i++) {
output[i] = pmbec_score_int(&scoring, seq_a_values[i], seq_b_values[i]) / 100.0;
}

free(seq_a_values);
free(seq_b_values);

PyObject* output_list = make_list(output, num_seqs_a);

free(output);

return output_list;
}

static PyObject *
pmbec_init(PyObject *self, PyObject *args) {
PyObject *array;
int *pmbec_values;
int pmbec_count, gap_extend, i;

if (!PyArg_ParseTuple(args, "iO!", &gap_extend, &PyList_Type, &array)) {
return NULL;
}

// Accept only a positive value for the gap extension
if (gap_extend < 0) {
PyErr_SetString(PyExc_ValueError, "Gap extension penalty must be >= 0");
return NULL;
}

pmbec_count = PyList_Size(array);
if (pmbec_count != 576) {
PyErr_SetString(PyExc_ValueError, "Substitution matrices must be of length 576");
return NULL;
}

pmbec_values = calloc(pmbec_count, sizeof(int));
for (i = 0; i < pmbec_count; i++) {
pmbec_values[i] = (int) PyLong_AsLong(PyList_GetItem(array, i));
}

// Default match/mismatch scores; ignorable because matrix covers all scores
int match = -100;
int mismatch = -100;

// Don't ever use the defaults specified above, since the matrix covers all scores
char use_match_mismatch = 0;

// Gap penalty = gap_open + gap_extend * length of gap, so we'll just rely on gap_extend
// without any opening-specific penalties
gap_extend = gap_extend * -1; // seq-align expects negative gap penalties
int gap_open = 0;

// No special treatment for gaps at the start
char no_start_gap_penalty = 0;
char no_end_gap_penalty = 0;

// No forcing of gaplessness in either peptide
char no_gaps_in_a = 0;
char no_gaps_in_b = 0;

// Substitutions are allowed
char no_mismatches = 0;

char case_sensitive = 0;

scoring_init(&scoring, match, mismatch, gap_open, gap_extend,
no_start_gap_penalty, no_end_gap_penalty,
no_gaps_in_a, no_gaps_in_b, no_mismatches, case_sensitive);

add_mutations(&scoring, AMINO_ACIDS, pmbec_values, use_match_mismatch);

scoring_initialized = true;

free(pmbec_values);

return Py_None;
}

static PyMethodDef PmbecAlignMethods[] = {
{"pmbec_init", pmbec_init, METH_VARARGS,
"Initialize Smith-Waterman alignment scoring using the PMBEC matrix."},
{"pmbec_score", pmbec_score, METH_VARARGS,
"Calculate the Smith-Waterman alignment score of two sequences using the PMBEC matrix."},
{"pmbec_score_multiple", pmbec_score_multiple, METH_VARARGS,
"Calculate the Smith-Waterman alignment score of two sequence lists using the PMBEC matrix."},
{"pmbec_norm_score", pmbec_norm_score, METH_VARARGS,
"Calculate the length-normalized Smith-Waterman alignment score of two sequences using the PMBEC matrix."},
{NULL, NULL, 0, NULL}
};

#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef PmbecAlign = {
PyModuleDef_HEAD_INIT,
"pmbecalign",
"",
-1,
PmbecAlignMethods
};
#endif

#if PY_MAJOR_VERSION >= 3
PyObject *
PyInit_pmbecalign(void) {
return PyModule_Create(&PmbecAlign);
}
#else
void
initpmbecalign(void) {
Py_InitModule("pmbecalign", PmbecAlignMethods);
}
#endif
6 changes: 4 additions & 2 deletions pylintrc
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
[TYPECHECK]
# Without ignoring this, we get errors like:
# Without ignoring numpy, we get errors like:
# E:249,20: Module 'numpy' has no 'nan' member (no-member)
ignored-modules = numpy
# Without ignoring pmbecalign, we get errors like:
# E:158, 8: No name 'pmbec_init' in module 'pmbecalign' (no-name-in-module)
ignored-modules = numpy,pmbecalign
31 changes: 29 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@

from __future__ import print_function
import os
from os import environ, path
import warnings

from setuptools import setup, find_packages
from setuptools import setup, find_packages, Extension
from distutils.core import Extension

readme_dir = os.path.dirname(__file__)
readme_filename = os.path.join(readme_dir, 'README.md')
Expand All @@ -33,10 +36,33 @@
print(
"Conversion of long_description from MD to reStructuredText failed...")

seq_align_path = environ.get('SEQ_ALIGN_PATH')
extensions = []
if seq_align_path:
seq_align_dirs = [path.join(seq_align_path, 'src'),
path.join(seq_align_path, 'libs'),
path.join(seq_align_path, 'libs/bit_array'),
path.join(seq_align_path, 'libs/string_buffer')]
pmbec_align = Extension('pmbecalign',
include_dirs=seq_align_dirs,
library_dirs=seq_align_dirs,
libraries=[
'align',
'strbuf',
'bitarr',
'pthread',
'z'
],
sources=['pmbec_align.c'])
extensions.append(pmbec_align)
else:
warnings.warn("seq-align is not installed and/or SEQ_ALIGN_PATH is not set, "
"so fast Smith-Waterman alignment is currently disabled.")

if __name__ == '__main__':
setup(
name='topeology',
version="0.0.1",
version="0.0.2",
description="Compare epitope homology",
author="Tavi Nathanson",
author_email="tavi {dot} nathanson {at} gmail {dot} com",
Expand Down Expand Up @@ -67,5 +93,6 @@
'pylint >=1.4.4',
],
long_description=readme,
ext_modules=extensions,
packages=find_packages(exclude="test")
)

0 comments on commit 598b8f8

Please sign in to comment.