# HOWTO: Filtering columns using a function

In [1]:
# Generate random FASTA alignment of 3 samples, 9 characters long to use as an example
import random

def generate_random_alignment(num_samples, seq_len, bases='ATCG'):
    """Build a random alignment as a list of equal-length strings.

    Each of the ``seq_len`` columns is filled independently. A column is
    either invariant (every sample gets the same character) or, roughly half
    of the time, split between two randomly drawn characters: the first
    ``freq1`` samples receive the first character and the remaining samples
    receive the second. The two characters are drawn independently, so they
    may happen to be identical.

    Parameters
    ----------
    num_samples : int
        Number of sequences (rows) to generate.
    seq_len : int
        Number of alignment columns (characters per sequence).
    bases : str, optional
        Alphabet the characters are drawn from.

    Returns
    -------
    list of str
        ``num_samples`` strings, each ``seq_len`` characters long.
    """
    rows = [[] for _ in range(num_samples)]
    for _ in range(seq_len):
        char1 = random.choice(bases)
        # A two-character split needs at least two samples. Without this
        # guard, randint(1, num_samples - 1) raises ValueError whenever
        # num_samples < 2.
        if num_samples > 1 and random.random() > 0.5:  # >0.5 means variable
            # Pick another character and how many samples keep the first one
            char2 = random.choice(bases)
            freq1 = random.randint(1, num_samples - 1)
        else:
            char2 = ''
            freq1 = num_samples
        for row_index, row in enumerate(rows):
            row.append(char1 if row_index < freq1 else char2)
    # Join once at the end instead of growing strings character by character
    return [''.join(row) for row in rows]

def generate_random_fasta(path, num_samples, seq_len, bases='ATCG'):
    """Write a randomly generated alignment to ``path`` in FASTA format.

    Records are named ``seq1``, ``seq2``, ... in order. The second record is
    written without a description and with its first character replaced by
    ``'N'``; every other record carries a ``description<number>`` field.
    """
    bare_record = '>seq{i}\n{s}'
    described_record = '>seq{i} description{i}\n{s}'
    alignment = generate_random_alignment(num_samples, seq_len, bases=bases)
    with open(path, 'w') as handle:
        for number, residues in enumerate(alignment, start=1):
            if number == 2:
                # Second record: no description, leading character masked
                record = bare_record.format(i=number, s='N' + residues[1:])
            else:
                record = described_record.format(i=number, s=residues)
            print(record, file=handle)

path = 'test.aln'
generate_random_fasta(path, 3, 9)

In [2]:
# Prints the contents of test.aln
with open('test.aln', 'r') as f:
    for line in f:
        print(line.rstrip())

>seq1 description1
AGTGCTCTA
>seq2
NGTGCTTTG
>seq3 description3
AGTGATTGG


## Filtering alignment columns

`Alignment.col` provides a `.filter` method that keeps alignment columns for which the given filter function, evaluating the characters in the alignment column, returns `True`. Columns that evaluate `False` are deleted from the sequence alignment. The concept is similar to the `filter` function in Python.

The behavior of the `.filter` method can be inverted by setting `inverse=True`. By default, `inverse=False` and columns evaluated `True` are kept and columns that returned `False` are removed. In contrast, when `inverse=True`, columns that evaluate `False` are kept and columns that return `True` are removed.

The `.filter` method offers additional functionality beyond the builtin `filter` function. Setting `dry_run=True` will evaluate each alignment column against the provided filtering function but will not modify the alignment. Instead, the method will print out a summary of the number of columns that evaluated `True` or `False` against the total number of columns in the sequence alignment, and return a dictionary containing the index lists of positions that evaluated `True` or `False` respectively.

### Filtering columns inplace (default)

In [3]:
# Import Alignment module and import the data from a file into an Alignment object
# See 01_Reading_alignments.ipynb for details about importing data.
from alignmentrs import Alignment
aln = Alignment.from_fasta(path)

In [4]:
# Number of rows and columns before filtering
aln.nrows, aln.ncols

(3, 9)

In [5]:
# Column and column metadata before filtering
aln.column_and_metadata

Unnamed: 0,sequence
0,ANA
1,GGG
2,TTT
3,GGG
4,CCA
5,TTT
6,CTT
7,TTG
8,AGG


In [6]:
# Set a filter function
# This filter function checks whether the column contains any 'N' character
# If there is no 'N' in the column, returns True; otherwise returns False.
ff = lambda x: 'N' not in ''.join(x).upper()

In [7]:
# The .filter method keeps columns where the filter function evaluated `True`,
# otherwise the column is deleted.
# Filtering inplace does not return any value
aln.col.filter(ff)

In [8]:
# Column and column metadata after filtering
# Note that column 0 has been removed
aln.column_and_metadata

Unnamed: 0,sequence
1,GGG
2,TTT
3,GGG
4,CCA
5,TTT
6,CTT
7,TTG
8,AGG


In [9]:
# Verify the number of rows and columns remaining after filtering
# The number of rows remains the same, while the number of columns changes from 9 to 8
aln.nrows, aln.ncols

(3, 8)

### Inverting the behavior of the filtering function via `inverse=True`

In [10]:
# Reimport the data from a file into an Alignment object such that
# the starting data will be the same as in the first example.
# See 01_Reading_alignments.ipynb for details about importing data.
aln = Alignment.from_fasta(path)

In [11]:
# Number of rows and columns before filtering
aln.nrows, aln.ncols

(3, 9)

In [12]:
# Column and column metadata before filtering
aln.column_and_metadata

Unnamed: 0,sequence
0,ANA
1,GGG
2,TTT
3,GGG
4,CCA
5,TTT
6,CTT
7,TTG
8,AGG


In [13]:
# Set a filter function
# This filter function checks whether the column contains any 'N' character
# If there is no 'N' in the column, returns True; otherwise returns False.
ff = lambda x: 'N' not in ''.join(x).upper()

In [14]:
# Setting `inverse=True` produces the opposite effect of the normal filter method.
# The .filter method in this example keeps columns that are `False`
# and deletes columns that evaluated `True`.
# Filtering inplace does not return any value
aln.col.filter(ff, inverse=True)

In [15]:
# Column and column metadata after filtering
# Note that only column 0 (the column containing 'N') is retained
aln.column_and_metadata

Unnamed: 0,sequence
0,ANA


In [16]:
# Verify the number of rows and columns remaining after filtering
# The number of rows remains unchanged, while the number of columns changes from 9 to 1.
aln.nrows, aln.ncols

(3, 1)

### Filtering columns without modifying the original data using `copy=True`

In [17]:
# Reimport the data from a file into an Alignment object such that
# the starting data will be the same as in the first example.
# See 01_Reading_alignments.ipynb for details about importing data.
aln = Alignment.from_fasta(path)

In [18]:
# Column and column metadata before filtering
aln.column_and_metadata

Unnamed: 0,sequence
0,ANA
1,GGG
2,TTT
3,GGG
4,CCA
5,TTT
6,CTT
7,TTG
8,AGG


In [19]:
# Set a filter function
# This filter function checks whether the column contains any 'N' character
# If there is no 'N' in the column, returns True; otherwise returns False.
# ''.join(x) accepts both a plain string and a sequence of characters, so the
# function does not depend on how the column is passed in.
ff = lambda x: 'N' not in ''.join(x).upper()

In [20]:
# Setting `copy=True` edits the copy and keeps the original data intact
new_aln = aln.col.filter(ff, copy=True)

In [21]:
# Column and column metadata of the original alignment after filtering
aln.column_and_metadata

Unnamed: 0,sequence
0,ANA
1,GGG
2,TTT
3,GGG
4,CCA
5,TTT
6,CTT
7,TTG
8,AGG


In [22]:
# Row and row metadata of the NEW alignment
new_aln.row_and_metadata

Unnamed: 0,description,sequence
seq1,description1,GTGCTCTA
seq2,,GTGCTTTG
seq3,description3,GTGATTGG


In [23]:
# Verify the number of rows and columns in the original alignment after filtering
aln.nrows, aln.ncols

(3, 9)

In [24]:
# Number of rows and columns in the NEW alignment
new_aln.nrows, new_aln.ncols

(3, 8)

### Filtering columns with `dry_run=True`

In [25]:
# Reimport the data from a file into an Alignment object such that
# the starting data will be the same as in the first example.
# See 01_Reading_alignments.ipynb for details about importing data.
aln = Alignment.from_fasta(path)

In [26]:
# Column and column metadata before filtering
aln.column_and_metadata

Unnamed: 0,sequence
0,ANA
1,GGG
2,TTT
3,GGG
4,CCA
5,TTT
6,CTT
7,TTG
8,AGG


In [27]:
# Set a filter function
# This filter function checks whether the column contains any 'N' character
# If there is no 'N' in the column, returns True; otherwise returns False.
# ''.join(x) accepts both a plain string and a sequence of characters, so the
# function does not depend on how the column is passed in.
ff = lambda x: 'N' not in ''.join(x).upper()

In [28]:
# When `dry_run=True`, the .filter method returns a dictionary containing
# the positions that returned True or False.
# Moreover, a summary of the number of columns that returned True or False compared to the
# total number of aligned columns is printed out.
positions_d = aln.col.filter(ff, dry_run=True)

[Filter]
True = 8/9
False = 1/9


In [29]:
# The position dictionary contains the list of indices of columns that evaluated `True`
# and indices of columns that evaluated `False`.
positions_d

{True: [1, 2, 3, 4, 5, 6, 7, 8], False: [0]}

In [30]:
# Column and column metadata after dry-run filtering. Note that no changes occur
aln.column_and_metadata

Unnamed: 0,sequence
0,ANA
1,GGG
2,TTT
3,GGG
4,CCA
5,TTT
6,CTT
7,TTG
8,AGG


In [31]:
# Verify the number of rows and columns in the alignment
aln.nrows, aln.ncols

(3, 9)

## Using multiple parameters simultaneously

It is possible to set `inverse` and `copy`, and `inverse` and `dry_run` simultaneously. Their effects are independent of each other, making the output predictable.

However, combining `copy` and `dry_run` does not make sense because `dry_run` does not modify the underlying data and makes creating a copy of the data unnecessary. When these are both `True`, the effect is similar to only having `dry_run` set to True.

## Next

See `03a_Filtering_rows.ipynb` for more information about using a function to select rows.

Proceed to `04_Reordering.ipynb` to learn more about reordering the rows or columns in the alignment.