# Chapter 6 Filtering Data

In [3]:
# Common imports
import numpy as np
import os
# Where to save the data
PROJECT_ROOT_DIR = "./data/chap_6/"

## 6.2 STORY: WORKING WITH RNA-SEQ OUTPUT DATA

### 6.2.2 Example Python Session

In [6]:
tracking = open(os.path.join(PROJECT_ROOT_DIR,"transcripts.tracking"),"r")
out_file = open(os.path.join(PROJECT_ROOT_DIR,"transcripts-filtered.tracking"),"w")

for track in tracking:
    columns = track.strip().split("\t")
    wildtype = columns[4:7].count("-")
    treatment = columns[7:10].count("-")
    if wildtype < 2 and treatment < 2:
        out_file.write(track)

tracking.close()
out_file.close()

### 6.3.2 Combining Two Data Sets

In [8]:
data_a = [1, 2, 3, 4, 5, 6]
data_b = [1, 5, 7, 8, 9]
a_and_b = []

for num in data_a:
    if num in data_b:
        a_and_b.append(num)
print(a_and_b)

[1, 5]


If the order is not relevant, you can make the code shorter by using the set data type

In [10]:
data_a = set([1, 2, 3, 4, 5, 6])
data_b = set([1, 5, 7, 8, 9])

a_and_b = data_a.intersection(data_b)
print(list(a_and_b))

[1, 5]


### 6.3.3 Differences between Two Data Sets

In [13]:
data_a = [1, 2, 3, 4, 5, 6]
data_b = [1, 5, 7, 8, 9]
a_not_b = []
b_not_a = []

for num in data_a:
    if num not in data_b:
        a_not_b.append(num)

for num in data_b:
    if num not in data_a:
        b_not_a.append(num)

print(a_not_b)
print(b_not_a)

[2, 3, 4, 6]
[7, 8, 9]


In [17]:
data_a = set([1, 2, 3, 4, 5, 6])
data_b = set([1, 5, 7, 8, 9])

a_not_b = data_a.difference(data_b)
b_not_a = data_b.difference(data_a)
print(a_not_b)
print(b_not_a)

{2, 3, 4, 6}
{8, 9, 7}


### 6.3.4 Removing from Lists, Dictionaries, and Files
Deleting Particular Lines from a Text File

In [18]:
line = open(os.path.join(PROJECT_ROOT_DIR,"text.txt"),"r").readlines()
open(os.path.join(PROJECT_ROOT_DIR,"new.txt"),"w").writelines(line[2:4]+line[6:])

In [19]:
in_file = open(os.path.join(PROJECT_ROOT_DIR,"text.txt"),"r")
out_file = open(os.path.join(PROJECT_ROOT_DIR,"new.txt"),"w")
index = 0
indices_to_remove = [1,2,5,6]
for line in in_file:
    index += 1
    if index not in indices_to_remove:
        out_file.write(line)

out_file.close()

In [20]:
out_file = open(os.path.join(PROJECT_ROOT_DIR,"new.txt"),"w")
indices_to_remove = [1,2,5,6]

for index,line in enumerate(open(os.path.join(PROJECT_ROOT_DIR,"text.txt"),"r")):
    if (index+1) not in indices_to_remove:
        out_file.write(line)
out_file.close()

### 6.3.5 Removing Duplicates Preserving and Not Preserving Order

**Selectively Remove Duplicate Records from a Text File Preserving Order**

In [23]:
input_file = open(os.path.join(PROJECT_ROOT_DIR,"UniprotID.txt"),"r")
output_file = open(os.path.join(PROJECT_ROOT_DIR,"UniprotID-unique.txt"),"w")
unique = []

for line in input_file:
    if line not in unique:
        output_file.write(line)
        unique.append(line)
output_file.close()

** Selectively Remove Duplicate Records from a Text File without Preserving Order **

In [25]:
input_file = open(os.path.join(PROJECT_ROOT_DIR,"UniprotID.txt"),"r")
output_file = open(os.path.join(PROJECT_ROOT_DIR,"UniprotID-unique.txt"),"w")
unique = set(input_file)
for line in unique:
    output_file.write(line)

** How to Remove Sequences with More Than 90% Identity **

** CD-HIT (CLUSTER DATABASE AT HIGH IDENTITY WITH TOLERANCE) **

    cd-hit -i redundant_set -o nr-90 -c 0.9 -n 5

## 6.4 EXAMPLES  

### Example 6.1 Comparing More Than Two Sets of Data

In [28]:
from functools import reduce

a = set((1,2,3,4,5))
b = set((2, 4, 6, 7, 1))
c = set((1, 4, 5, 9))

triple_set = [a,b,c]
common = reduce(set.intersection,triple_set)
print(common)

{1, 4}


In [34]:
multiply = lambda x,y:x*y

print(reduce(multiply,(1,2,3,4)))

24


In [35]:
multiply(4,6)

24

### Example 6.2 Compare/Update Different Releases of a Database

In [36]:
# read old database release
old_db = set()
for line in open(os.path.join(PROJECT_ROOT_DIR,"list_old.txt"),"r"):
    accession = line.strip()
    old_db.add(accession)
#read new database release
new_db = set()
for line in open(os.path.join(PROJECT_ROOT_DIR,"list_new.txt"),"r"):
    accession = line.strip()
    new_db.add(accession)

# Report different
new_entries = new_db.difference(old_db)
print("New entries",list(new_entries))
old_entries = old_db.difference(new_db)
print("Deprecated entries",list(old_entries))
unique_entries = new_db.symmetric_difference(old_db)
print("Unique entries",list(unique_entries))

New entries ['n', 'p', 'q', 's', 'o', 'r', 'm']
Deprecated entries ['c']
Unique entries ['n', 'p', 'q', 's', 'o', 'c', 'r', 'm']


# 6.5 TESTING YOURSELF

## Exercise 6.1 Copy Only Selected FASTA Records to a File

Read a multiple sequence FASTA file and copy to a new file the ID (one per line) of the sequences starting with a methionine.

**Hint**: You can use what you learned in Chapter 4 about FASTA file parsing.

**Hint**: Since you have to check the type of the first residue, you need to collect not the whole sequence of each record but just its first character.

In [39]:
output_file = open(os.path.join(PROJECT_ROOT_DIR,"Startwith_M.fasta"),"w")
seq = ''

for line in open(os.path.join(PROJECT_ROOT_DIR,"SwissProt.fasta"),"r"):
    if line.startswith(">") and seq == '':
        ac = line.split("|")[1]
    elif not line.startswith(">"):
        seq += line
    elif line.startswith(">") and seq != '':
        if seq.startswith("M"):
            output_file.write(ac)
        ac = line.split("|")[1]
        seq = ''
        
output_file.close()

## Exercise 6.2

Remove the even (or the odd) lines from a text file of your choice.

**Hint**: You can use the % operator, which returns the remainder of a division:

    >>> 7%2
    1

If you use a line counter, the remainder of the division by 2 will be 0 for even lines (and 1 for odd lines).

In [46]:
x = list(range(10))
for i,j in enumerate(x):
    if i%2==0:  #print odd line
        print(j)

0
2
4
6
8


In [45]:
5%2

1

## Exercise 6.3 Finding Differences between Files Having the Same Number of Lines

Write a program that reads two text files and prints their differences (line by line).

**Hint**: Use the file method readlines() to put the lines of each file into a list. If you do this separately for the two files you want to compare, you will end up with two lists. Use counters to count how many lines (i.e., list elements) are identical in the two files (i.e., in the two lists), how many are present in the first file and absent in the second, and vice versa.

## Exercise 6.4 A More Sophisticated Way of Printing Differences between Files 

Implement the program of Exercise 6.3 in order to print lines present in the first file and absent in the second preceded by a “>”, lines present in the second and absent in the first preceded by a “<”, and lines present in both files preceded by a “#”.

## Exercise 6.5 A Further Filter for Transcripts

Modify the Python session in Section 6.2.2 to retain only transcripts that are expressed in at least three samples, regardless if WT or T.