# Chapter 8 Sorting Data

In [2]:
# Common imports
import numpy as np
import os
# Directory for this chapter's data files: the cells below read their input
# tables from it and write their sorted output files to it.
PROJECT_ROOT_DIR = "./data/chap_8/"

## 8.2 STORY: SORT A DATA TABLE

### 8.2.2 Example Python Session

In [6]:
from operator import itemgetter

# Read the tab-separated table, converting every field to float so that the
# sort below compares numbers instead of strings.
table = []
# The context manager closes the file as soon as the loop is done
# (the original opened it inline in the `for` statement and never closed it).
with open(os.path.join(PROJECT_ROOT_DIR, "random_distribution.tsv"), 'r') as in_file:
    for line in in_file:
        # float() ignores surrounding whitespace, so the trailing newline on
        # the last field needs no explicit stripping.
        columns = [float(x) for x in line.split('\t')]
        table.append(columns)

# Index of the column to sort on (0-based), largest value first.
column = 1
table_sorted = sorted(table, key=itemgetter(column), reverse=True)

# Print the sorted table, again tab-separated.
for row in table_sorted:
    print('\t'.join(str(x) for x in row))

6164.0	167.0	0.0270927968851	33.0	0.00535366645036	275.0	0.0446138870863
6173.0	160.0	0.0259193260975	39.0	0.00631783573627	241.0	0.0390409849344
6091.0	151.0	0.024790674766	42.0	0.00689541947135	255.0	0.0418650467903
6115.0	151.0	0.0246933769419	33.0	0.00539656582175	237.0	0.038757154538
6165.0	150.0	0.0243309002433	29.0	0.00470397404704	264.0	0.0428223844282
6143.0	148.0	0.024092462966	43.0	0.00699983721309	253.0	0.0411850887189
6098.0	147.0	0.024106264349	47.0	0.00770744506396	284.0	0.0465726467694
6148.0	147.0	0.023910214704	40.0	0.00650618087183	262.0	0.0426154847105
6147.0	146.0	0.0237514234586	36.0	0.00585651537335	261.0	0.0424597364568
6121.0	146.0	0.0238523117138	42.0	0.00686162391766	247.0	0.0403528835158
6107.0	146.0	0.0239069919764	31.0	0.00507614213198	268.0	0.0438840674636
6105.0	146.0	0.0239148239148	60.0	0.00982800982801	278.0	0.0455364455364
6120.0	144.0	0.0235294117647	38.0	0.0062091503268	286.0	0.0467320261438
6053.0	144.0	0.0237898562696	27.0	0.00446059805055	277.0	

## 8.4 EXAMPLES

### Example 8.1 Sort a Table by the First Column, Then by the Second, Then by the Third, and So On

In [9]:
from operator import itemgetter

# Load the whitespace-separated table as rows of floats.
table = []
# `with` guarantees the file is closed; the original left `in_file` open.
with open(os.path.join(PROJECT_ROOT_DIR, "random_distribution.tsv"), 'r') as in_file:
    for line in in_file:
        columns = [float(x) for x in line.split()]
        table.append(columns)

# itemgetter with several indices returns a tuple, so rows are compared on
# column 0 first, ties are broken on column 1, then column 2, and so on.
# Every row must therefore have at least seven columns.
table_sorted = sorted(table, key=itemgetter(0, 1, 2, 3, 4, 5, 6), reverse=True)

# Print the sorted table, tab-separated, in descending order.
for row in table_sorted:
    print('\t'.join(str(x) for x in row))

6237.0	135.0	0.021645021645	53.0	0.00849767516434	283.0	0.0453743787077
6232.0	102.0	0.0163671373556	56.0	0.00898587933248	288.0	0.0462130937099
6216.0	76.0	0.0122265122265	50.0	0.00804375804376	256.0	0.041184041184
6214.0	96.0	0.0154489861603	41.0	0.00659800450595	264.0	0.0424847119408
6213.0	122.0	0.0196362465798	38.0	0.00611620795107	266.0	0.0428134556575
6213.0	82.0	0.013198132947	28.0	0.00450667954289	273.0	0.0439401255432
6212.0	133.0	0.0214101738571	50.0	0.00804893754024	218.0	0.0350933676755
6212.0	126.0	0.0202833226014	42.0	0.00676110753381	286.0	0.0460399227302
6210.0	132.0	0.0212560386473	30.0	0.0048309178744	289.0	0.04653784219
6208.0	111.0	0.0178801546392	21.0	0.00338273195876	271.0	0.0436533505155
6208.0	107.0	0.0172358247423	43.0	0.00692654639175	262.0	0.0422036082474
6205.0	116.0	0.0186946011281	36.0	0.0058017727639	251.0	0.0404512489927
6205.0	102.0	0.0164383561644	13.0	0.00209508460919	262.0	0.0422240128928
6204.0	133.0	0.0214377820761	34.0	0.00548033526757	293.0	0.04

### Example 8.2 Sort the Output of BLAST According to a Parameter of Your Choice
(e.g., Sequence Identity Percentage)

In [12]:
from operator import itemgetter

# Parse the comma-separated BLAST table. Only column 2 (the sort key) is
# converted to a number; all other fields stay as strings.
table = []
with open(os.path.join(PROJECT_ROOT_DIR, "BlastOut.csv"), 'r') as input_file:
    for line in input_file:
        # Drop the line terminator first: otherwise the last field keeps its
        # '\n' and every written row is followed by an empty line.
        col = line.rstrip('\n').split(',')
        col[2] = float(col[2])
        table.append(col)

# Highest value of column 2 first.
table_sorted = sorted(table, key=itemgetter(2), reverse=True)

# The output file is opened only after parsing and sorting succeeded, so a
# malformed input no longer leaves a truncated/empty result file behind.
# NOTE(review): rows are written tab-separated although the file is named
# .csv and the input is comma-separated -- confirm this is intended.
with open(os.path.join(PROJECT_ROOT_DIR, "BlastOutSorted.csv"), 'w') as output_file:
    for row in table_sorted:
        output_file.write('\t'.join(str(x) for x in row) + '\n')

### Example 8.3 Sort Hemoglobin PDB Entries on the Basis of Their RMSD (from an RCSB Report)

In [25]:
from operator import itemgetter

# Parse the comma-separated report. Columns 3 and 4 are the sort keys and are
# converted to float and int respectively; other fields stay as strings.
table = []
with open(os.path.join(PROJECT_ROOT_DIR, "PDBhaemoglobinReport.csv"), 'r') as input_file:
    # The first line is a header; it is skipped and not written to the output.
    header = input_file.readline()

    for line in input_file:
        col = line.rstrip('\n').split(',')
        # Strip the characters wrapping each value instead of slicing fixed
        # offsets: the original `[1:-2]` on column 4 relied on a trailing
        # newline and silently cut a digit off a final line without one.
        # NOTE(review): assumes the wrapping characters are double quotes --
        # confirm against the report file.
        col[3] = float(col[3].strip().strip('"'))
        col[4] = int(col[4].strip().strip('"'))
        table.append(col)

# Ascending by column 3; rows with equal column 3 are ordered by column 4.
table_sorted = sorted(table, key=itemgetter(3, 4))

# Open the output only once the data is ready, and close it reliably.
# NOTE(review): rows are written tab-separated although the file is named
# .csv -- confirm this is intended.
with open(os.path.join(PROJECT_ROOT_DIR, "PDBhaemoglobinSorted.csv"), 'w') as output_file:
    for row in table_sorted:
        output_file.write('\t'.join(str(x) for x in row) + '\n')

## 8.5 TESTING YOURSELF

### Exercise 8.1 Sorting a Table by Its Second Column
Write a program that reads the table with Lowry data (see Table 7.2) from
a text file, sorts it by the second column, and writes the first three rows of
the sorted table to a new file.

In [35]:
from operator import itemgetter

# Read the tab-separated table: first line is the header, the remaining lines
# are data rows whose second column is converted to float for sorting.
table = []
with open(os.path.join(PROJECT_ROOT_DIR, "lowry_data.txt"), 'r') as input_file:
    header = input_file.readline().rstrip().split('\t')

    for line in input_file:
        col = line.rstrip().split('\t')
        col[1] = float(col[1])
        table.append(col)

# Sort on the second column, largest value first.
table_sorted = sorted(table, key=itemgetter(1), reverse=True)

# Number of data rows to keep, as asked by the exercise.
n_rows = 3

# The original never closed either file, so the written rows could stay in
# the buffer and never reach the disk; `with` flushes and closes on exit.
with open(os.path.join(PROJECT_ROOT_DIR, "lowry_data_sorted.txt"), 'w') as output_file:
    # The header is written on its own rather than inserted into the data
    # table, so `table_sorted` holds only data rows.
    output_file.write('\t'.join(header) + '\n')
    for row in table_sorted[:n_rows]:
        output_file.write('\t'.join(str(x) for x in row) + '\n')

### Exercise 8.2 Sorting by Sequence Length

Sort a multiple sequence FASTA file by the sequence length (from the longest to the shortest).

**Hint**: You first have to parse the file as you learned in Chapter 4 and create
a list of lists, each line containing three elements (header, sequence,
sequence length). Then you can sort the list according to the third element
of the sublists and finally write the sorted list to a file.

In [42]:
from operator import itemgetter

# Build a list of (accession, sequence length) pairs from the FASTA file.
table = []
ac = None   # accession of the record being read; None until the first header
seq = ''    # sequence collected so far for the current record
with open(os.path.join(PROJECT_ROOT_DIR, "SwissProt.fasta"), 'r') as input_file:
    for line in input_file:
        if line.startswith('>'):
            # A new header ends the previous record. Testing `ac` instead of
            # `seq` keeps records whose sequence is empty; the original
            # overwrote their accession and dropped them silently.
            if ac is not None:
                table.append((ac, len(seq)))
            # The accession is the second '|'-separated field of the header.
            ac = line.rstrip().split('|')[1]
            seq = ''
        elif ac is not None:
            # Sequence line of the current record (lines before the first
            # header are ignored).
            seq += line.rstrip()

# Store the last record; skipped for an empty file, where the original
# raised NameError because `ac` was never assigned.
if ac is not None:
    table.append((ac, len(seq)))

# Longest sequence first.
table_sorted = sorted(table, key=itemgetter(1), reverse=True)

for row in table_sorted:
    row = [str(x) for x in row]
    # print() appends its own newline, so the explicit '\n' leaves one blank
    # line after each record; kept so the printed format is unchanged.
    print('\t'.join(row)+'\n')

P09208	2144

P15127	1383

P06213	1382

P15208	1372

P24062	1370

P08069	1367

P05019	195

P01344	180

P08025	153

P01308	110

