# Compress and Distance Matrix

In [1]:
# Directory holding one FASTA file per accession (read below as `{accession}.fna`).
data_dir = './data/genomes'
# Directory holding the pairwise combined FASTA files (read below as `{a}+{b}.fna`).
combined_dir = f'{data_dir}/combined'

# When True, the shell-based compressor cells wipe their output directory and
# recompress every file; when False, existing compressed files are reused.
overwrite = False

In [2]:
# Accession IDs of the sequences to compare. Each one is read from
# `{data_dir}/{accession}.fna`. The order is load-bearing: position i here is
# labelled with `common_names[i]` when the distance matrix is built.
accessions = [
    'CM054508.1',      # Pan paniscus (Pygmy chimpanzee)
    'NC_001321.1',     # Balaenoptera physalus (Finback whale)
    'NC_001325.1',     # Phoca vitulina (Harbor seal)
    'NC_001601.1',     # Balaenoptera musculus (Blue whale)
    'NC_001602.1',     # Halichoerus grypus (Gray seal)
    'NC_001610.1',     # Didelphis virginiana (Opossum)
    'NC_001640.1',     # Equus caballus (Horse)
    'NC_001643.1',     # Pan troglodytes (Chimpanzee)
    'NC_001645.1',     # Gorilla (Gorilla)
    'NC_001700.1',     # Felis catus (Cat)
    'NC_001794.1',     # Macropus robustus (Wallaroo)
    'NC_001808.1',     # Ceratotherium simum (White rhinoceros)
    'NC_002083.1',     # Pongo abelii (Orangutan)
    'NC_010339.1',     # Mus musculus musculus (House mouse)
    'NC_012374.1',     # Rattus rattus (Rat)
    'OK135155.1',      # Bos taurus (Cow)
    'OM287160.1',      # Nomascus siki (Hylobatidae/Gibbon)
    'OM864526.1',      # Gallus gallus (Chicken)
    'OP605624.1'       # Homo sapiens (Human)
]

# Number of sequences; `compress_files` uses it to enumerate all unordered pairs.
count = len(accessions)

In [3]:
# Display labels for the distance matrices. Must stay in exactly the same
# order (and length) as `accessions`: entry i names the sequence `accessions[i]`.
common_names = [
    'Pygmy chimpanzee',
    'Finback whale',
    'Harbor seal',
    'Blue whale',
    'Gray seal',
    'Opossum',
    'Horse',
    'Chimpanzee',
    'Gorilla',
    'Cat',
    'Wallaroo',
    'White rhinoceros',
    'Orangutan',
    'House mouse',
    'Rat',
    'Cow',
    'Gibbon',
    'Chicken',
    'Human'
]

## Functions

### Compress

In [16]:
# Uses globals: accessions, count, data_dir, combined_dir; FASTA extension = .fna
def compress_files(compressor, verbose=False):
    """Run `compressor` on every single-sequence and every pairwise FASTA file.

    Reads the notebook globals `accessions`, `count`, `data_dir` and
    `combined_dir`; every input path ends in `.fna`.

    :param compressor: callable that takes a FASTA path and returns the path
        of the corresponding compressed file
    :param verbose: print a progress line before each call to `compressor`
    :return: dict mapping each `accession` and each `accessionA+accessionB`
        key to the value returned by `compressor`
    """
    results = {}

    # Individual sequences.
    for name in accessions:
        if verbose:
            print(f'Compressing {name}')
        results[name] = compressor(f'{data_dir}/{name}.fna')

    # Unordered pairs, keyed with the earlier list position first.
    for first in range(count):
        for second in range(first + 1, count):
            left, right = accessions[first], accessions[second]
            pair = f'{left}+{right}'
            if verbose:
                print(f'Compressing {left} and {right}')
            results[pair] = compressor(f'{combined_dir}/{pair}.fna')

    return results

### Distance Matrix Creation

In [17]:
import os

def distance(a, b, ab):
    """Compression-based distance between a and b, from on-disk file sizes.

    Evaluates ``1 - ((size(a) + size(b)) - size(ab)) / size(ab)``.

    :param a: path of compressed a
    :param b: path of compressed b
    :param ab: path of the compressed ab-merge
    :return: distance measure
    """
    size_a, size_b, size_ab = (os.path.getsize(path) for path in (a, b, ab))
    # Bytes saved by compressing the two sequences together instead of apart.
    saved = (size_a + size_b) - size_ab
    return 1 - (saved / size_ab)

In [18]:
# Uses global: accessions
from Bio.Phylo.TreeConstruction import DistanceMatrix
import os

def create_distance_matrix(names, compressed_files, distance=distance, verbose=False):
    """Build the pairwise distance matrix over the global `accessions`.

    :param names: labels handed to `DistanceMatrix`; expected in the same
        order as `accessions`
    :param compressed_files: dict keyed by `accession` and by
        `accessionA+accessionB` (earlier list position first), holding
        compressed-file paths
    :param distance: callable `(a_path, b_path, ab_path)` returning a number
    :param verbose: print every computed distance
    :return: the filled `DistanceMatrix`
    """
    def report_if_missing(path):
        # Diagnostic only: the message is printed and execution continues.
        if not os.path.exists(path):
            print(f'ERROR: {path} not exists')

    matrix = DistanceMatrix(names)

    for row, acc_a in enumerate(accessions):
        path_a = compressed_files[acc_a]
        report_if_missing(path_a)

        row_values = []
        for col, acc_b in enumerate(accessions):
            if col == row:
                # Diagonal: a sequence is at distance 0 from itself.
                row_values.append(0)
                continue

            path_b = compressed_files[acc_b]
            report_if_missing(path_b)

            # Pair keys always list the accession with the smaller index first.
            pair_key = f'{acc_a}+{acc_b}' if row < col else f'{acc_b}+{acc_a}'
            path_ab = compressed_files[pair_key]
            report_if_missing(path_ab)

            value = distance(path_a, path_b, path_ab)
            row_values.append(value)

            if verbose:
                print(f'distance({acc_a}, {acc_b}) = {value}')

        matrix[row] = row_values

    return matrix

In [19]:
import pandas as pd

def to_df(distane_matrix, names):
    """Convert a distance matrix into a square, labelled DataFrame.

    :param distane_matrix: iterable yielding one row of distances per label
    :param names: labels used for both the index and the columns
    :return: `pandas.DataFrame` with `names` on both axes
    """
    rows = [row for row in distane_matrix]
    frame = pd.DataFrame(rows, index=names, columns=names)
    return frame

---

## GenCompress

In [20]:
# Directory expected to already contain the GenCompress outputs (`*.GEN`).
compressed_dir = './data/gencompress'

def gencompress(fasta, overwrite=False, compressed_dir=compressed_dir):
    """Return the path of the GenCompress output that corresponds to `fasta`.

    No compression is run here; the function only maps `<any dir>/<name>` to
    `<compressed_dir>/<name>.GEN`.

    :param fasta: path of the FASTA file
    :param overwrite: unused; accepted so the signature matches the other
        compressor functions in this notebook
    :param compressed_dir: directory holding the `.GEN` files. It is bound
        when this cell runs, so later cells that reassign the global
        `compressed_dir` cannot redirect this function to another directory.
    :return: path of the `.GEN` file
    """
    return f'{compressed_dir}/{os.path.basename(fasta)}.GEN'

In [21]:
# Map every accession and every accession pair to its GenCompress output path.
gencompress_compressed_files = compress_files(gencompress)

In [23]:
# Pairwise distances from the GenCompress file sizes, labelled with common names.
gencompress_distance_matrix= create_distance_matrix(common_names, gencompress_compressed_files)

In [25]:
# Tabular view of the GenCompress matrix; `df` is reused by every section below.
df = to_df(gencompress_distance_matrix, common_names)
df

Unnamed: 0,Pygmy chimpanzee,Finback whale,Harbor seal,Blue whale,Gray seal,Opossum,Horse,Chimpanzee,Gorilla,Cat,Wallaroo,White rhinoceros,Orangutan,House mouse,Rat,Cow,Gibbon,Chicken,Human
Pygmy chimpanzee,0.0,0.989427,0.998172,0.996187,0.997023,0.99046,0.995159,0.681034,0.871585,0.995434,0.992908,1.000573,0.976207,0.996157,0.995697,0.997108,0.98141,0.995069,0.869599
Finback whale,0.989427,0.0,0.997704,0.818418,0.994813,0.996326,1.008003,0.989994,0.98982,0.997138,0.992991,1.002869,0.994888,0.999067,0.99297,1.000579,0.995482,0.994352,0.996076
Harbor seal,0.998172,0.997704,0.0,1.001831,0.668786,0.994067,0.99897,0.994718,0.995732,0.994994,0.996138,1.003979,0.9947,0.995472,0.997224,0.991307,0.993671,0.993824,0.994613
Blue whale,0.996187,0.818418,1.001831,0.0,0.989912,0.999542,1.001728,0.993617,0.995808,0.997709,0.990544,1.000691,0.996633,0.996016,0.990475,0.996157,0.995011,0.991435,0.999885
Gray seal,0.997023,0.994813,0.668786,0.989912,0.0,0.998977,0.997821,1.000457,0.996885,0.988653,0.995562,0.999085,0.997123,1.003919,0.996872,0.991882,0.99851,0.995197,0.995298
Opossum,0.99046,0.996326,0.994067,0.999542,0.998977,0.0,0.990786,0.987877,0.990739,0.998075,0.987297,0.995535,0.991691,0.996642,0.995603,0.997815,0.993795,0.994292,0.991156
Horse,0.995159,1.008003,0.99897,1.001728,0.997821,0.990786,0.0,0.994336,0.995589,0.993696,0.995655,0.987205,1.001612,0.99603,0.99907,1.002192,0.997231,0.995865,1.00343
Chimpanzee,0.681034,0.989994,0.994718,0.993617,1.000457,0.987877,0.994336,0.0,0.895073,0.998974,0.990696,0.996082,0.980371,0.997553,0.994164,0.997799,0.982922,0.993904,0.877173
Gorilla,0.871585,0.98982,0.995732,0.995808,0.996885,0.990739,0.995589,0.895073,0.0,0.994128,0.992158,0.996176,0.979536,0.994357,0.994603,0.993332,0.989246,0.993637,0.924613
Cat,0.995434,0.997138,0.994994,0.997709,0.988653,0.998075,0.993696,0.998974,0.994128,0.0,0.995127,0.995544,0.996447,0.995952,0.99931,0.994473,0.995536,0.994647,0.994746


In [26]:
# Saved beside the compressed files; the target depends on the current global `compressed_dir`.
df.to_csv(f'{compressed_dir}/gencompress_distance-matrix.csv')

## bzip3

In [30]:
import os

compressed_dir = './data/bzip3'

if overwrite or not os.path.exists(compressed_dir):
    !rm -Rf {compressed_dir}
    !mkdir -p {compressed_dir}

def bzip3(fasta, overwrite=overwrite):
    compressed = f'{fasta}.bz3'
    compressed_file = f'{compressed_dir}/{os.path.split(fasta)[-1]}.bz3'
    if overwrite or not os.path.exists(compressed_file):
        !bzip3 {fasta}
        !mv {compressed} {compressed_dir}
    return compressed_file

In [31]:
# Compress (or reuse) every single and pairwise FASTA with bzip3; values are the .bz3 paths.
bzip3_compressed_files = compress_files(bzip3)

In [33]:
# Pairwise distances from the bzip3 file sizes, labelled with common names.
bzip3_distance_matrix= create_distance_matrix(common_names, bzip3_compressed_files)

In [34]:
# Tabular view of the bzip3 matrix (rebinds the shared name `df`).
df = to_df(bzip3_distance_matrix, common_names)
df

Unnamed: 0,Pygmy chimpanzee,Finback whale,Harbor seal,Blue whale,Gray seal,Opossum,Horse,Chimpanzee,Gorilla,Cat,Wallaroo,White rhinoceros,Orangutan,House mouse,Rat,Cow,Gibbon,Chicken,Human
Pygmy chimpanzee,0.0,0.981426,0.982196,0.981998,0.982883,0.986832,0.981516,0.932654,0.960465,0.982981,0.983646,0.981967,0.97486,0.983296,0.982255,0.98256,0.975405,0.983556,0.962946
Finback whale,0.981426,0.0,0.98174,0.945171,0.981843,0.985821,0.980342,0.98255,0.983063,0.981362,0.982147,0.980321,0.983677,0.981882,0.981912,0.980073,0.983366,0.984987,0.982827
Harbor seal,0.982196,0.98174,0.0,0.981132,0.923738,0.986078,0.979246,0.982373,0.983351,0.976885,0.98244,0.980048,0.983137,0.982184,0.981977,0.980865,0.983884,0.98502,0.983232
Blue whale,0.981998,0.945171,0.981132,0.0,0.98206,0.986041,0.979133,0.982295,0.982093,0.981577,0.981659,0.98042,0.982829,0.981383,0.981174,0.979812,0.982757,0.984736,0.982809
Gray seal,0.982883,0.981843,0.923738,0.98206,0.0,0.987108,0.978522,0.982124,0.983572,0.977803,0.982426,0.980503,0.983593,0.982407,0.983027,0.98085,0.983637,0.985356,0.984036
Opossum,0.986832,0.985821,0.986078,0.986041,0.987108,0.0,0.983674,0.987371,0.98813,0.986261,0.980201,0.984125,0.989192,0.980969,0.983488,0.984962,0.989684,0.99158,0.988443
Horse,0.981516,0.980342,0.979246,0.979133,0.978522,0.983674,0.0,0.980865,0.981725,0.979465,0.978956,0.973279,0.981867,0.979702,0.98081,0.980765,0.982505,0.983891,0.981499
Chimpanzee,0.932654,0.98255,0.982373,0.982295,0.982124,0.987371,0.980865,0.0,0.963051,0.983625,0.983593,0.982735,0.975138,0.98348,0.982912,0.983452,0.974005,0.983971,0.962333
Gorilla,0.960465,0.983063,0.983351,0.982093,0.983572,0.98813,0.981725,0.963051,0.0,0.984017,0.983752,0.982417,0.974175,0.983641,0.983429,0.983255,0.974847,0.983308,0.968761
Cat,0.982981,0.981362,0.976885,0.981577,0.977803,0.986261,0.979465,0.983625,0.984017,0.0,0.98241,0.980381,0.985436,0.982155,0.981713,0.981548,0.983498,0.986477,0.983895


In [35]:
# Saved beside the compressed files; the target depends on the current global `compressed_dir`.
df.to_csv(f'{compressed_dir}/bzip3_distance-matrix.csv')

## fastGzip: gzip -1 -n

In [36]:
import os

compressed_dir = './data/fastgzip'

if overwrite or not os.path.exists(compressed_dir):
    !rm -Rf {compressed_dir}
    !mkdir -p {compressed_dir}

def fast_gzip(fasta, overwrite=overwrite):
    compressed = f'{fasta}.gz'
    compressed_file = f'{compressed_dir}/{os.path.split(fasta)[-1]}.gz'
    if overwrite or not os.path.exists(compressed_file):
        !gzip -1 -n {fasta}
        !cp {compressed} {compressed_dir}
        !gunzip {compressed}
    return compressed_file

In [37]:
# Compress (or reuse) every single and pairwise FASTA with gzip -1; values are the .gz paths.
fastgzip_compressed_files = compress_files(fast_gzip)

In [39]:
# Pairwise distances from the gzip -1 file sizes, labelled with common names.
fastgzip_distance_matrix = create_distance_matrix(common_names, fastgzip_compressed_files)

In [40]:
# Tabular view of the gzip -1 matrix (rebinds the shared name `df`).
df = to_df(fastgzip_distance_matrix, common_names)
df

Unnamed: 0,Pygmy chimpanzee,Finback whale,Harbor seal,Blue whale,Gray seal,Opossum,Horse,Chimpanzee,Gorilla,Cat,Wallaroo,White rhinoceros,Orangutan,House mouse,Rat,Cow,Gibbon,Chicken,Human
Pygmy chimpanzee,0.0,0.991633,0.991694,0.992041,0.993194,0.99181,0.990798,0.99056,0.989996,0.991048,0.991816,0.989709,0.98985,0.991073,0.990816,0.98932,0.98976,0.990344,0.989198
Finback whale,0.991633,0.0,0.988809,0.985573,0.991605,0.989108,0.987799,0.98877,0.990004,0.989701,0.989031,0.991173,0.988565,0.989618,0.989445,0.9901,0.989508,0.989586,0.989888
Harbor seal,0.991694,0.988809,0.0,0.989298,0.9844,0.990119,0.988659,0.987998,0.988963,0.99096,0.990295,0.988767,0.988049,0.989695,0.989523,0.989317,0.987785,0.989747,0.989195
Blue whale,0.992041,0.985573,0.989298,0.0,0.990986,0.990706,0.988808,0.98883,0.9905,0.990361,0.99165,0.98995,0.988019,0.988463,0.990028,0.987479,0.988705,0.989819,0.990123
Gray seal,0.993194,0.991605,0.9844,0.990986,0.0,0.991361,0.990425,0.988391,0.989189,0.99051,0.99086,0.990102,0.987669,0.989751,0.99044,0.988166,0.988695,0.988779,0.989505
Opossum,0.99181,0.989108,0.990119,0.990706,0.991361,0.0,0.988362,0.990339,0.990031,0.990068,0.989659,0.991441,0.988949,0.991873,0.991361,0.990637,0.989199,0.991728,0.991103
Horse,0.990798,0.987799,0.988659,0.988808,0.990425,0.988362,0.0,0.989135,0.988558,0.990061,0.990498,0.988534,0.988931,0.989466,0.991452,0.988655,0.989184,0.989692,0.989565
Chimpanzee,0.99056,0.98877,0.987998,0.98883,0.988391,0.990339,0.989135,0.0,0.989957,0.991183,0.989583,0.990781,0.987404,0.991039,0.991727,0.98816,0.990149,0.989371,0.9889
Gorilla,0.989996,0.990004,0.988963,0.9905,0.989189,0.990031,0.988558,0.989957,0.0,0.991731,0.990976,0.990731,0.987422,0.989863,0.990556,0.988269,0.989405,0.989141,0.99013
Cat,0.991048,0.989701,0.99096,0.990361,0.99051,0.990068,0.990061,0.991183,0.991731,0.0,0.990749,0.990759,0.989456,0.990159,0.990416,0.990463,0.989109,0.990712,0.99093


In [41]:
# Saved beside the compressed files; the target depends on the current global `compressed_dir`.
df.to_csv(f'{compressed_dir}/fastgzip_distance-matrix.csv')

## genozip: genozip --best --multiseq --quiet --no-tip --force

In [57]:
import os

compressed_dir = './data/genozip'

if overwrite or not os.path.exists(compressed_dir):
    !rm -Rf {compressed_dir}
    !mkdir -p {compressed_dir}


def genozip(fasta, overwrite=overwrite):
    compressed = f'{fasta}.genozip'
    compressed_file = f'{compressed_dir}/{os.path.split(fasta)[-1]}.genozip'
    if overwrite or not os.path.exists(compressed_file):
        !genozip --best --multiseq --quiet --no-tip --force {fasta}
        !mv {compressed} {compressed_dir}
    return compressed_file

In [58]:
# Compress (or reuse) every single and pairwise FASTA with genozip --best; values are the .genozip paths.
genozip_compressed_files = compress_files(genozip)

In [59]:
# Pairwise distances from the genozip --best file sizes, labelled with common names.
genozip_distance_matrix = create_distance_matrix(common_names, genozip_compressed_files)

In [60]:
# Tabular view of the genozip --best matrix (rebinds the shared name `df`).
df = to_df(genozip_distance_matrix, common_names)
df

Unnamed: 0,Pygmy chimpanzee,Finback whale,Harbor seal,Blue whale,Gray seal,Opossum,Horse,Chimpanzee,Gorilla,Cat,Wallaroo,White rhinoceros,Orangutan,House mouse,Rat,Cow,Gibbon,Chicken,Human
Pygmy chimpanzee,0.0,0.753149,0.756864,0.752919,0.756619,0.758305,0.760064,0.464438,0.574019,0.757015,0.756508,0.756287,0.643937,0.74836,0.754316,0.731193,0.631481,0.732098,0.53447
Finback whale,0.753149,0.0,0.677599,0.421333,0.688705,0.705579,0.668686,0.702451,0.702808,0.693488,0.70426,0.659172,0.703539,0.739401,0.701572,0.698789,0.746381,0.746298,0.750834
Harbor seal,0.756864,0.677599,0.0,0.663407,0.349059,0.709343,0.642839,0.706264,0.707564,0.617892,0.708854,0.627561,0.707033,0.742859,0.706669,0.699732,0.752294,0.749282,0.753554
Blue whale,0.752919,0.421333,0.663407,0.0,0.667741,0.705949,0.654111,0.701457,0.7016,0.668588,0.703066,0.638938,0.702755,0.73126,0.700994,0.686271,0.745651,0.746651,0.750601
Gray seal,0.756619,0.688705,0.349059,0.667741,0.0,0.708965,0.643661,0.705986,0.706974,0.617325,0.708167,0.627582,0.706548,0.741915,0.706806,0.699025,0.74961,0.750097,0.753497
Opossum,0.758305,0.705579,0.709343,0.705949,0.708965,0.0,0.711234,0.707266,0.706802,0.70841,0.680072,0.708017,0.707415,0.743231,0.704016,0.748687,0.751814,0.752195,0.754429
Horse,0.760064,0.668686,0.642839,0.654111,0.643661,0.711234,0.0,0.708879,0.708004,0.640285,0.708367,0.585496,0.709031,0.746402,0.707524,0.6916,0.751954,0.752238,0.755629
Chimpanzee,0.464438,0.702451,0.706264,0.701457,0.705986,0.707266,0.708879,0.0,0.500796,0.70647,0.70491,0.703882,0.576237,0.741555,0.701813,0.746538,0.64398,0.74744,0.531008
Gorilla,0.574019,0.702808,0.707564,0.7016,0.706974,0.706802,0.708004,0.500796,0.0,0.706523,0.705897,0.705285,0.575555,0.742022,0.704388,0.74603,0.649342,0.749589,0.564258
Cat,0.757015,0.693488,0.617892,0.668588,0.617325,0.70841,0.640285,0.70647,0.706523,0.0,0.706999,0.624767,0.70703,0.742762,0.701766,0.697112,0.750524,0.750039,0.753048


In [61]:
# Saved beside the compressed files; the target depends on the current global `compressed_dir`.
df.to_csv(f'{compressed_dir}/genozip_distance-matrix.csv')

## fastgenozip: genozip --fast --multiseq --quiet --no-tip --force

In [62]:
import os

compressed_dir = './data/fastgenozip'

if overwrite or not os.path.exists(compressed_dir):
    !rm -Rf {compressed_dir}
    !mkdir -p {compressed_dir}


def fastgenozip(fasta, overwrite=overwrite):
    compressed = f'{fasta}.genozip'
    compressed_file = f'{compressed_dir}/{os.path.split(fasta)[-1]}.genozip'
    if overwrite or not os.path.exists(compressed_file):
        !genozip --fast --multiseq --quiet --no-tip --force {fasta}
        !mv {compressed} {compressed_dir}
    return compressed_file

In [63]:
# Compress (or reuse) every single and pairwise FASTA with genozip --fast; values are the .genozip paths.
fastgenozip_compressed_files = compress_files(fastgenozip)

In [52]:
# Pairwise distances from the genozip --fast file sizes, labelled with common names.
fastgenozip_distance_matrix = create_distance_matrix(common_names, fastgenozip_compressed_files)

In [66]:
# Tabular view of the genozip --fast matrix (rebinds the shared name `df`).
df = to_df(fastgenozip_distance_matrix, common_names)
df

Unnamed: 0,Pygmy chimpanzee,Finback whale,Harbor seal,Blue whale,Gray seal,Opossum,Horse,Chimpanzee,Gorilla,Cat,Wallaroo,White rhinoceros,Orangutan,House mouse,Rat,Cow,Gibbon,Chicken,Human
Pygmy chimpanzee,0.0,0.759447,0.762615,0.758673,0.763116,0.76412,0.769271,0.474367,0.759613,0.762387,0.756202,0.75724,0.758463,0.754926,0.759063,0.734805,0.736531,0.737808,0.537605
Finback whale,0.759447,0.0,0.707522,0.42653,0.707465,0.708527,0.705372,0.704315,0.715006,0.707362,0.700752,0.702469,0.705556,0.745313,0.71246,0.749374,0.751345,0.753047,0.756308
Harbor seal,0.762615,0.707522,0.0,0.707808,0.353167,0.712362,0.717513,0.709143,0.709265,0.710479,0.703739,0.704106,0.708828,0.750058,0.707636,0.760613,0.752939,0.755386,0.767525
Blue whale,0.758673,0.42653,0.707808,0.0,0.70837,0.708607,0.705556,0.704395,0.705341,0.708058,0.700624,0.701615,0.704175,0.735512,0.705045,0.748962,0.759454,0.751962,0.756863
Gray seal,0.763116,0.707465,0.353167,0.70837,0.0,0.71057,0.709198,0.7096,0.709209,0.709507,0.704402,0.703639,0.706916,0.748259,0.707889,0.753255,0.753267,0.754753,0.758823
Opossum,0.76412,0.708527,0.712362,0.708607,0.71057,0.0,0.709436,0.71169,0.72112,0.720872,0.703907,0.704892,0.722591,0.747321,0.705111,0.753786,0.766511,0.758637,0.762006
Horse,0.769271,0.705372,0.717513,0.705556,0.709198,0.709436,0.0,0.715752,0.706502,0.718332,0.712254,0.702987,0.706378,0.746069,0.706522,0.750702,0.751108,0.75271,0.758251
Chimpanzee,0.474367,0.704315,0.709143,0.704395,0.7096,0.71169,0.715752,0.0,0.70482,0.707645,0.700527,0.70131,0.704386,0.748366,0.704942,0.750762,0.751752,0.752676,0.537315
Gorilla,0.759613,0.715006,0.709265,0.705341,0.709209,0.72112,0.706502,0.70482,0.0,0.708277,0.700833,0.710501,0.704182,0.757035,0.705997,0.749628,0.751016,0.751849,0.765412
Cat,0.762387,0.707362,0.710479,0.708058,0.709507,0.720872,0.718332,0.707645,0.708277,0.0,0.713345,0.705697,0.708254,0.748998,0.706855,0.752531,0.75389,0.754986,0.758096


In [67]:
# Saved beside the compressed files; the target depends on the current global `compressed_dir`.
df.to_csv(f'{compressed_dir}/fastgenozip_distance-matrix.csv')