In [5]:
import time

# Not Classified Analysis

The goal of this notebook is analyzing and plotting the difference in the number of labelled reads between the outputs of different classifiers.
In particular, we WON'T FOCUS ON PRECISION but only on the number of classified reads: the reads that are not classified by all the examined classifiers, those that are labelled by only one or some classifiers and not by others, and so on. So, we want to plot the consistency between the different outputs of the classifiers and understand whether our reassignment procedure is meaningful or not.

## Provided Classifiers outputs --> The goal is boosting their recall (i.e., the number of classified reads, not precision)

In [6]:
# all_250000 dataset
# All provided classifier outputs sit in the same directory and share the
# "strex_<classifier>_250000.res" naming scheme, so the common parts are written
# once here; change them to point the notebook at another dataset.
CLASSIFIERS_RESULTS_DIR = "../../Bio_Project/SimDataset/classifiers_results"
CLASSIFIER_RESULT_TEMPLATE = CLASSIFIERS_RESULTS_DIR + "/strex_{}_250000.res"

classifier_path_centrifuge = CLASSIFIER_RESULT_TEMPLATE.format("centrifuge")
classifier_path_kraken1 = CLASSIFIER_RESULT_TEMPLATE.format("kraken1")
classifier_path_kraken2 = CLASSIFIER_RESULT_TEMPLATE.format("kraken2")
classifier_path_clark_species = CLASSIFIER_RESULT_TEMPLATE.format("clark_species")
classifier_path_clark_genus = CLASSIFIER_RESULT_TEMPLATE.format("clark_genus")

In [7]:
# load the provided classifiers outputs
# Each call returns a dict mapping read id -> class found by that classifier.
# NOTE: load_classifier_result is defined in the "UTILITY FUNCTIONS" section at
# the end of this notebook; those cells must be executed before this one.
centrifuge = load_classifier_result(classifier_path_centrifuge)
kraken1 = load_classifier_result(classifier_path_kraken1)
kraken2 = load_classifier_result(classifier_path_kraken2)
clark_species = load_classifier_result(classifier_path_clark_species)
clark_genus = load_classifier_result(classifier_path_clark_genus)

## SINGLE CLASSIFIER

##### Classifications obtained by our reassignment procedure --> Single Classifier Output + LiME_binning + TOTAL Reassignment (Major Vote Rule) --> ST

In [8]:
# paths to ST classifiers outputs
# ST = single classifier output + LiME_binning + TOTAL reassignment.
# NOTE(review): the file name contains "25000" while the dataset is "all_250000";
# confirm this matches the file on disk and is not a dropped zero.
classifier_path_centrifuge_ST = "../pythonProgram/all_250000_1.fq_strex_centrifuge_25000.totalReassignment.res"

In [9]:
# load ST classifiers outputs
# Result is a dict mapping read id -> class, same layout as the plain centrifuge output.
centrifuge_ST = load_classifier_result(classifier_path_centrifuge_ST)

##### Classifications obtained by our reassignment procedure --> Single Classifier Output + LiME_binning + PARTIAL Reassignment (Major Vote Rule) --> SP

In [10]:
# paths to SP classifiers outputs
# SP = single classifier output + LiME_binning + PARTIAL reassignment.
# NOTE(review): the file name contains "25000" while the dataset is "all_250000";
# confirm this matches the file on disk and is not a dropped zero.
classifier_path_centrifuge_SP = "../pythonProgram/all_250000_1.fq_strex_centrifuge_25000.partialReassignment.res"

In [11]:
# load SP classifiers outputs
# Result is a dict mapping read id -> class, same layout as the plain centrifuge output.
centrifuge_SP = load_classifier_result(classifier_path_centrifuge_SP)

##### Classifications obtained by our reassignment procedure --> Single Classifier Output + LiME_binning + TOTAL Reassignment (Major Vote Rule) + ZERO trick --> STZ

In [12]:
# paths to STZ classifiers outputs
# STZ = single classifier output + LiME_binning + TOTAL reassignment + ZERO trick.
# NOTE(review): the file name contains "25000" while the dataset is "all_250000";
# confirm this matches the file on disk and is not a dropped zero.
classifier_path_centrifuge_STZ = "../pythonProgram/all_250000_1.fq_strex_centrifuge_25000.totalReassignment.zero_version.res"

In [13]:
# load STZ classifiers outputs
# Result is a dict mapping read id -> class, same layout as the plain centrifuge output.
centrifuge_STZ = load_classifier_result(classifier_path_centrifuge_STZ)

##### Classifications obtained by our reassignment procedure --> Single Classifier Output + LiME_binning + PARTIAL Reassignment (Major Vote Rule) + ZERO trick --> SPZ

In [14]:
# paths to SPZ classifiers outputs
# SPZ = single classifier output + LiME_binning + PARTIAL reassignment + ZERO trick.
# NOTE(review): the file name contains "25000" while the dataset is "all_250000";
# confirm this matches the file on disk and is not a dropped zero.
classifier_path_centrifuge_SPZ = "../pythonProgram/all_250000_1.fq_strex_centrifuge_25000.partialReassignment.zero_version.res"

In [15]:
# load SPZ classifiers outputs
# Result is a dict mapping read id -> class, same layout as the plain centrifuge output.
centrifuge_SPZ = load_classifier_result(classifier_path_centrifuge_SPZ)

## MULTI CLASSIFIER

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + TOTAL Reassignment (Major Vote Rule) + VERSION 1

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + TOTAL Reassignment (Major Vote Rule) + VERSION 2

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + PARTIAL Reassignment (Major Vote Rule) + VERSION 1 

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + PARTIAL Reassignment (Major Vote Rule) + VERSION 2 

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + TOTAL Reassignment (Major Vote Rule) + VERSION 1 + ZERO trick

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + TOTAL Reassignment (Major Vote Rule) + VERSION 2 + ZERO trick

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + PARTIAL Reassignment (Major Vote Rule) + VERSION 1 + ZERO trick

##### Classifications obtained by our reassignment procedure --> Multi Classifier Output + LiME_binning + PARTIAL Reassignment (Major Vote Rule) + VERSION 2 + ZERO trick

## ANALYSIS

The following block will be about the comparisons of the classification output obtained by applying our procedure to centrifuge classification output.

In [23]:
# Every non_labelled_* value is the two-element list returned by
# non_classified_reads:
#   [0] -> dict keyed by the ids of the reads left unlabelled
#   [1] -> number of such reads
non_labelled_centrifuge = non_classified_reads(centrifuge)
print("non labelled reads centrifuge: ", non_labelled_centrifuge[1])

non_labelled_centrifuge_ST = non_classified_reads(centrifuge_ST)
print("non labelled reads centrifuge_ST: ", non_labelled_centrifuge_ST[1])

non_labelled_centrifuge_SP = non_classified_reads(centrifuge_SP)
print("non labelled reads centrifuge_SP: ", non_labelled_centrifuge_SP[1])

non_labelled_centrifuge_STZ = non_classified_reads(centrifuge_STZ)
# label fixed: it used to read "centrifue_STZ"
print("non labelled reads centrifuge_STZ: ", non_labelled_centrifuge_STZ[1])

non_labelled_centrifuge_SPZ = non_classified_reads(centrifuge_SPZ)
print("non labelled reads centrifuge_SPZ: ", non_labelled_centrifuge_SPZ[1])

non labelled reads centrifuge:  345022
non labelled reads centrifuge_ST:  332220
non labelled reads centrifuge_SP:  274914
non labelled reads centrifue_STZ:  57557
non labelled reads centrifuge_SPZ:  57557


In [62]:
# Reads left unlabelled both by plain centrifuge and by each reassigned variant.
# All four intersections are computed first, then reported in the same order.
common_non_labelled_reads_ST = intersection(
    non_labelled_centrifuge[0],
    non_labelled_centrifuge_ST[0],
)
common_non_labelled_reads_SP = intersection(
    non_labelled_centrifuge[0],
    non_labelled_centrifuge_SP[0],
)
common_non_labelled_reads_STZ = intersection(
    non_labelled_centrifuge[0],
    non_labelled_centrifuge_STZ[0],
)
common_non_labelled_reads_SPZ = intersection(
    non_labelled_centrifuge[0],
    non_labelled_centrifuge_SPZ[0],
)

print("Common non labelled reads ST: ", len(common_non_labelled_reads_ST))
print("Common non labelled reads SP: ", len(common_non_labelled_reads_SP))
print("Common non labelled reads STZ: ", len(common_non_labelled_reads_STZ))
print("Common non labelled reads SPZ: ", len(common_non_labelled_reads_SPZ))

Common non labelled reads ST:  274974
Common non labelled reads SP:  274914
Common non labelled reads STZ:  57557
Common non labelled reads SPZ:  57557


In [27]:
# intersection ST and SP
# Reads that stay unlabelled under both the TOTAL (ST) and PARTIAL (SP) reassignment.
# NOTE(review): the saved output of this cell lacks the "b/t" present in the
# message below, so it predates the current code; re-run the cell to refresh it.
common_non_labelled_reads_ST_SP = intersection(non_labelled_centrifuge_ST[0], non_labelled_centrifuge_SP[0])
print("Common non labelled reads b/t centrifuge_ST and centrifuge_SP: ", len(common_non_labelled_reads_ST_SP))

Common non labelled reads centrifuge_ST and centrifuge_SP:  273710


In [28]:
# intersection STZ and SPZ
# Reads that stay unlabelled under both ZERO-trick variants (STZ and SPZ).
common_non_labelled_reads_STZ_SPZ = intersection(non_labelled_centrifuge_STZ[0], non_labelled_centrifuge_SPZ[0])
print("Common non labelled reads b/t centrifuge_STZ and centrifuge_SPZ: ", len(common_non_labelled_reads_STZ_SPZ))

Common non labelled reads b/t centrifuge_STZ and centrifuge_SPZ:  57557


#### Not labelled by centrifuge, labelled by centrifuge_ST

In [40]:
# difference(a, b) keeps the reads of a that are absent from b, so this is the
# list of reads unlabelled by plain centrifuge that are NOT unlabelled by centrifuge_ST.
# (Treating them as "labelled by centrifuge_ST" presumes both outputs cover the
# same set of reads — verify.)
difference_std_ST = difference(non_labelled_centrifuge[0], non_labelled_centrifuge_ST[0])
print("Number of reads not labelled by centrifuge that are labelled by centrifuge_ST: ", len(difference_std_ST))

Number of reads not labelled by centrifuge that are labelled by centrifuge_ST:  70048


#### Not labelled by centrifuge_ST, labelled by centrifuge

In [60]:
# Opposite direction of the previous difference: reads unlabelled by centrifuge_ST
# that are NOT unlabelled by plain centrifuge.
# (Treating them as "labelled by centrifuge" presumes both outputs cover the
# same set of reads — verify.)
difference_ST_std = difference(non_labelled_centrifuge_ST[0], non_labelled_centrifuge[0])
print("Number of reads not labelled by centrifuge_ST that are labelled by centrifuge: ", len(difference_ST_std))

Number of reads not labelled by centrifuge_ST that are labelled by centrifuge:  57246


#### Check if numbers are OK

In [59]:
# Sanity check: each unlabelled set must split exactly into the reads it shares
# with the other classifier (intersection) plus the reads exclusive to it
# (difference). A mismatch is reported explicitly; previously the cell printed
# nothing in that case, so a failed check could go unnoticed.
if len(difference_std_ST) + len(common_non_labelled_reads_ST) == non_labelled_centrifuge[1]:
    print(non_labelled_centrifuge[1], "--> OK")
else:
    print(non_labelled_centrifuge[1], "--> MISMATCH, difference + intersection =",
          len(difference_std_ST) + len(common_non_labelled_reads_ST))

if len(difference_ST_std) + len(common_non_labelled_reads_ST) == non_labelled_centrifuge_ST[1]:
    print(non_labelled_centrifuge_ST[1], "--> OK")
else:
    print(non_labelled_centrifuge_ST[1], "--> MISMATCH, difference + intersection =",
          len(difference_ST_std) + len(common_non_labelled_reads_ST))

345022 --> OK
332220 --> OK


## Absolute improvement in the number of unlabelled reads

In [31]:
# Improvement = how many fewer reads are left unlabelled after reassignment,
# measured against plain centrifuge (index 1 of each non_labelled_* list is the count).
improvement_ST = non_labelled_centrifuge[1] - non_labelled_centrifuge_ST[1]  # centrifuge vs centrifuge_ST
improvement_SP = non_labelled_centrifuge[1] - non_labelled_centrifuge_SP[1]  # centrifuge vs centrifuge_SP

print("CENTRIFUGE CLASSIFICATION improvement in number of classified reads with ST: ", improvement_ST)
print("CENTRIFUGE CLASSIFICATION improvement in number of classified reads with SP: ", improvement_SP)

CENTRIFUGE CLASSIFICATION improvement in number of classified reads with ST:  12802
CENTRIFUGE CLASSIFICATION improvement in number of classified reads with SP:  70108


In [71]:
# Preview at most the first 20 reads of each difference list together with the
# class given by both classifiers. Slicing (instead of indexing 0..19) keeps the
# cell from raising IndexError when a list holds fewer than 20 reads.
for read in difference_std_ST[:20]:
    print(read, "--> centrifuge: ", centrifuge[read], " centrifuge_ST: ", centrifuge_ST[read])
print(" ")
for read in difference_ST_std[:20]:
    print(read, "--> centrifuge_ST", centrifuge_ST[read], " centrifuge: ", centrifuge[read])

taxid_1042876.661 --> centrifuge:  0  centrifuge_ST:  1196325
taxid_1042876.971 --> centrifuge:  0  centrifuge_ST:  1235689
taxid_1042876.1601 --> centrifuge:  0  centrifuge_ST:  1196325
taxid_1042876.1782 --> centrifuge:  0  centrifuge_ST:  1215088
taxid_1042876.2003 --> centrifuge:  0  centrifuge_ST:  264730
taxid_1042876.2451 --> centrifuge:  0  centrifuge_ST:  487
taxid_1042876.2557 --> centrifuge:  0  centrifuge_ST:  487
taxid_1042876.2995 --> centrifuge:  0  centrifuge_ST:  1215088
taxid_1042876.3043 --> centrifuge:  0  centrifuge_ST:  76759
taxid_1042876.3566 --> centrifuge:  0  centrifuge_ST:  316
taxid_1042876.3622 --> centrifuge:  0  centrifuge_ST:  287
taxid_1042876.3768 --> centrifuge:  0  centrifuge_ST:  487
taxid_1042876.4128 --> centrifuge:  0  centrifuge_ST:  76759
taxid_1042876.4491 --> centrifuge:  0  centrifuge_ST:  1331671
taxid_1042876.5099 --> centrifuge:  0  centrifuge_ST:  1331671
taxid_1042876.5330 --> centrifuge:  0  centrifuge_ST:  487
taxid_1042876.5759 --> 

### UTILITY FUNCTIONS

#### Load classifier result function
It returns a dictionary where:

- key = read id
- value = class found by classifier

In [2]:
def load_classifier_result(path):
    """Read a classifier output file into a dictionary.

    Every non-blank line is split on whitespace: the first field is taken as
    the read id and the second as the class assigned to that read. Any further
    fields on the line are ignored. If a read id appears more than once, the
    last occurrence wins. Blank lines are skipped.

    Parameters
    ----------
    path : str
        Location of the classifier result file.

    Returns
    -------
    dict
        Maps read id (str) to the class found by the classifier (str).

    Raises
    ------
    IndexError
        If a non-blank line has fewer than two whitespace-separated fields.
    """
    classifier_results = {}

    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r') as classification:
        for line in classification:
            fields = line.split()  # split once per line, not once per field
            if not fields:
                # blank line (e.g. a trailing empty line): nothing to record
                continue
            classifier_results[fields[0]] = fields[1]

    return classifier_results

#### Non classified reads function
It returns a list with two elements:

- dict whose keys are the read ids of the non classified reads
- number of non classified reads

In [3]:
def non_classified_reads(classifier_output):
    """Collect the reads a classifier left without a label.

    A read counts as non classified when its class is exactly the string '0'.

    Parameters
    ----------
    classifier_output : dict
        Maps read id to the class assigned by the classifier.

    Returns
    -------
    list
        Two elements: a dict keyed by the ids of the non classified reads
        (every value is the empty string), and the number of such reads.
    """
    unlabelled_ids = (
        read_id
        for read_id, label in classifier_output.items()
        if label == '0'
    )
    # Only the keys carry information; values are '' placeholders.
    unlabelled = dict.fromkeys(unlabelled_ids, '')
    return [unlabelled, len(unlabelled)]
        

#### Intersection function
It returns a list with the reads present in both dictionaries:

In [4]:
def intersection(dict1, dict2):
    """Return the reads that are keys of both dictionaries.

    Parameters
    ----------
    dict1, dict2 : dict
        Dictionaries keyed by read id; the values are not inspected.

    Returns
    -------
    list
        Keys of dict1 that are also keys of dict2, in dict1's iteration order.
    """
    candidates = dict1.keys()
    other_keys = dict2.keys()
    return [read for read in candidates if read in other_keys]

#### Difference function
It returns a list of reads that are present in dict1 and not in dict2, NOT the reads in dict2 that are not present in dict1.

In [33]:
def difference(dict1, dict2):
    """Return the reads that are keys of dict1 but not of dict2.

    The operation is one-directional: keys found only in dict2 are not reported.

    Parameters
    ----------
    dict1, dict2 : dict
        Dictionaries keyed by read id; the values are not inspected.

    Returns
    -------
    list
        Keys of dict1 that are missing from dict2, in dict1's iteration order.
    """
    candidates = dict1.keys()
    excluded = dict2.keys()
    return [read for read in candidates if read not in excluded]