In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

seaborn 0.10.1
numpy   1.18.4
pandas  1.0.4



In [2]:
import requests

from io import StringIO

In [25]:
from collections import defaultdict

# Download the truth table as CSV from its Google Sheets export link.
# NOTE: despite its name, `url` holds the HTTP response object.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on a 4xx/5xx reply instead of letting
# pd.read_csv parse an error page as if it were data.
url = requests.get(
    'https://docs.google.com/spreadsheets/d/1OYgLNTSd1BvFyGk9ps6xEaEk1KGoPx5VIgb_awHUXR4/export?format=csv',
    timeout=60,
)
url.raise_for_status()
csv_raw = StringIO(url.text)
df_truth = pd.read_csv(csv_raw)

# Path to a taxonomy file (from its name, GTDB r95 representatives); it is not
# read anywhere in the cells visible here.
inf_tax_file = "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/r95.gtdb.tax"

In [26]:
# Locations of the two classification summary tables (archaea, bacteria),
# both tab-separated files in the same "classify" directory.
inf_archaea_summary, inf_bacteria_summary = (
    "/mnt/btrfs/data/type_1/gtdbtk_strains_lin/classify/gtdbtk.ar122.summary.tsv",
    "/mnt/btrfs/data/type_1/gtdbtk_strains_lin/classify/gtdbtk.bac120.summary.tsv",
)


In [27]:
# Parse both tab-separated summary files with identical reader settings;
# the generator is consumed in order: archaea first, then bacteria.
df_archaea_summary, df_bacteria_summary = (
    pd.read_csv(summary_path, sep="\t")
    for summary_path in (inf_archaea_summary, inf_bacteria_summary)
)

In [28]:
# Stack the archaeal and bacterial summaries into one table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# input keeps its own labels, so labels 0, 1, ... occur twice and label-based
# lookups (e.g. .loc[0]) would return several rows.
df_summary = pd.concat([df_archaea_summary, df_bacteria_summary], ignore_index=True)

In [43]:
# Preview the first five rows of the combined summary table.
df_summary.head(n=5)

Unnamed: 0,user_genome,classification,fastani_reference,fastani_reference_radius,fastani_taxonomy,fastani_ani,fastani_af,closest_placement_reference,closest_placement_radius,closest_placement_taxonomy,...,closest_placement_af,pplacer_taxonomy,classification_method,note,"other_related_references(genome_id,species_name,radius,ANI,AF)",aa_percent,translation_table,red_value,warnings,mapping_genome
0,GCA_000182965.lin.noplasmid,d__Archaea;p__Thermoproteota;c__Nitrososphaeri...,,,,,,GCA_003661605.1,95.0,d__Archaea;p__Thermoproteota;c__Nitrososphaeri...,...,0.0,d__Archaea;p__Thermoproteota;c__Nitrososphaeri...,taxonomic classification defined by topology a...,,,52.69,11,0.453623,Genome has more than 10.7% of markers with mul...,GCA_000182965
1,scerevisiae_pb.lin.noplasmid,d__Archaea;p__Asgardarchaeota;c__Heimdallarcha...,,,,,,,,,...,,d__Archaea;p__Asgardarchaeota;c__Heimdallarcha...,taxonomic novelty determined using RED,,,37.65,11,0.372364,Genome has more than 27.0% of markers with mul...,scerevisiae_pb
0,GCA_000006885.lin.noplasmid,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,GCF_001457635.1,95.0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,98.67,0.91,GCF_001457635.1,95.0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,...,0.91,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCF_002087075.1, s__Streptococcus pseudopneumo...",98.02,11,,,GCA_000006885
1,GCA_000006785.lin.noplasmid,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,GCF_002055535.1,95.0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,99.78,0.96,GCF_002055535.1,95.0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,...,0.96,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCF_000188315.1, s__Streptococcus dysgalactiae...",97.94,11,,,GCA_000006785
2,GCA_000007265.lin.noplasmid,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,GCF_000186445.1,95.0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,98.41,0.85,GCF_000186445.1,95.0,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,...,0.85,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,taxonomic classification defined by topology a...,topological placement and ANI have congruent s...,"GCF_000188035.1, s__Streptococcus pseudoporcin...",97.88,11,,,GCA_000007265


In [51]:
def _to_mapping_genome(user_genome):
    """Derive the join key for one `user_genome` name.

    The last two dot-separated fields are dropped
    (e.g. "GCA_000182965.lin.noplasmid" -> "GCA_000182965"), then a leading
    "GCA_" is rewritten to "GCF_". Only the prefix is rewritten, so a name
    that merely contains "GCA" somewhere else is left untouched.
    """
    stem = '.'.join(user_genome.split('.')[:-2])
    if stem.startswith("GCA_"):
        stem = "GCF_" + stem[len("GCA_"):]
    return stem


# Built as a plain list so the values are assigned positionally, row by row.
df_summary['mapping_genome'] = [_to_mapping_genome(name) for name in df_summary['user_genome']]

In [52]:
# Inspect the truth table's join keys next to the summary-side keys shown in
# the following cell. This used to read df_joined, which is only created by
# the merge further down, so the cell raised NameError on a fresh kernel;
# df_truth supplies the same key column for that left join.
df_truth['mapping_genome']

0                                                   NaN
1                                         GCF_000016305
2                                         GCF_001077675
3                                         GCF_000007645
4                                         GCF_000174395
5                                         GCF_000743055
6                                         GCF_005221305
7                                         GCF_000023285
8                                         GCF_000010005
9                                         GCF_000196555
10                                        GCF_000237805
11                                        GCF_000010185
12                                        GCF_000025565
13                                        GCF_000154325
14                                        GCF_000010425
15                                        GCF_000164675
16                                        GCF_000153625
17                                        GCF_00

In [53]:
# Show the join keys derived on the summary side.
df_summary.loc[:, 'mapping_genome']

0                                        GCF_000182965
1                                       scerevisiae_pb
0                                        GCF_000006885
1                                        GCF_000006785
2                                        GCF_000007265
3                                        GCF_000007465
4                                        GCF_000174395
5                                         efaecalis_pb
6                                        GCF_000172575
7                                        GCF_000010005
8                                 lfermentum_ontlumina
9                                    lmonocytogenes_pb
10                                       GCF_000196035
11                                       GCF_009873455
12                                       GCF_000007645
13    Staphylococcus_aureus_subsp_aureus_ATCC_BAA_1718
14                                          saureus_pb
15                                        bsubtilis_pb
16        

In [54]:
# Left join: every truth row is kept, and the summary columns are attached
# wherever the mapping_genome key matches.
df_joined = df_truth.merge(df_summary, how="left", on="mapping_genome")

In [55]:
# Choose one accession string per row, in priority order:
#   1. the truth table's database_accession, if it is a string;
#   2. otherwise the fastani_reference, if that is a string;
#   3. otherwise the empty string (missing values are not strings here).
# isinstance() is the idiomatic type test and also accepts str subclasses,
# which the previous `type(x) == type(" ")` comparison rejected.
db_accession = [
    accession if isinstance(accession, str)
    else reference if isinstance(reference, str)
    else ""
    for accession, reference in zip(df_joined['database_accession'], df_joined['fastani_reference'])
]

In [61]:
# Overwrite the accession column with the back-filled list. db_accession was
# built by iterating df_joined's own rows, so it has one entry per row in the
# same order and the positional assignment lines up.
df_joined['database_accession'] = db_accession

In [62]:
# Persist the joined table twice, without the index column: once on the data
# volume and once next to the notebook.
for out_path in ("/mnt/btrfs/data/type_1/strains.truth.csv", "./strains.truth.csv"):
    df_joined.to_csv(out_path, index=False)

In [67]:
# Show the join-key column of the merged table.
df_joined.loc[:, 'mapping_genome']

0                                                   NaN
1                                         GCF_000016305
2                                         GCF_001077675
3                                         GCF_000007645
4                                         GCF_000174395
5                                         GCF_000743055
6                                         GCF_005221305
7                                         GCF_000023285
8                                         GCF_000010005
9                                         GCF_000196555
10                                        GCF_000237805
11                                        GCF_000010185
12                                        GCF_000025565
13                                        GCF_000154325
14                                        GCF_000010425
15                                        GCF_000164675
16                                        GCF_000153625
17                                        GCF_00

['',
 'GCF_000742135',
 'GCF_002811175',
 'GCF_006742205',
 'GCF_001544255',
 'GCF_000006945',
 'GCF_005221305',
 'GCF_000023285',
 'GCF_000016825',
 'GCF_000196555',
 'GCF_000237805',
 'GCF_000010185',
 'GCF_000025565',
 'GCF_000154325',
 'GCF_000010425',
 'GCF_000164675',
 'GCF_001457555',
 'GCF_900637655',
 'GCF_000155475',
 'GCF_001042675',
 'GCA_001274925',
 '',
 'GCF_000392875',
 'GCF_002950215',
 'GCF_000159215',
 'GCF_000307025',
 'GCF_001457615',
 '',
 'GCF_000006945',
 'GCF_001027105',
 'GCA_001274925',
 '',
 'GCF_000392875',
 'GCF_002950215',
 'GCF_000159215',
 'GCF_000307025',
 'GCF_001457615',
 '',
 'GCF_000006945',
 'GCF_001027105',
 'GCF_002811175',
 'GCF_000154225',
 'GCF_001883995',
 'GCF_000012825',
 '',
 'GCF_002006445',
 'GCF_000008565',
 'GCF_000392875',
 'GCF_001542625',
 'GCF_900478295',
 'GCF_000014425',
 'GCF_900187225',
 'GCF_000016525',
 'GCF_900638555',
 'GCF_003030305',
 'GCF_001457615',
 'GCF_000012905',
 'GCF_001027105',
 'GCF_006742205',
 'GCF_000186445'

In [75]:
# Flag rows whose mapping genome equals the database accession once everything
# from the first '.' onward is removed (e.g. "GCF_001457635.1" -> "GCF_001457635").
# NOTE(review): an accession that still starts with "GCA_" cannot equal a
# "GCF_" mapping key - confirm that this is the intended outcome.
df_joined['assembly_in_db'] = df_joined['mapping_genome'] == np.array(
    [accession.split('.', 1)[0] for accession in df_joined['database_accession']]
)

In [76]:
# Per dataset, count the rows flagged as having their assembly in the database.
df_joined['assembly_in_db'].groupby(df_joined['dataset']).sum()

dataset
dual_index        4.0
gis_20            9.0
hmp_even          5.0
hmp_staggered     5.0
mbarc_26         23.0
zymo_even         0.0
zymo_log          0.0
Name: assembly_in_db, dtype: float64