In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Notebook-wide configuration: seaborn plot style and a fixed NumPy seed.
sns.set_style("darkgrid")
np.random.seed(930525)
# Cap DataFrame rendering at 20 columns and 200 rows.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

# 'once': report only the first occurrence of each matching warning.
warnings.simplefilter('once')

%matplotlib inline
# Load the watermark extension and print the versions of imported packages.
%load_ext watermark
%watermark --iversions

pandas  1.1.4
numpy   1.18.4
seaborn 0.10.1



In [2]:
import pandas as pd
import requests

from io import StringIO

In [3]:
from collections import defaultdict

# Download the ground-truth spreadsheet as CSV.
# NOTE: despite its name, `url` holds the HTTP response object; the name is
# kept so any other cell that refers to it keeps working.
# A timeout is set so a stalled connection cannot hang the kernel forever.
url = requests.get(
    'https://docs.google.com/spreadsheets/d/1KO_wGiEagJ8PMO2BzSDI1IXHYO4RHZMMSWXlT48peiQ/export?format=csv',
    timeout=60,
)
# Fail loudly on an HTTP error instead of letting pandas parse an error page
# as if it were the truth table.
url.raise_for_status()
csv_raw = StringIO(url.text)
df_truth = pd.read_csv(csv_raw)

# Path to the taxonomy table that is read in a later cell.
inf_tax_file = "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/r95.gtdb.tax"

# Building the true species dataset

In [4]:
def _lineage_prefix(lineage, depth):
    """Return the first `depth` ';'-separated fields of `lineage`, re-joined with ';'."""
    return ";".join(lineage.split(";")[:depth])


# Two-column, tab-separated table without a header row.
df_tax = pd.read_csv(inf_tax_file, sep="\t", names=["assembly_accession", "tax"])

# Derive truncated lineage strings: 7 fields for species, 6 for genus, 5 for family.
for rank_name, depth in (("species", 7), ("genus", 6), ("family", 5)):
    df_tax[rank_name] = [_lineage_prefix(lineage, depth) for lineage in df_tax["tax"]]

In [5]:
# Map each assembly accession to the last ';'-separated field of its
# genus-level lineage string.
accession_to_genus = {
    accession: genus_lineage.split(";")[-1]
    for accession, genus_lineage in zip(df_tax["assembly_accession"], df_tax["genus"])
}

In [6]:
# dd:       dataset -> set of database accessions listed in the truth table.
# dd_genus: dataset -> set of "g__<Genus>" labels for truth rows that have no
#           database accession (taken from the row's name and its synonym).
dd = defaultdict(set)
dd_genus = defaultdict(set)

for group, df in df_truth.groupby('dataset'):
    # Build the mask from this group's rows. The original built it from the
    # whole `df_truth` and relied on pandas index alignment when indexing `df`.
    mask_nan = df['database_accession'].isna()

    for row in df.loc[mask_nan].itertuples():
        # get the genus of the nans
        for label in (row.name, row.homotypic_synonym):
            # A missing cell is read as float NaN, and a blank cell has no
            # first word; skip both instead of crashing on .replace()/[0].
            if not isinstance(label, str) or not label.strip():
                continue
            dd_genus[group].add("g__" + label.replace("_", " ").split()[0])

    dd[group] = set(df.loc[~mask_nan, "database_accession"].values)

In [18]:
from glob import glob
import os

# glob() returns paths in arbitrary, filesystem-dependent order. Sorting makes
# the row order of the concatenated frame (and of the CSV saved later)
# reproducible across runs and machines.
files = sorted(glob("/mnt/btrfs/data/type_1/species_mc/b6_split_by_sample/*.extra.tree.csv"))

dfs = []
for file in files:
    # Dataset name = file name minus its last four dot-separated fields.
    # NOTE(review): presumably files are named <dataset>.<x>.extra.tree.csv — confirm.
    name = '_'.join(os.path.basename(file).split('.')[:-4])
    if name == "test_sort":
        # Skip the "test_sort" sample.
        continue
    df = pd.read_csv(file, index_col=0)
    # Tag every row with the dataset it came from.
    df['dataset'] = name
    dfs.append(df)

# Stack all per-sample tables; the original row indices are kept as-is.
df_type_1_features = pd.concat(dfs)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [19]:
# Sanity check: (rows, columns) of the concatenated feature table.
df_type_1_features.shape

(140027, 133)

In [8]:
# Flag each row as True when its assembly accession is among the truth
# accessions recorded for that row's dataset, otherwise False.
rows = [
    accession in dd[dataset]
    for accession, dataset in zip(
        df_type_1_features["assembly_accession"],
        df_type_1_features["dataset"],
    )
]
df_type_1_features["truth"] = rows

In [9]:
# Uncomment to list every column of the feature table:
# list(df_type_1_features.columns)

In [10]:
# Mark rows whose genus is one of the genera recorded for truth entries that
# have no database accession in the same dataset, then drop those rows.
mask_missing = [
    accession_to_genus[accession] in dd_genus[dataset]
    for accession, dataset in zip(
        df_type_1_features["assembly_accession"],
        df_type_1_features["dataset"],
    )
]
keep_rows = ~np.array(mask_missing)
df_type_1_features = df_type_1_features.loc[keep_rows]

In [11]:
# Number of rows still flagged as truth after the genus filter.
df_type_1_features['truth'].sum()

122

In [12]:
# Disabled: `eubacteriums` is not defined in any cell visible in this notebook.
# mask_eubacterium = [_ in eubacteriums for _ in df_type_1_features['assembly_accession']]

In [13]:
# Per-class feature means (truth vs. non-truth rows).
# numeric_only=True makes explicit what older pandas did silently: string
# columns are left out. Without it, pandas >= 2.0 raises TypeError here.
df_type_1_features.groupby('truth').mean(numeric_only=True)

Unnamed: 0_level_0,tree_dist,tree_top_dist,hits,percent_coverage,mean_coverage,sd_coverage,percent_padded_coverage,mean_padded_coverage,sd_padded_coverage,percent_binned_coverage,...,tree_gf_mean_scaffold_length,tree_gf_n50_contigs,tree_gf_n50_scaffolds,tree_gf_protein_count,tree_gf_scaffold_count,tree_gf_ssu_count,tree_gf_total_gap_length,tree_gf_trna_aa_count,tree_gf_trna_count,tree_gf_trna_selenocysteine_count
truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,0.125182,1.930329,4989.363381,0.001572,0.177328,2.604657,0.002261,0.266959,3.824381,0.002928,...,805997.0,1094047.0,1250359.0,3818.663117,76.265308,2.158435,9812.52455,19.337907,51.751752,0.161089
True,0.045831,1.377049,151816.45082,0.086042,5.959077,31.298903,0.133718,9.599308,46.275548,0.175852,...,2212078.0,2763453.0,2922896.0,3310.090164,13.737705,4.311475,5892.532787,19.581967,59.286885,0.139344


In [14]:
# Per-dataset feature means; the mean of the boolean `truth` column is the
# fraction of truth rows in each dataset.
# numeric_only=True makes explicit what older pandas did silently: string
# columns are left out. Without it, pandas >= 2.0 raises TypeError here.
df_type_1_features.groupby('dataset').mean(numeric_only=True)

Unnamed: 0_level_0,tree_dist,tree_top_dist,hits,percent_coverage,mean_coverage,sd_coverage,percent_padded_coverage,mean_padded_coverage,sd_padded_coverage,percent_binned_coverage,...,tree_gf_n50_contigs,tree_gf_n50_scaffolds,tree_gf_protein_count,tree_gf_scaffold_count,tree_gf_ssu_count,tree_gf_total_gap_length,tree_gf_trna_aa_count,tree_gf_trna_count,tree_gf_trna_selenocysteine_count,truth
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dual_index,0.120691,1.989008,108.931851,0.001763,0.005549,0.05278,0.002234,0.006904,0.071412,0.00287,...,1129564.0,1294797.0,3807.313737,69.550468,2.265561,10889.223855,19.431568,52.481377,0.155392,0.001193
gis_20,0.1189,1.896629,19213.013666,0.002178,0.587345,11.423644,0.002844,0.894045,15.995214,0.003692,...,1126497.0,1277080.0,3914.881261,77.134425,2.143943,6620.545411,19.360842,51.800808,0.162767,0.00101
hmp_even,0.132927,1.831895,1205.671267,0.002222,0.02715,0.670012,0.003703,0.07591,1.370516,0.004653,...,1036182.0,1183016.0,3798.242926,83.113132,2.005308,9276.677129,19.196538,50.275467,0.16372,0.000969
hmp_staggered,0.13235,1.850019,664.930262,0.001119,0.013075,0.456796,0.002148,0.042195,0.99804,0.002964,...,1057824.0,1207405.0,3821.154354,81.732126,2.043972,9402.177458,19.252472,50.632297,0.16738,0.000998
mbarc_26,0.116898,2.056123,6640.071598,0.002374,0.44556,1.222147,0.002614,0.416823,1.181332,0.003119,...,1135078.0,1306175.0,3898.53628,79.262254,2.324745,12627.015178,19.385977,54.081964,0.177699,0.001925
zymo_even,0.122589,1.984238,625.484874,0.001166,0.029422,0.303774,0.001548,0.035656,0.372526,0.002037,...,1106914.0,1267416.0,3743.073659,69.400121,2.226796,10587.274386,19.409942,52.195089,0.148348,0.000485
zymo_log,0.126145,1.982295,8035.484096,0.000805,0.25267,3.925084,0.00128,0.515691,6.302995,0.001824,...,1110680.0,1269649.0,3750.67897,69.803925,2.229324,10524.154483,19.405714,52.059537,0.152263,0.00048


In [15]:
# Show every column name of the feature table as a plain Python list.
df_type_1_features.columns.tolist()

['assembly_accession',
 'tree_closest_assembly_accession',
 'tree_dist',
 'tree_top_dist',
 'hits',
 'percent_coverage',
 'mean_coverage',
 'sd_coverage',
 'percent_padded_coverage',
 'mean_padded_coverage',
 'sd_padded_coverage',
 'percent_binned_coverage',
 'mean_binned_coverage',
 'sd_binned_coverage',
 'expected_percent_coverage',
 'shannon_entropy',
 'percent_max_uncovered_region',
 'largest_pileup',
 'largest_padded_pileup',
 'largest_binned_pileup',
 'gc_content',
 'total_genome_length',
 'ungapped_genome_length',
 'num_n_groups',
 'consecutive_ns',
 'gf_accession',
 'gf_ambiguous_bases',
 'gf_checkm_completeness',
 'gf_checkm_contamination',
 'gf_checkm_marker_count',
 'gf_checkm_marker_lineage',
 'gf_checkm_marker_set_count',
 'gf_checkm_strain_heterogeneity',
 'gf_coding_bases',
 'gf_coding_density',
 'gf_contig_count',
 'gf_gc_count',
 'gf_gc_percentage',
 'gf_genome_size',
 'gf_gtdb_genome_representative',
 'gf_gtdb_representative',
 'gf_gtdb_taxonomy',
 'gf_gtdb_type_desig

In [16]:
# Renumber rows 0..n-1 (discarding the per-file indices left over from the
# concat) and write the table, including that fresh index, to disk.
features_out = df_type_1_features.reset_index(drop=True)
features_out.to_csv("../data/strains.dataset.tree.csv")