In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

sns.set_style("darkgrid")
np.random.seed(930525)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

warnings.simplefilter('once')

%matplotlib inline
%load_ext watermark
%watermark --iversions

seaborn 0.10.1
pandas  1.1.4
numpy   1.18.4



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
import shap

import pandas as pd
import requests

from io import StringIO



In [3]:
from collections import defaultdict

url = requests.get('https://docs.google.com/spreadsheets/d/1KO_wGiEagJ8PMO2BzSDI1IXHYO4RHZMMSWXlT48peiQ/export?format=csv')
csv_raw = StringIO(url.text)
df_truth = pd.read_csv(csv_raw)

inf_tax_file = "/mnt/btrfs/data/gtdb_95/gtdb_genomes_reps_r95/r95.gtdb.tax"

# Building the true species dataset

In [4]:
df_tax = pd.read_csv(inf_tax_file, names=["assembly_accession", "tax"], sep="\t")

df_tax["species"] = [";".join(_.split(";")[:7]) for _ in df_tax.tax]
df_tax["genus"] = [";".join(_.split(";")[:6]) for _ in df_tax.tax]
df_tax["family"] = [";".join(_.split(";")[:5]) for _ in df_tax.tax]

In [5]:
dd = dict()
for group, df in df_truth.groupby('dataset'):
    dd[group] = set(df["database_accession"].values)
    if np.nan in dd[group]:
        dd[group].remove(np.nan)

In [6]:
from glob import glob
import os

files = glob("/mnt/btrfs/data/type_1/species_mc/b6_split_by_sample/*.extra.csv")

dfs = []
for file in files:
    name = '_'.join(os.path.basename(file).split('.')[:-3])
    if name == "test_sort":
        continue
    df = pd.read_csv(file, index_col = 0)
    df['dataset'] = name
    dfs.append(df)
df_type_1_features = pd.concat(dfs)

Columns (84,88,97,106) have mixed types.Specify dtype option on import or set low_memory=False.
Columns (47,49,55,56,57,58,70,83,95,96,100,103,104,108,109) have mixed types.Specify dtype option on import or set low_memory=False.
Columns (70,83,95,96,100,103,104,108,109) have mixed types.Specify dtype option on import or set low_memory=False.


In [7]:
rows = []
for t in df_type_1_features.itertuples():
    if t.assembly_accession in dd[t.dataset]:
        rows.append(True)
    else:
        rows.append(False)
df_type_1_features["truth"] = rows

In [8]:
df_type_1_features['truth'].sum()

122

In [9]:
# mask_eubacterium = [_ in eubacteriums for _ in df_type_1_features['assembly_accession']]

In [10]:
df_type_1_features.groupby('truth').mean()

Unnamed: 0_level_0,hits,percent_coverage,mean_coverage,sd_coverage,percent_padded_coverage,mean_padded_coverage,sd_padded_coverage,percent_binned_coverage,mean_binned_coverage,sd_binned_coverage,...,gf_mean_scaffold_length,gf_n50_contigs,gf_n50_scaffolds,gf_protein_count,gf_scaffold_count,gf_ssu_count,gf_total_gap_length,gf_trna_aa_count,gf_trna_count,gf_trna_selenocysteine_count
truth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,3239.319,0.000875,0.092289,2.53193,0.001534,0.154517,3.746302,0.002177,0.323931,6.280481,...,806109.0,1094139.0,1250438.0,3818.894454,76.263736,2.158391,9811.494846,19.337915,51.752234,0.161085
True,1880543.0,0.703774,85.043975,99.475321,0.751646,113.741787,118.544562,0.786987,188.054088,191.574473,...,2212078.0,2763453.0,2922896.0,3310.090164,13.737705,4.311475,5892.532787,19.581967,59.286885,0.139344


In [11]:
df_type_1_features.groupby('dataset').mean()

Unnamed: 0_level_0,hits,percent_coverage,mean_coverage,sd_coverage,percent_padded_coverage,mean_padded_coverage,sd_padded_coverage,percent_binned_coverage,mean_binned_coverage,sd_binned_coverage,...,gf_n50_contigs,gf_n50_scaffolds,gf_protein_count,gf_scaffold_count,gf_ssu_count,gf_total_gap_length,gf_trna_aa_count,gf_trna_count,gf_trna_selenocysteine_count,truth
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dual_index,99.186609,0.001536,0.004886,0.052115,0.001993,0.006192,0.070818,0.002589,0.009919,0.106076,...,1129564.0,1294797.0,3807.313737,69.550468,2.265561,10889.223855,19.431568,52.481377,0.155392,0.001193
gis_20,19253.05292,0.002234,0.58696,11.445821,0.002878,0.893255,16.0244,0.003691,1.925302,29.261207,...,1127082.0,1277582.0,3916.328516,77.123373,2.143669,6616.306147,19.360873,51.803942,0.162744,0.00101
hmp_even,1165.422709,0.00207,0.026119,0.671371,0.003497,0.073089,1.383517,0.004409,0.116542,1.97858,...,1036182.0,1183016.0,3798.242926,83.113132,2.005308,9276.677129,19.196538,50.275467,0.16372,0.000969
hmp_staggered,652.053147,0.001033,0.012736,0.459527,0.00201,0.041127,1.0113,0.002785,0.065205,1.435355,...,1057824.0,1207405.0,3821.154354,81.732126,2.043972,9402.177458,19.252472,50.632297,0.16738,0.000998
mbarc_26,6937.558789,0.002394,0.408469,1.163047,0.002622,0.38267,1.124678,0.003124,0.693755,1.796629,...,1135078.0,1306175.0,3898.53628,79.262254,2.324745,12627.015178,19.385977,54.081964,0.177699,0.001925
zymo_even,559.568718,0.001024,0.026347,0.30828,0.001395,0.032079,0.377924,0.00187,0.055957,0.594392,...,1106914.0,1267416.0,3743.073659,69.400121,2.226796,10587.274386,19.409942,52.195089,0.148348,0.000485
zymo_log,7514.99898,0.000722,0.234523,3.888986,0.001169,0.478779,6.21473,0.00169,0.751497,8.552841,...,1110680.0,1269649.0,3750.67897,69.803925,2.229324,10524.154483,19.405714,52.059537,0.152263,0.00048


In [12]:
list(df_type_1_features.columns)

['assembly_accession',
 'hits',
 'percent_coverage',
 'mean_coverage',
 'sd_coverage',
 'percent_padded_coverage',
 'mean_padded_coverage',
 'sd_padded_coverage',
 'percent_binned_coverage',
 'mean_binned_coverage',
 'sd_binned_coverage',
 'expected_percent_coverage',
 'shannon_entropy',
 'percent_max_uncovered_region',
 'largest_pileup',
 'largest_padded_pileup',
 'largest_binned_pileup',
 'gc_content',
 'total_genome_length',
 'ungapped_genome_length',
 'num_n_groups',
 'consecutive_ns',
 'gf_accession',
 'gf_ambiguous_bases',
 'gf_checkm_completeness',
 'gf_checkm_contamination',
 'gf_checkm_marker_count',
 'gf_checkm_marker_lineage',
 'gf_checkm_marker_set_count',
 'gf_checkm_strain_heterogeneity',
 'gf_coding_bases',
 'gf_coding_density',
 'gf_contig_count',
 'gf_gc_count',
 'gf_gc_percentage',
 'gf_genome_size',
 'gf_gtdb_genome_representative',
 'gf_gtdb_representative',
 'gf_gtdb_taxonomy',
 'gf_gtdb_type_designation',
 'gf_gtdb_type_designation_sources',
 'gf_gtdb_type_species

In [15]:
df_type_1_features.reset_index(drop=True).to_csv("../notebooks/strains.dataset.csv")