In [2]:
import os

# Limit the OpenBLAS / OpenMP / MKL thread-count variables to a single thread.
# This cell runs ahead of the numerical imports in the following cell.
os.environ.update(
    {
        "OPENBLAS_NUM_THREADS": "1",
        "OMP_NUM_THREADS": "1",
        "MKL_NUM_THREADS": "1",
    }
)

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Plot styling and a fixed seed for numpy's global random state.
sns.set_style("darkgrid")
np.random.seed(930525)
# Limits on how many columns / rows pandas renders when a frame is displayed.
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 200)

# Show each distinct warning only the first time it is raised.
warnings.simplefilter('once')

# IPython magics: inline figures, then print the versions of the imported packages.
%matplotlib inline
%load_ext watermark
%watermark --iversions

seaborn 0.10.1
pandas  1.1.4
numpy   1.19.5



In [11]:
from glob import glob
import os

# Directory containing one feature table (CSV) per dataset.
features_dir = "/mnt/btrfs/data/type_1/assembly_wgs/features_no_tree_cap/"

# glob() yields paths in arbitrary filesystem order; sorting keeps the row
# order of the concatenated frame identical from run to run.
files = sorted(glob(os.path.join(features_dir, "*.csv")))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No feature CSV files found in {features_dir!r}")

dfs = []
for file in files:
    # The dataset name is the file name up to its first '.'.
    name = os.path.basename(file).split('.')[0]
    df = pd.read_csv(file, index_col = 0)
    df['dataset'] = name
    dfs.append(df)

# Row labels restart in every file, so the concatenated index is not unique.
df_type_1_features = pd.concat(dfs)

  and should_run_async(code)


In [12]:
df_type_1_features

Unnamed: 0,assembly_accession,hits,percent_coverage,mean_coverage,sd_coverage,percent_padded_coverage,mean_padded_coverage,sd_padded_coverage,percent_binned_coverage,mean_binned_coverage,...,tree_gf_ssu_silva_blast_bitscore,tree_gf_ssu_silva_blast_evalue,tree_gf_ssu_silva_blast_perc_identity,tree_gf_ssu_silva_blast_subject_id,tree_gf_ssu_silva_taxonomy,tree_gf_total_gap_length,tree_gf_trna_aa_count,tree_gf_trna_count,tree_gf_trna_selenocysteine_count,dataset
0,GCF_003114855.1,412384,0.999609,30.935455,8.703978,0.999008,20.870225,6.508909,0.9999,41.2384,...,2739,0,99.799,AB476706.1.1492,Bacteria;Spirochaetes;Leptospirae;Leptospirale...,41,20,35,0,DRR124443
1,GCF_901456055.1,2355,0.000977,0.122706,4.207252,0.000942,0.081812,2.841415,0.0010,0.2355,...,2839,0,100,FLYB01000015.15.1556,Bacteria;Proteobacteria;Gammaproteobacteria;En...,0,18,70,1,DRR124443
2,GCF_001408515.1,617,0.001354,0.048223,1.384365,0.001328,0.032149,0.932019,0.0014,0.0617,...,2712,0,100,CYRX01000003.598.2082,Bacteria;Proteobacteria;Alphaproteobacteria;Rh...,0,19,41,0,DRR124443
3,GCF_000751595.1,892,0.001052,0.104349,4.025162,0.000974,0.069583,2.757037,0.0011,0.0892,...,2584,0,98.176,KF193147.1.1480,Bacteria;Bacteroidetes;Bacteroidia;Bacteroidal...,0,20,42,0,DRR124443
4,GCF_900659615.1,4,0.000055,0.000110,0.016569,0.000037,0.000073,0.013529,0.0002,0.0004,...,2782,0,100,CBRG010000486.287.1792,Bacteria;Actinobacteria;Actinobacteria;Strepto...,0,20,65,0,DRR124443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193,GCF_000160475.1,1,0.000060,0.000060,0.007747,0.000076,0.000076,0.008743,0.0001,0.0001,...,2724,0,99.271,ADVK01000034.12.1519,Bacteria;Fusobacteria;Fusobacteriia;Fusobacter...,0,19,45,0,SRR960193
194,GCF_000369505.1,1,0.000046,0.000046,0.006816,0.000057,0.000057,0.007573,0.0001,0.0001,...,2820,0,99.935,HE651915.1.1530,Bacteria;Proteobacteria;Gammaproteobacteria;Ps...,26261,20,61,0,SRR960193
195,GCA_900554645.1,1,0.000133,0.000133,0.011540,0.000179,0.000179,0.013369,0.0001,0.0001,...,none,none,none,none,none,0,19,43,0,SRR960193
196,GCA_900549105.1,1,0.000050,0.000050,0.007063,0.000064,0.000064,0.007997,0.0001,0.0001,...,2715,0,99.333,JQ084698.1.1503,Bacteria;Firmicutes;Clostridia;Clostridiales;L...,0,20,56,0,SRR960193


In [17]:
# Load the assembly summary table; a later cell joins it to the feature table on its "Run" column.
df_assembly_summary = pd.read_csv("../data/assembly_summary.sra.csv")

  and should_run_async(code)


In [21]:
# Inner join: keep only feature rows whose dataset name matches a "Run" in the
# assembly summary, and attach the summary columns to them.
df_merged = df_type_1_features.merge(
    df_assembly_summary,
    how="inner",
    left_on="dataset",
    right_on="Run",
)

  and should_run_async(code)


In [24]:
# Keep only the rows flagged by the boolean "in_gtdb" column. The explicit
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
df_merged = df_merged.loc[df_merged["in_gtdb"]].copy()

  and should_run_async(code)


In [34]:
# Label a row as a true hit when the two accession columns agree. The
# comparison is positional (raw arrays), so no index alignment is involved.
# NOTE(review): "# assembly_accession" presumably comes from the assembly
# summary table — confirm.
df_merged["truth"] = (
    df_merged["# assembly_accession"].to_numpy()
    == df_merged["assembly_accession"].to_numpy()
)

# Work on an independent copy from here on.
df_type_1_features = df_merged.copy(deep=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged["truth"] = (df_merged["# assembly_accession"].values == df_merged["assembly_accession"].values)


In [38]:
from glob import glob

import joblib

classifiers = []

# glob() returns paths in arbitrary order; sort them so that classifiers[i]
# refers to the same saved model on every run.
# NOTE: joblib.load unpickles arbitrary Python objects — only load model files
# from a trusted source.
for file in sorted(glob("../data/clf.sklearn.*")):
    clf = joblib.load(file)
    classifiers.append(clf)

if not classifiers:
    # Later cells index classifiers[0]; fail here with a clearer message.
    raise FileNotFoundError("No saved classifiers match '../data/clf.sklearn.*'")

In [42]:
# Columns fed to the classifier, in order.
# NOTE(review): "tree_dist" and "tree_top_dist" are listed twice. Selecting
# columns with repeated labels repeats the columns, and the second selection
# below runs on a frame that already contains the repeats. Check X.shape[1]
# against what the saved classifiers expect before de-duplicating this list
# or merging the two selection steps.
features = [
    # per-assembly coverage statistics
    "hits",
    "percent_coverage",
    "mean_coverage",
    "sd_coverage",
    "percent_binned_coverage",
    "mean_binned_coverage",
    "sd_binned_coverage",
    "expected_percent_coverage",
    "shannon_entropy",
    "percent_max_uncovered_region",
    "largest_pileup",
    "largest_binned_pileup",
    # genome properties
    "gc_content",
    "total_genome_length",
    "ungapped_genome_length",
    "num_n_groups",
    "consecutive_ns",
    "tree_dist",
    "tree_top_dist",
    "gf_checkm_completeness",
    "gf_checkm_contamination",
    "relative_abundance",
    # tree_* counterparts of the coverage statistics
    "tree_hits",
    "tree_percent_coverage",
    "tree_mean_coverage",
    "tree_sd_coverage",
    "tree_percent_binned_coverage",
    "tree_mean_binned_coverage",
    "tree_sd_binned_coverage",
    "tree_expected_percent_coverage",
    "tree_shannon_entropy",
    "tree_percent_max_uncovered_region",
    "tree_largest_pileup",
    "tree_largest_binned_pileup",
    "tree_dist",
    "tree_top_dist",
]

# Share of each row's hits within the total hits of its dataset.
df_type_1_features["relative_abundance"] = df_type_1_features["hits"].div(
    df_type_1_features.groupby("dataset")["hits"].transform("sum")
)

# Fresh 0..n-1 row labels.
df_type_1_features = df_type_1_features.reset_index(drop=True)

# Dataset name up to its first underscore, stored as a categorical.
df_type_1_features["dataset_cat"] = (
    df_type_1_features["dataset"].str.split("_").str[0].astype("category")
)

categories = df_type_1_features["dataset_cat"].cat.categories

# Two-step selection kept exactly as written (see the note on `features`).
X = df_type_1_features[features + ["assembly_accession", "dataset", "truth", "dataset_cat"]]

X = X.loc[:, features].copy()

  and should_run_async(code)


In [45]:
# Score every row of X with the first classifier in the list.
# NOTE(review): which model is "first" depends on the order in which the
# classifier files were loaded — confirm this is the intended one.
predictions = classifiers[0].predict(X)

  and should_run_async(code)


In [52]:
# Fraction of rows whose predicted label equals the truth label
# (element-wise comparison by position, then the mean of the matches).
accuracy = df_merged["truth"].eq(predictions).mean()

  and should_run_async(code)


In [53]:
accuracy

  and should_run_async(code)


0.9997499062148305

In [55]:
df_merged["truth"].sum()

53

In [54]:
predictions.sum()

55

In [60]:
predictions.sum()

  and should_run_async(code)


array([ True, False, False, ..., False, False, False])

In [59]:
# Boolean mask of the rows whose truth label is False.
~df_merged["truth"]

  and should_run_async(code)


0        False
1         True
2         True
3         True
4         True
         ...  
13581     True
13582     True
13583     True
13584     True
13585     True
Name: truth, Length: 7997, dtype: bool

In [67]:
df_merged.columns

  and should_run_async(code)


Index(['assembly_accession', 'hits', 'percent_coverage', 'mean_coverage',
       'sd_coverage', 'percent_padded_coverage', 'mean_padded_coverage',
       'sd_padded_coverage', 'percent_binned_coverage', 'mean_binned_coverage',
       ...
       'Tumor', 'CenterName', 'Submission', 'Consent', 'RunHash', 'ReadHash',
       'LibraryName', 'Subject_ID', 'AssemblyName', 'truth'],
      dtype='object', length=329)

In [74]:
df_merged.groupby("dataset")["truth"].sum()

  and should_run_async(code)


dataset
DRR124443      1
ERR1448082     1
SRR10209490    1
SRR1587382     1
SRR1592616     1
SRR3314034     1
SRR3710112     2
SRR3721749     1
SRR3948005     1
SRR4096349     1
SRR4096573     1
SRR4096622     1
SRR4136409     1
SRR4136423     1
SRR4136541     1
SRR4156105     1
SRR4156230     1
SRR4159523     1
SRR4163324     1
SRR4163857     1
SRR4181631     1
SRR4181708     1
SRR4217059     1
SRR4218335     1
SRR4233880     1
SRR4233886     1
SRR4233890     1
SRR4233940     1
SRR4235445     1
SRR4237007     1
SRR4237030     1
SRR4996333     1
SRR5889151     1
SRR6007417     1
SRR6479433     1
SRR6479487     1
SRR6479505     1
SRR6480665     1
SRR7081650     1
SRR7174863     1
SRR7178576     1
SRR769340      2
SRR8393270     1
SRR8436120     2
SRR8552796     2
SRR8640888     1
SRR8742574     1
SRR9332242     1
SRR960193      1
Name: truth, dtype: int64

In [75]:
df_merged.query("assembly_accession == 'GCF_000621165.1'")
# "GCF_000424785.1"

Unnamed: 0,assembly_accession,hits,percent_coverage,mean_coverage,sd_coverage,percent_padded_coverage,mean_padded_coverage,sd_padded_coverage,percent_binned_coverage,mean_binned_coverage,...,Tumor,CenterName,Submission,Consent,RunHash,ReadHash,LibraryName,Subject_ID,AssemblyName,truth
3542,GCF_000621165.1,11418,0.255164,0.297206,0.55029,0.336669,0.414907,0.651589,0.671,1.1418,...,no,JGI,SRA401434,public,AB01EF63FD99606B34945862A76C9A5B,9435D03FA095A3E0B774B5DB8827FA05,OPXA,,,False


In [72]:
# Rows where the prediction disagrees with the truth label (misclassifications).
df_merged.loc[predictions != df_merged["truth"].to_numpy()]

  and should_run_async(code)


Unnamed: 0,assembly_accession,hits,percent_coverage,mean_coverage,sd_coverage,percent_padded_coverage,mean_padded_coverage,sd_padded_coverage,percent_binned_coverage,mean_binned_coverage,...,Tumor,CenterName,Submission,Consent,RunHash,ReadHash,LibraryName,Subject_ID,AssemblyName,truth
3542,GCF_000621165.1,11418,0.255164,0.297206,0.55029,0.336669,0.414907,0.651589,0.671,1.1418,...,no,JGI,SRA401434,public,AB01EF63FD99606B34945862A76C9A5B,9435D03FA095A3E0B774B5DB8827FA05,OPXA,,,False
4203,GCF_000424785.1,25741,0.497184,0.707686,0.865444,0.476624,0.666729,0.840653,0.9113,2.5741,...,no,JGI,SRA436102,public,3190921889CB101D52590CF17D097E89,F7A890B7A67B405C9AA1117D1ED9D045,NBSU,,,False


In [76]:
# Dataset names of the rows where the prediction disagrees with the truth label.
df_merged.loc[predictions != df_merged["truth"].to_numpy(), "dataset"]

  and should_run_async(code)


3542    SRR3314034
4203    SRR3721749
Name: dataset, dtype: object

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import PredefinedSplit

from sklearn.metrics import average_precision_score, f1_score, precision_score, recall_score
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt



In [36]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Candidate feature columns, in order.
# NOTE(review): 'tree_dist' and 'tree_top_dist' are listed twice. Selecting
# columns with repeated labels repeats the columns, and the second selection
# below runs on a frame that already contains the repeats. Check X.shape[1]
# before de-duplicating this list or merging the two selection steps, since
# models trained here and any previously saved models depend on that layout.
features = ['hits',
 'percent_coverage',
 'mean_coverage',
 'sd_coverage',
 'percent_binned_coverage',
 'mean_binned_coverage',
 'sd_binned_coverage',
 'expected_percent_coverage',
 'shannon_entropy',
 'percent_max_uncovered_region',
 'largest_pileup',
 'largest_binned_pileup',
 'gc_content',
 'total_genome_length',
 'ungapped_genome_length',
 'num_n_groups',
 'consecutive_ns',
 'tree_dist',
 'tree_top_dist',
 'gf_checkm_completeness',
 'gf_checkm_contamination',
 'relative_abundance',
 'tree_hits',
 'tree_percent_coverage',
 'tree_mean_coverage',
 'tree_sd_coverage',
 'tree_percent_binned_coverage',
 'tree_mean_binned_coverage',
 'tree_sd_binned_coverage',
 'tree_expected_percent_coverage',
 'tree_shannon_entropy',
 'tree_percent_max_uncovered_region',
 'tree_largest_pileup',
 'tree_largest_binned_pileup',
 'tree_dist',
 'tree_top_dist'
]

# Share of each row's hits within the total hits of its dataset.
df_type_1_features['relative_abundance'] = df_type_1_features['hits'] / df_type_1_features.groupby('dataset')['hits'].transform('sum')

# Fresh 0..n-1 row labels, so the new Series assigned below aligns row-for-row.
df_type_1_features.reset_index(inplace=True, drop=True)
df_type_1_features['dataset_cat'] = pd.Series([_.split("_")[0] for _ in df_type_1_features['dataset']], dtype='category')

categories = df_type_1_features['dataset_cat'].cat.categories

X = df_type_1_features[features + ["assembly_accession", "dataset", "truth", "dataset_cat"]]

# One predefined test fold per dataset_cat code: every split holds out all
# rows of one category.
cv = PredefinedSplit(X['dataset_cat'].cat.codes)
X = X.loc[:, features].copy()
X = X.reset_index(drop=True)

# Build the label vector as a new Series rather than resetting the index of a
# slice of the frame in place.
y = df_type_1_features["truth"].reset_index(drop=True)

# Alternative estimator tried:
# rfc = RandomForestClassifier(n_estimators=100, max_features=.2, min_samples_leaf=17, min_samples_split=9, bootstrap=False, criterion="gini", class_weight="balanced")

logistic = LogisticRegression(penalty="l1", solver="liblinear", fit_intercept=True, dual=False, tol=0.001, class_weight="balanced")

min_features_to_select = 5

# Recursive feature elimination, one feature per step, scored by F1 on the
# predefined folds.
rfecv = RFECV(logistic, step=1, cv=cv, scoring="f1", min_features_to_select=min_features_to_select, n_jobs=40)
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores.
# The scorer is F1, so the axis is labelled as F1 rather than as a count of
# correct classifications.
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (F1)")
plt.plot(range(min_features_to_select,
               len(rfecv.grid_scores_) + min_features_to_select),
         rfecv.grid_scores_)
plt.show()

# Pipeline definition only; it is not fitted in this cell.
exported_pipeline = make_pipeline(
    RFE(LogisticRegression(penalty="l1", solver="liblinear", fit_intercept=True, dual=False, tol=0.001, class_weight="balanced")),
    MinMaxScaler(),
    MLPClassifier(alpha=0.01, learning_rate_init=0.01)
)

  and should_run_async(code)


KeyboardInterrupt: 

In [47]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold

precisions = []
average_precisions = []
mean_recall = np.linspace(0, 1, 100)
# NOTE: this rebinds `classifiers`; any list of the same name built in an
# earlier cell is replaced.
classifiers = []

# Train on the full feature matrix. The former `rfecv.transform(X)` assignment
# was dropped: its result was overwritten on the next line, and it required
# `rfecv` to have been fitted successfully beforehand.
X_transf = X.copy().values
y = y.reset_index(drop=True)

for i, (train, test) in enumerate(cv.split(X_transf, y)):
    # Alternative estimators tried:
#     clf = RandomForestClassifier(n_estimators=100, max_features=.2, min_samples_leaf=17, min_samples_split=9, bootstrap=False, criterion="gini", class_weight="balanced_subsample")
#     clf = ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy', max_features=0.8, min_samples_leaf=4, min_samples_split=20, n_estimators=100)
#     clf = LogisticRegression(penalty="l1", solver="liblinear", fit_intercept=True, dual=False, tol=0.001, class_weight="balanced")
    # Scale, select features with an inner stratified CV, then fit the MLP.
    # Every step is fitted on the training rows of this fold only.
    clf = make_pipeline(
            MinMaxScaler(),
            RFECV(LogisticRegression(penalty="l1", solver="liblinear", fit_intercept=True, dual=False, tol=0.001, class_weight="balanced"), step=1, cv=StratifiedKFold(random_state=930525, shuffle=True), scoring="f1", min_features_to_select=5, n_jobs=40),
            MLPClassifier(alpha=0.01, learning_rate_init=0.01)
        )

    # `train` holds positional row indices, so select labels by position.
    clf.fit(X_transf[train], y.iloc[train])
    classifiers.append(clf)

  and should_run_async(code)
