In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
import seaborn as sns

from scipy.stats import pearsonr

# Display configuration: seaborn theme and pandas truncation limits.
sns.set_style("darkgrid")
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 200)

# Seed NumPy's global random number generator.
np.random.seed(930525)

# Show each distinct warning only the first time it is raised.
warnings.simplefilter("once")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the `watermark` extension and report the versions of the imported packages.
%load_ext watermark
%watermark --iversions

pandas    : 1.1.5
seaborn   : 0.11.1
matplotlib: 3.3.4
numpy     : 1.20.1



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import plot_roc_curve
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import PredefinedSplit

  and should_run_async(code)


In [3]:
# Load the feature table; the first CSV column is used as the DataFrame index.
strain_features_path = "../data/strains.dataset.tree.csv"
df_type_1_features = pd.read_csv(strain_features_path, index_col=0)

  and should_run_async(code)
  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Derive the cross-validation group from the prefix of `dataset` (the text before
# the first underscore). The vectorised `.str` accessor keeps the frame's own
# index, so the new column lines up row-for-row. A freshly built `pd.Series` would
# carry a 0..n-1 index and be aligned by label on assignment, producing NaN
# wherever the frame's index (read from CSV column 0) differs.
df_type_1_features['dataset_cat'] = (
    df_type_1_features['dataset'].str.split("_").str[0].astype('category')
)

# One held-out fold per dataset prefix: each row's category code is its test-fold id.
CV = PredefinedSplit(df_type_1_features['dataset_cat'].cat.codes)

In [5]:
# Names of the columns used as model inputs, in the order they are selected
# from the feature table.
features = [
    # read hits and coverage statistics
    "hits",
    "percent_coverage",
    "mean_coverage",
    "sd_coverage",
    "percent_binned_coverage",
    "mean_binned_coverage",
    "sd_binned_coverage",
    "expected_percent_coverage",
    "shannon_entropy",
    "percent_max_uncovered_region",
    "largest_pileup",
    "largest_binned_pileup",
    # genome assembly properties
    "gc_content",
    "total_genome_length",
    "ungapped_genome_length",
    "num_n_groups",
    "consecutive_ns",
    # tree distances
    "tree_dist",
    "tree_top_dist",
    # CheckM quality columns
    "gf_checkm_completeness",
    "gf_checkm_contamination",
]

In [7]:
# Columns kept alongside the features for bookkeeping and fold assignment; they
# are excluded from the model input matrix below.
meta_columns = ["assembly_accession", "dataset", "truth", "dataset_cat"]

# Keep only fully observed rows: infinities are mapped to NaN first so a single
# dropna() discards both kinds of invalid value.
X_type_1 = (
    df_type_1_features[features + meta_columns]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Rebuild the splitter from the *filtered* rows. PredefinedSplit yields positional
# indices derived from the length of its fold vector, so a splitter built from the
# unfiltered frame would no longer match X / y once rows have been dropped.
CV = PredefinedSplit(X_type_1["dataset_cat"].cat.codes)

# Target labels and model inputs (every column that is not a metadata column;
# Index.difference returns the remaining names in sorted order).
y = X_type_1["truth"]
X = X_type_1.loc[:, X_type_1.columns.difference(meta_columns)]

  and should_run_async(code)


KeyError: 'data'

In [None]:
from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression

In [None]:
# Lower bound on the number of features kept during recursive elimination.
min_features_to_select = 5

# Random forest used as the estimator for feature elimination.
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_features=.2,
    min_samples_split=9,
    min_samples_leaf=17,
    bootstrap=False,
)

# L1-penalised logistic regression (liblinear solver).
logistic = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    dual=False,
    tol=0.001,
    fit_intercept=True,
)

In [None]:
# Recursive feature elimination with cross-validation around the random forest:
# one feature is removed per step and each candidate subset is scored with F1 on
# the folds produced by CV.
rfecv = RFECV(
    estimator=rfc,
    step=1,
    min_features_to_select=min_features_to_select,
    cv=CV,
    scoring="f1",
    n_jobs=40,
)

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that `sklearn.metrics` is available as an attribute, so the lookup
# below would otherwise depend on an earlier cell having imported it.
import sklearn.metrics

# Scorer names accepted by the `scoring=` argument, in alphabetical order.
sorted(sklearn.metrics.SCORERS)

In [None]:
# Fit the cross-validated recursive feature elimination on X and y.
rfecv.fit(X, y)

In [None]:
# Display X reduced to the features selected by the fitted RFECV.
rfecv.transform(X)

In [None]:
# Names of the columns that the fitted selector ranked 1.
selected_mask = rfecv.ranking_ == 1
X.columns[selected_mask]

In [None]:
print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores.
# The x values start at min_features_to_select and advance by one per score,
# matching the step=1 elimination schedule.
n_features_tried = range(min_features_to_select,
                         len(rfecv.grid_scores_) + min_features_to_select)

fig, ax = plt.subplots()
ax.plot(n_features_tried, rfecv.grid_scores_)
ax.set_xlabel("Number of features selected")
# The selector is scored with scoring="f1", so the axis shows F1, not a count of
# correct classifications.
ax.set_ylabel("Cross validation score (F1)")
plt.show()

In [None]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import plot_precision_recall_curve
import matplotlib.pyplot as plt

# Dataset prefix per row, built with the `.str` accessor so the result keeps the
# frame's own index. A list-backed pd.Series would be label-aligned against a
# 0..n-1 index on assignment and could come out as NaN.
df_type_1_features['dataset_cat'] = (
    df_type_1_features['dataset'].str.split("_").str[0].astype('category')
)
cv = PredefinedSplit(df_type_1_features['dataset_cat'].cat.codes)
classifier = rfc

# Category labels; used later to name each fold's curve.
categories = df_type_1_features['dataset_cat'].cat.categories

# Rebuild the full feature frame, then keep only the columns selected by RFECV.
# After transform, X is whatever rfecv.transform returns rather than the DataFrame.
X = X_type_1.loc[:, X_type_1.columns.difference(["assembly_accession", "dataset", "dataset_cat", "truth"])]
X = rfecv.transform(X)

# Accumulators filled by the plotting cell.
precisions = []
average_precisions = []
mean_recall = np.linspace(0, 1, 100)
classifiers = []

# Train one forest per predefined fold, in fold order.
for i, (train, test) in enumerate(CV.split(X, y)):
    clf = RandomForestClassifier(n_estimators=100, max_features=.2, min_samples_leaf=17, min_samples_split=9, bootstrap=False, criterion="gini")
    # `train` holds positional row numbers. y is a pandas Series, so it must be
    # indexed with .iloc; plain y[train] is a label lookup on an integer index and
    # picks the wrong rows, or raises KeyError, once rows have been dropped.
    clf.fit(X[train], y.iloc[train])
    classifiers.append(clf)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Inspection cell: display the current value of X.
X

In [None]:
# Inspection cell: shape of `test`, the index array left bound by the last
# iteration of the fold loop.
test.shape

In [None]:
# Start from empty accumulators so that re-running this cell does not append a
# second copy of every fold to lists created in an earlier cell.
precisions = []
average_precisions = []

fig, ax = plt.subplots()
for i, ((train, test), classifier) in enumerate(zip(CV.split(X, y), classifiers)):
    # `test` holds positional row numbers. y is a pandas Series, so use .iloc;
    # plain y[test] would be a label lookup on an integer index.
    y_test = y.iloc[test]
    viz = plot_precision_recall_curve(classifier, X[test], y_test,
                     name=f'{categories[i]}',
                     alpha=0.3, lw=1, ax=ax)
    # The curve arrays are reversed before interpolation so that recall is
    # passed to np.interp in the opposite order to how the display stores it.
    interp_precision = np.interp(mean_recall, viz.recall[::-1], viz.precision[::-1])
    interp_precision[0] = 1.0
    precisions.append(interp_precision)
    average_precisions.append(viz.average_precision)
    y_pred = classifier.predict(X[test])
    print(f1_score(y_test, y_pred))

# Average the interpolated curves across folds.
mean_precision = np.mean(precisions, axis=0)
mean_precision[-1] = 0.0
mean_average_precisions = np.mean(average_precisions)
std_average_precisions = np.std(average_precisions)

ax.plot(mean_recall, mean_precision, color='b',
        label=r'Mean PR (AP = %0.2f $\pm$ %0.2f)' % (mean_average_precisions, std_average_precisions),
        lw=2, alpha=.8)

# calculate the no skill line as the proportion of the positive class
no_skill = (y == True).mean()
# plot the no skill precision-recall curve on the same axes as the fold curves
ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label=f'No Skill ( AP = {no_skill:.5f})')

# Shade one standard deviation around the mean curve, clipped to [0, 1].
std_precisions = np.std(precisions, axis=0)
precisions_upper = np.minimum(mean_precision + std_precisions, 1)
precisions_lower = np.maximum(mean_precision - std_precisions, 0)
ax.fill_between(mean_recall, precisions_lower, precisions_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="Precision Recall Curves")
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

In [None]:
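# NOTE(review): disabled draft. `thresholds` is not defined in the cells visible
# above, so this cannot run as written -- confirm before re-enabling.
# # Import the required library
# from numpy import argmax

# # Calculate F-scores and find the index of the ideal score
# fscore = (2 * mean_precision * mean_recall) / (mean_precision + mean_recall)
# ix = argmax(fscore)
# best_thresh = thresholds[ix]
# print('Best Threshold: %f' % (best_thresh))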