In [1]:
import os
import yaml
from yaml.loader import SafeLoader
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from yellowbrick.target import FeatureCorrelation
from yellowbrick.target import ClassBalance
from yellowbrick.features import Rank1D, Rank2D
from yellowbrick.features import Manifold
from yellowbrick.features import JointPlotVisualizer
from yellowbrick.features import PCA
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ClassificationReport
from yellowbrick.classifier import PrecisionRecallCurve
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.model_selection import FeatureImportances
from yellowbrick.model_selection import RFECV


from tests.eda import plot_prop_x_prop
from src.datamanager.dataset_manager import FeaturesManager

# Global seed for NumPy's legacy RNG so stochastic steps repeat on a fresh
# kernel run. Kept as a named constant so the same value can be reused
# elsewhere (e.g. as a `random_state` argument) instead of repeating 171.
SEED = 171
np.random.seed(SEED)

In [2]:
# FASTA inputs, one file per class. The recorded config output for this cell
# lists the "pos" file under class 'promoter' and the "neg" file under
# class 'non-promoter'.
pos_fasta = '../data/raw-data/fasta/Bacillus_pos.fa'
neg_fasta = '../data/raw-data/fasta/Bacillus_neg.fa'

# Parse the experiment configuration with PyYAML's SafeLoader.
config_path = os.path.join('..', 'params', 'eda-nucs-config.yaml')
with open(config_path) as config_file:
    args_data = yaml.load(config_file, Loader=SafeLoader)

# Echo the whole config, then the first feature spec, as a sanity check.
print(args_data)
feature_specs = args_data['features']
print(feature_specs[0])

# Build the feature manager over both FASTA files and run its raw-dataset
# transform with the configured feature specs.
# NOTE(review): the saved output of this cell ends in a protobuf
# "Descriptors cannot not be created directly" TypeError, which points at an
# installed protobuf / generated _pb2 version mismatch in the environment
# rather than at this code. Confirm the environment before trusting the
# cells below, which all depend on `dm`.
dm = FeaturesManager(fasta_paths=(pos_fasta, neg_fasta))
dm.transform_raw_dataset(params=feature_specs)

{'experiment_name': 'EDA-NUC-K1', 'experiment_version': 'v0.0.1', 'experiment_type': 'GradientBoostingExp', 'cv': 5, 'n_samples': 1, 'debug': 1, 'dataset_organism': 'Bacillus', 'datasets': [{'class_name': 'promoter', 'path': 'data/raw-data/fasta/Bacillus_pos.fa'}, {'class_name': 'non-promoter', 'path': 'data/raw-data/fasta/Bacillus_neg.fa'}], 'features': [{'k': 1, 'encode': 'onehot', 'slice': [59, 20, 20]}], 'models_params': [{'model_name': 'GradientBoostingClassifier', 'module': 'sklearn.ensemble', 'params': {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 1, 'random_state': 0}}]}
{'k': 1, 'encode': 'onehot', 'slice': [59, 20, 20]}
['AGCATAGTGACTACAATAAAGGGGATACCGAAAATTTCCTG', 'GATTGACGATTATTGGAAACCTTGTTATGCTATGAAGGTAA', 'GGAAACCTTGTTATGCTATGAAGGTAAGGATTTTGTCGAAT', 'CATGACAATGTTTAAATGGAAAAGTCAGATATTTTTCGGAG', 'TGAATATATACTATAATAATTGTGACAACTTCAGCAAAGGG', 'TTATAATTTGGTATTCTTAAAGAAGGCATGTATTTTTGATA', 'ATTTATGAGGTTATAGTGTAGTTATCAAGAATGCTAAACGG', 'CGTCAACATTCGATAAAATATAGAGAGATAAAAA

TypeError: Descriptors cannot not be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
# Inspect entry 0 of the encoded per-class datasets held by the first dataset.
first_class_encoded = dm.datasets[0].encoded_classes_datasets[0]
print(first_class_encoded)

In [None]:
# Wrap the two encoded class datasets (index 0 and index 1) as DataFrames.
encoded_by_class = dm.datasets[0].encoded_classes_datasets
pos_data = pd.DataFrame(encoded_by_class[0])
neg_data = pd.DataFrame(encoded_by_class[1])
print(f'POS: {pos_data.shape} | NEG: {neg_data.shape}')

# Stack the pos rows on top of the neg rows into a single feature frame.
X = pd.DataFrame(np.vstack([pos_data, neg_data]))

# Label vector aligned with the rows of X: 0.0 for every row that came from
# pos_data, 1.0 for every row that came from neg_data.
# NOTE(review): this gives the "pos" class label 0 and the "neg" class
# label 1 -- confirm that is the intended encoding for downstream metrics.
y = np.concatenate([np.zeros(len(pos_data)), np.ones(len(neg_data))])
print(X)

In [None]:
# Prints the stacked feature frame X again; the previous cell already ends
# with the same print, so this cell only repeats that output.
print(X)