# OMA scripting
# ===============================

In [1]:
# system dependencies
import sys
import logging
import os
import time

# library dependencies
import click
import duckdb as ddb
import pandas as pd
import pyhmmer
import joblib
from joblib import Parallel, delayed
from sklearn.utils import resample
from tqdm import tqdm


# local dependencies
import pairpro.utils as pp_utils
# blast
import pairpro.user_blast as pp_up
# hmmer
import pairpro.hmmer as pp_hmmer
# structure
import pairpro.structures as pp_structures
# ML
from pairpro.train_val_wrapper import train_val_wrapper

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [4]:
####################
### PATHS & VARS ###
####################
# All paths below are relative, so they resolve against the notebook's working directory.

# db Paths
# DuckDB database file opened by the database-construction cell.
TEST_DB_PATH = '../tmp/oma.db'

# BLAST Paths
# Directory the BLAST cells write `blast_output.csv` into.
BLAST_OUTPUT_DIR = '../data/protein_pairs/blast_output/'

# HMMER Paths
# NOTE(review): none of the HMMER constants are used in the cells visible here;
# presumably consumed by the HMMER section further down — confirm.
HMM_PATH = '../data/pfam/Pfam-A.hmm'  # ./Pfam-A.hmm
PRESS_PATH = '../data/pfam/pfam'
HMMER_OUTPUT_DIR = '../data/protein_pairs/'
PARSE_HMMER_OUTPUT_DIR = '../data/protein_pairs/parsed_hmmer_output/'
# Delay intended to ensure that a worker which is about to be shut down after
# completing its previous task does not actually start running a new one.
# NOTE(review): unit is presumably seconds — TODO confirm against the consumer.
WORKER_WAKE_UP_TIME = 25

# Structure Paths
STRUCTURE_DIR = '../data/structures/'
STRUCTURE_OUTPUT_DIR = '../data/protein_pairs/structures/'

# ML Paths
MODEL_PATH = '../data/models/'

In [3]:
##################
# Aux. functions #
##################

# Logan edit of combined dataframe function (need to change function call name in main script):

def balance_data(dataframe, target_columns, random_state=None):
    """
    Resamples the dataframe to evenly distribute labels.

    For each target column in turn, every class that is larger than the
    smallest class is randomly undersampled (without replacement) down to the
    size of the smallest class. Classes already at that size are kept as-is.
    Columns are processed sequentially, so balancing a later column operates
    on the output of the earlier ones. Rows whose target value is missing are
    dropped, because ``value_counts`` ignores them.

    Args:
        dataframe (pandas dataframe): training dataframe
        target_columns (str or list): a single column name, or a list/tuple
            of column names, to balance on
        random_state (int, optional): seed forwarded to ``DataFrame.sample``
            for reproducible undersampling. Defaults to None (unseeded).

    Returns:
        pandas dataframe: New DF with evenly sampled labels
    """
    # A bare string must be wrapped, not passed to list(): list('label')
    # would split the column name into single characters.
    if isinstance(target_columns, str):
        target_columns = [target_columns]
    else:
        target_columns = list(target_columns)

    for target in target_columns:
        # value_counts() is sorted by descending count, so the concatenation
        # below keeps the "largest class first" row order.
        class_counts = dataframe[target].value_counts()
        if class_counts.empty:
            # No labelled rows: nothing to balance (and pd.concat([]) would raise).
            continue

        n_samples = int(class_counts.min())

        balanced_parts = []
        for label, count in class_counts.items():
            class_rows = dataframe[dataframe[target] == label]
            if count > n_samples:
                # Undersample only the classes that exceed the smallest one;
                # this also handles ties, where no class needs sampling.
                class_rows = class_rows.sample(
                    n=n_samples, replace=False, random_state=random_state
                )
            balanced_parts.append(class_rows)

        # Combine the undersampled classes with the smallest class(es)
        dataframe = pd.concat(balanced_parts)
        print(f'DF length reduced to {dataframe.shape}')
        print(f'{target} value counts: {dataframe[target].value_counts()}')

    return dataframe

## Actual script w/o click stuff

In [5]:
##### database construction #####

# Open (or create) the DuckDB file. Has to be read_only=False because the
# CREATE OR REPLACE TABLE statements below write to it.
# NOTE(review): both queries read from a `combined_pairs` table that this cell
# does not create — it must already exist in the database file.
con = ddb.connect(TEST_DB_PATH, read_only=False)

# create main table: one row per pair, with the protein1/protein2 columns
# renamed to query/subject
con.execute("""CREATE OR REPLACE TABLE OMA_main AS 
            (
            SELECT query_id, subject_id, pair_id, query, subject 
            FROM
            (
                SELECT protein1_uniprot_id AS query_id, protein2_uniprot_id AS subject_id, pair_id, protein1_sequence AS query, protein2_sequence AS subject
                FROM combined_pairs
            ) 
            );""")

con.commit() # commit the changes. Otherwise, the table will not be created.

# create a table for proteins in pairs: the distinct (id, sequence) rows drawn
# from both sides of every pair. Each side must select its OWN sequence column;
# previously the protein1 branch selected protein2_sequence, which stored
# protein 1 IDs alongside protein 2's sequence.
con.execute("""CREATE OR REPLACE TABLE processed_proteins AS 
    (
        SELECT DISTINCT pid, protein_seq
        FROM 
        (
            SELECT protein1_uniprot_id AS pid, protein1_sequence as protein_seq
            FROM combined_pairs
            UNION ALL
            SELECT protein2_uniprot_id AS pid, protein2_sequence as protein_seq
            FROM combined_pairs
        )   
    );""")

con.commit() # commit the changes. Otherwise, the table will not be created.

<duckdb.DuckDBPyConnection at 0x7f136c914c30>

**Quick comment**:
This works as expected (let's assume). We want to BLAST via Click. We will think about synergy b/w modules later, i.e., ml_feature_list as a way to keep track of choices.

### BLAST

In [11]:
# Smoke test: run BLAST on the first 20 pairs only.
print('Starting to run BLAST')
dataframe_for_blast = con.execute("SELECT * FROM OMA_main LIMIT 20").df()
print(f"DataFrame shape before BLAST processing: {dataframe_for_blast.shape}")

# Make sure the output directory exists *before* BLAST runs: DataFrame.to_csv
# does not create missing directories, so without this the cell would only
# fail after all the alignment work has been done.
os.makedirs(BLAST_OUTPUT_DIR, exist_ok=True)

# run blast
s_time = time.time()
print('Starting to run BLAST')
blast_df = pp_up.blast_pairs(dataframe_for_blast, cpus=4)
print(f'BLAST completed in {time.time()-s_time} seconds')

# save blast results to csv
blast_df.to_csv(os.path.join(BLAST_OUTPUT_DIR, 'blast_output.csv'), index=False)

Starting to run BLAST
DataFrame shape before BLAST processing: (20, 5)
Starting to run BLAST
BLAST completed in 0.8106026649475098 seconds


In [10]:
# Full run: BLAST every pair in OMA_main (long-running).
# NOTE(review): writes to the same blast_output.csv as the 20-row smoke-test
# cell, so whichever cell ran last determines the file's contents.
print('Starting to run BLAST')
dataframe_for_blast = con.execute("SELECT * FROM OMA_main").df()
print(f"DataFrame shape before BLAST processing: {dataframe_for_blast.shape}")

# Make sure the output directory exists *before* BLAST runs: DataFrame.to_csv
# does not create missing directories, so without this a long run would only
# fail at the very end, when the results are written.
os.makedirs(BLAST_OUTPUT_DIR, exist_ok=True)

# run blast
s_time = time.time()
print('Starting to run BLAST')
blast_df = pp_up.blast_pairs(dataframe_for_blast, cpus=4)
print(f'BLAST completed in {time.time()-s_time} seconds')

# save blast results to csv
blast_df.to_csv(os.path.join(BLAST_OUTPUT_DIR, 'blast_output.csv'), index=False)

Starting to run BLAST
DataFrame shape before BLAST processing: (402329, 5)
Starting to run BLAST
Found and skipped 178 invalid row(s) containing invalid amino acid sequences.


KeyboardInterrupt: 

**Notes**:

--

* Here we stop and check if we can manipulate duckdb correctly
* BLAST takes a bit now...

### HMMER

### Structure/FATCAT 2.0