# Dev-work: labeling

In [2]:
# system dependencies
import os
from pathlib import Path
import time
import pickle

from joblib import Parallel, delayed

import csv
from collections import defaultdict
from typing import Dict, List, Tuple


# library dependencies
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import seaborn as sns
from collections import defaultdict

## biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SearchIO

## pyhmmer
import pyhmmer

# local dependencies/utils

## Paths
PFAM_PATH = Path("/Users/humoodalanzi/pfam/Pfam-A.hmm")
ID_DB_PATH = Path("/Users/humoodalanzi/pfam/proteins_id.zip")
#probably need path of unit tests

Some exposition to clarify goals:

Purpose:
 
Determine whether two proteins share enough similarity in their identified domains to count as a validated pair.
E.g. if a protein is labeled with 4 domains, and three are shared with the other protein which has 5 domains, then the union holds 4 + 5 - 3 = 6 distinct domains and the two proteins have a Jaccard similarity of 3/6 = 0.5. Some threshold is set for this to be validated as a protein pair.

* Calculate the Jaccard similarity between the domains of each pair of proteins. In accomplishing this step, you may need to save the HMMER domains to file, and then load and compute the Jaccard similarity in a separate step.
Otherwise, do both at runtime
* Save the results, including protein IDs, in parquet files at `./data/taxa_pairs/hmmer_val` of the form `"taxa_pair_XX-YY.parquet"`, which are 1:1 mirrors of the blast files. Each parquet file should contain the domain IDs identified for each protein
and the calculated Jaccard score.

In [None]:
def find_jaccard_similarity(set1, set2):
    """
    Calculates the Jaccard similarity score between two sets.

    The score is the size of the intersection divided by the size of the
    union, so it ranges from 0.0 (nothing shared) to 1.0 (identical sets).

    Parameters
    ----------
    set1 : set
        First collection of items (e.g. domain accessions of one protein).
    set2 : set
        Second collection of items.

    Returns
    -------
    float
        len(set1 & set2) / len(set1 | set2), or 0.0 when both sets are empty.
    """
    union = set1.union(set2)
    # Two empty sets have an empty union; report "no similarity" rather than
    # raising ZeroDivisionError.
    if not union:
        return 0.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

In [None]:
def process_parsed_protein_pairs():
    """
    Placeholder for processing parsed protein pairs.

    Not implemented yet: takes no arguments and returns None.
    """
    # TODO: implement this function
    return None

## function stubs

These should be ordered:

In [None]:
def read_csv(file_path: str) -> Dict[str, List[str]]:
    """
    Reads a CSV file and returns a dictionary with query IDs as keys and a list
    of accession IDs as values.

    The first row is treated as a header and skipped. In every other row the
    first column is the query ID and the second column holds the accession IDs
    separated by ';' (the same layout parse_function_csv in this notebook reads).
    Blank rows are ignored, and an empty file yields an empty dictionary.
    """
    protein_dict: Dict[str, List[str]] = {}

    # newline='' lets the csv module handle line endings itself
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # skip header; the default avoids StopIteration on an empty file
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            protein_dict[row[0]] = row[1].split(';')
    return protein_dict

In [None]:
# stolen from Evan
def find_jaccard_similarity(set1, set2):
    """
    Calculates the Jaccard similarity score between two sets.

    The score is len(intersection) / len(union). When both sets are empty the
    union is empty as well, so 0.0 is returned instead of dividing by zero.
    """
    union = set1.union(set2)
    if not union:
        # nothing to compare: treat as no similarity
        return 0.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

In [None]:
def calculate_similarity(file1: str, file2: str, threshold: float) -> Dict[Tuple[str, str], bool]:
    """
    Calculates the Jaccard similarity score between each protein in file1 and file2,
    and reports for every pair whether the score threshold was met.

    Parameters
    ----------
    file1 : str
        Path of the first CSV file, read with read_csv.
    file2 : str
        Path of the second CSV file, read with read_csv.
    threshold : float
        Minimum Jaccard score (inclusive) for a pair to be flagged True.

    Returns
    -------
    Dict[Tuple[str, str], bool]
        Maps each (query ID from file1, query ID from file2) pair to True when
        its Jaccard score is >= threshold, otherwise False.
    """
    # Build each accession set once instead of once per compared pair
    domains1 = {query: set(accs) for query, accs in read_csv(file1).items()}
    domains2 = {query: set(accs) for query, accs in read_csv(file2).items()}

    # Score every cross-file pair and store the threshold decision directly
    functional = {}
    for query1, accs1 in domains1.items():
        for query2, accs2 in domains2.items():
            score = find_jaccard_similarity(accs1, accs2)
            functional[(query1, query2)] = score >= threshold

    return functional

In [None]:
def calculate_similarity(file1: str, file2: str, threshold: float, n_jobs: int = -1) -> Dict[str, bool]:
    """
    Compares every protein in file1 against every protein in file2 by Jaccard
    similarity, using joblib to run the comparisons in parallel.

    Both files are read with read_csv. The returned dictionary is keyed by the
    string '<query1>|<query2>' and holds True when that pair's score is at
    least `threshold`, otherwise False. `n_jobs` is passed straight to
    joblib's Parallel.
    """
    proteins_a = read_csv(file1)
    proteins_b = read_csv(file2)

    def calculate_score(query1: str, accs1: List[str], query2: str, accs2: List[str]) -> Tuple[str, bool]:
        # Worker for one pair: returns the pair label and its threshold decision
        similarity = find_jaccard_similarity(set(accs1), set(accs2))
        return f'{query1}|{query2}', similarity >= threshold

    # One delayed task per cross-file pair
    tasks = (
        delayed(calculate_score)(id_a, accs_a, id_b, accs_b)
        for id_a, accs_a in proteins_a.items()
        for id_b, accs_b in proteins_b.items()
    )
    pair_flags = Parallel(n_jobs=n_jobs)(tasks)

    # Each result is a (label, flag) tuple, so it converts directly to a dict
    return dict(pair_flags)


In [None]:
def write_function_output(output_file: str, functional: Dict[str, bool]):
    """
    Writes the output to a CSV file with two columns: query ID and a functional boolean.

    Parameters
    ----------
    output_file : str
        Path of the CSV file to create (an existing file is overwritten).
    functional : Dict[str, bool]
        Mapping of query ID to its functional boolean; one row is written per
        entry, after a 'query_id,functional' header row.
    """
    # newline='' lets the csv module control line endings
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['query_id', 'functional'])
        writer.writerows(functional.items())

In [None]:
if __name__ == '__main__':
    # Inputs: the two CSV files to compare and the minimum Jaccard score
    file1, file2 = 'file1.csv', 'file2.csv'
    threshold = 0.33

    # Flag each protein pair by whether it reaches the threshold
    functional = calculate_similarity(file1, file2, threshold)

    # Persist the flags as CSV
    output_file = 'output.csv'
    write_function_output(output_file, functional)

## Development of stubs

In [8]:
# Absolute paths of the two result CSVs under ../scripts/results
meso_output, thermo_output = (
    os.path.abspath(os.path.join('..', 'scripts', 'results', result_name))
    for result_name in ('meso_result_0.csv', 'thermo_result_0.csv')
)
# they work!

In [9]:
def parse_function_csv(file_path: str) -> Dict[str, List[str]]:
    """
    Reads a CSV file and returns a dictionary with query IDs as keys and a list
    of accession IDs as values.

    Parameters
    ----------
    file_path : str
        Path of the CSV file. The first row is skipped as a header; in every
        other row column 0 is the query ID and column 1 holds the accession
        IDs separated by ';'.

    Returns
    -------
    Dict[str, List[str]]
        Maps each query ID to its list of accession IDs. If a query ID occurs
        more than once, the last row wins. An empty accession field yields
        [''] (the result of splitting an empty string). Blank rows are
        ignored and an empty file yields an empty dictionary.
    """
    # Create a dictionary to store csv results
    protein_dict: Dict[str, List[str]] = {}

    # newline='' lets the csv module handle line endings itself
    with open(file_path, 'r', newline='') as csvfile:
        # read csv
        reader = csv.reader(csvfile)
        # skip header; the default avoids StopIteration on an empty file
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines
                continue
            query_id = row[0]
            accessions = row[1].split(';')
            protein_dict[query_id] = accessions
    return protein_dict

In [10]:
# Parse the meso results CSV and display the resulting query ID -> accessions mapping
parse_function_csv(meso_output)

{'10206': ['PF13561.9', 'PF00106.28', 'PF08659.13'],
 '10926': ['PF00589.25'],
 '1105': ['PF00005.30', 'PF02463.22', 'PF13304.9', 'PF13555.9'],
 '11314': ['PF00005.30', 'PF13304.9', 'PF13604.9'],
 '11673': ['PF00005.30',
  'PF13304.9',
  'PF13732.9',
  'PF00004.32',
  'PF13479.9',
  'PF13175.9'],
 '11706': ['PF02653.19'],
 '12230': ['PF13377.9', 'PF00532.24', 'PF13407.9', 'PF00356.24'],
 '12456': ['PF00005.30', 'PF13304.9', 'PF13476.9', 'PF13555.9'],
 '12570': ['PF01339.20', 'PF00072.27'],
 '12753': ['PF01174.22', 'PF07685.17'],
 '1284': ['PF00574.26'],
 '12897': ['PF04542.17', 'PF08281.15', 'PF04545.19'],
 '1294': ['PF00005.30', 'PF13304.9', 'PF13671.9', 'PF13191.9', 'PF13401.9'],
 '12966': ['PF00155.24'],
 '13026': ['PF00072.27', 'PF00486.31'],
 '13050': ['PF00005.30', 'PF13304.9'],
 '13198': ['PF01144.26'],
 '13524': ['PF00106.28', 'PF13561.9', 'PF08659.13'],
 '13575': ['PF00005.30', 'PF13304.9', 'PF13555.9'],
 '13620': ['PF13561.9', 'PF00106.28', 'PF08659.13'],
 '13633': ['PF13561.

Cool!

In [11]:
def find_jaccard_similarity(set1, set2):
    """
    Calculates the Jaccard similarity score between two sets.

    Parameters
    ----------
    set1 : set
        First set of items.
    set2 : set
        Second set of items.

    Returns
    -------
    float
        Size of the intersection divided by size of the union. Returns 0.0
        when both sets are empty, since the union would otherwise cause a
        division by zero.
    """
    union = set1.union(set2)
    if not union:
        # both inputs empty: no shared items, so report zero similarity
        return 0.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

In [12]:
def calculate_similarity(file1: str, file2: str, threshold: float) -> Dict[Tuple[str, str], bool]:
    """
    Calculates the Jaccard similarity score between each protein in file1 and file2,
    and reports for every pair whether the score threshold was met.

    Parameters
    ----------
    file1 : str
        Path of the first CSV file, read with parse_function_csv.
    file2 : str
        Path of the second CSV file, read with parse_function_csv.
    threshold : float
        Minimum Jaccard score (inclusive) for a pair to be flagged True.

    Returns
    -------
    Dict[Tuple[str, str], bool]
        Maps each (query ID from file1, query ID from file2) pair to True when
        its Jaccard score is >= threshold, otherwise False.
    """
    # Build each accession set once instead of once per compared pair
    domains1 = {query: set(accs) for query, accs in parse_function_csv(file1).items()}
    domains2 = {query: set(accs) for query, accs in parse_function_csv(file2).items()}

    # Score every cross-file pair and store the threshold decision directly
    functional = {}
    for query1, accs1 in domains1.items():
        for query2, accs2 in domains2.items():
            score = find_jaccard_similarity(accs1, accs2)
            functional[(query1, query2)] = score >= threshold

    return functional

In [None]:
def write_function_output(output: Dict[str, bool], output_file: str):
    """
    Dumps a mapping of protein query IDs to functional booleans as a CSV file.

    A 'query_id,functional' header row is written first, followed by one row
    per dictionary entry in iteration order.

    NOTE(review): the parameter order here (output, output_file) is the reverse
    of the earlier stub and of the call in the __main__ cell - confirm which
    order callers should use.

    Parameters
    ----------
    output : Dict[str, bool]
        A dictionary of protein query IDs and functional boolean values
    output_file : str
        File path to write the output CSV file
    """
    header = ['query_id', 'functional']
    with open(output_file, 'w', newline='') as handle:
        rows_out = csv.writer(handle)
        rows_out.writerow(header)
        # each (key, value) item becomes one two-column row
        rows_out.writerows(output.items())