# Dev work: attempting to work with InterPro/Pfam

In [9]:
# system dependencies
import subprocess


# library dependencies
import matplotlib.pyplot as plt
import numpy as np

import pandas as pd
import seaborn as sns

## biopython
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# local dependencies/utils


In [2]:
# TODO: read the InterPro API documentation and build on it; fall back to the HMMER website if InterPro is slow.
# I need to install GNU make. All of this is done in my test_env, which has HMMER and Biopython installed.

In [4]:
# Load the sampled protein-pair table, using the CSV's first column as the index.
df = pd.read_csv(
    "../data/Sample.csv",
    index_col=0,
)

In [5]:
# Display the loaded table as the cell's output.
df

Unnamed: 0,prot_pair_index,meso_seq,thermo_seq,meso_ogt,thermo_ogt,scaled_local_symmetric_percent_id,local_E_value,scaled_local_query_percent_id,local_gap_compressed_percent_id
1256842,126227630,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MLLSDRDLRKELESGRLELDPFDPAMLQPSSIDVRLDRFFRVFDNT...,27.5,45.0,0.777202,0.0,0.773196,0.802139
1456567,169784592,MRFEGTSGYVATDDLKVAVNAAIALERPLLVKGEPGTGKTVLAVEV...,MKFTGSDSYVATEDLMIAVNAAVTLERPLLVKGEPGTGKTELARQV...,30.0,54.0,0.782143,0.0,0.784946,0.784946
874464,31933768,MAYETINVDVQDHVCLIKLHRPEALNALNAALVSELCTALEEADAS...,MAYKTIIVEIEDHVALIKLNRPEALNALNSELLGELAQAVTEADAN...,19.5,54.0,0.775194,0.0,0.775194,0.775194
560201,32409414,MAIRKYKPTTPGRRGSSVADFAEITRSTPEKSLLRPLSKTGGRNNQ...,MGIRKYKPTTPGRRGASVADFVELTRREPEKSLLRPLPKKGGRNNR...,28.0,52.5,0.787770,0.0,0.790614,0.802198
33257,175862226,MLQRLQDRVAVVTGGGSGIGLATVRRFAAEGAKVVVADIDAAAGEA...,MSEDIICRRLTGRTAVVTGAGSGIGLASARRLASEGANVVCADVDE...,28.0,45.0,0.788350,0.0,0.780769,0.802372
...,...,...,...,...,...,...,...,...,...
458544,42061208,MRFVIARCQVDYVGRLTAHLPMANRLVMVKSDGSVLVHSDGGSYKP...,MRLVIARCQVDYVGRLTAHLPMAQRLLLIKADGSVSVHSDDRAYKP...,28.0,45.0,0.872146,0.0,0.872146,0.872146
469612,165123678,MKPIVGSIVALITPMHEDGSVDYPALRKLIDWHIAEGTDCIGVVGT...,MTSSRVTLTGSIVALVTPMHEDGSVDYPTLRKLIDWHIAQGTDCIS...,30.0,48.0,0.764805,0.0,0.758389,0.776632
226964,71308777,MSFFAPKTVVSAHCDLPCGVYDPAQARIEAESIKAVAEKYQANTDP...,MLSRLFAPTVEVSAHCDLPCGVYDPAQARIEAQSIKAIIEKYHASD...,30.0,52.5,0.777358,0.0,0.768657,0.804688
123183,137720058,MRLVIARCSVDYVGRLTAHLPMATRLLLVKADGSVSVHADDRAYKP...,MRLVIARCQVDYHGRLTAHLPMATRLVLIKADGSVSIHSDDRAYKP...,32.0,50.0,0.737557,0.0,0.744292,0.740909


In [25]:
# Collect the first N_MESO_SEQS mesophilic sequences from the sample table.
# A positional slice (instead of indexing rows 0..1000 inclusive) yields exactly
# N_MESO_SEQS entries and does not raise if the table has fewer rows.
N_MESO_SEQS = 1000
meso_seq_list = df['meso_seq'].iloc[:N_MESO_SEQS].tolist()

# Wrap each sequence in a SeqRecord with a 1-based id (meso_seq1, meso_seq2, ...)
records = [
    SeqRecord(Seq(seq), id=f"meso_seq{i}")
    for i, seq in enumerate(meso_seq_list, start=1)
]

# Write the list of SeqRecord objects to a FASTA file
SeqIO.write(records, "sequences.fasta", "fasta")

# Write the same records to input.fasta, the query file handed to hmmscan
with open("input.fasta", "w") as input_file:
    SeqIO.write(records, input_file, "fasta")

# Run HMMER search against the Pfam database using the hmmscan command.
# check=True raises subprocess.CalledProcessError on a non-zero exit status, so a
# failed search is not silently followed by parsing a missing or stale
# output.domtblout.
subprocess.run(
    [
        "hmmscan",
        "--cpu", "4",
        "--domtblout", "output.domtblout",
        # NOTE(review): machine-specific absolute path; consider making it configurable.
        "/Users/humoodalanzi/pfam/Pfam-A.hmm",
        "input.fasta",
    ],
    check=True,
)


# Parse the HMMER per-domain table (--domtblout) and keep the single best row.
# The file holds hits for every query sequence in input.fasta, so the query name
# is stored next to the Pfam family. Rows are ranked by full-sequence E-value;
# since several strong hits can share the same E-value (e.g. 0.0), ties are broken
# by the higher full-sequence bit score rather than by file order.
best_hit = None
best_evalue = float("inf")
best_bitscore = float("-inf")
with open("output.domtblout", "r") as results_file:
    for line in results_file:
        fields = line.split()
        # Skip comment/header lines and blank lines (a blank line has no fields to index)
        if not fields or line.startswith("#"):
            continue

        # Column meanings follow the HMMER domtblout layout (see the HMMER User's Guide).
        family_id = fields[0]        # target name (the Pfam profile)
        query_id = fields[3]         # query name (the sequence id from input.fasta)
        evalue = float(fields[6])    # full-sequence E-value
        bitscore = float(fields[7])  # full-sequence bit score
        devalue = float(fields[11])  # per-domain conditional E-value (c-Evalue)
        start = int(fields[19])      # envelope start coordinate
        end = int(fields[20])        # envelope end coordinate

        # Lower E-value wins; on equal E-values the higher bit score wins.
        if (evalue, -bitscore) < (best_evalue, -best_bitscore):
            best_evalue, best_bitscore = evalue, bitscore
            best_hit = {
                "family_id": family_id,
                "query_id": query_id,
                "evalue": evalue,
                "devalue": devalue,
                "bitscore": bitscore,
                "start": start,
                "end": end,
            }


# Report the best hit and its attributes, or say so when the table had no hits.
if best_hit is None:
    print("No Pfam family match found.")
else:
    print(f"Best Pfam family match: {best_hit['family_id']}")
    print(f"Total E-value: {best_hit['evalue']}")
    print(f"Domain E-value: {best_hit['devalue']}")  # label typo ("E-vlaue") corrected
    print(f"Bit score: {best_hit['bitscore']}")
    print(f"Start position: {best_hit['start']}")
    print(f"End position: {best_hit['end']}")

# hmmscan :: search sequence(s) against a profile database
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query sequence file:             input.fasta
# target HMM database:             /Users/humoodalanzi/pfam/Pfam-A.hmm
# per-dom hits tabular output:     output.domtblout
# number of worker threads:        4
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       meso_seq1  [L=192]
Description: <unknown description>
Scores for complete sequence (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Model    Description
    ------- ------ -----    ------- ------ -----   ---- --  -------- -----------
    2.8e-12   46.2   0.0      6e-07   28.7   0.0    2.1  2  DCD       2'-deoxycytidi