In [None]:
# install hmmer library
!sudo apt-get install hmmer
!sudo apt-get install hmmer-doc
!sudo apt-get install ncbi-blast+

In [15]:
import pandas as pd

## Clean csv file

In [26]:
def clean_csv_file(path: str) -> str or None:
    """
    Reads and cleans a CSV file, providing options to return the cleaned data as a string or save it into a file.

    Parameters:
        path (str): The path to the CSV file to be cleaned.

    Returns:
        str or None: If the user chooses to get the results as a variable (1),
        the cleaned data is returned as a string. If the user chooses to save the
        results into a file (2), the cleaned data is saved into a file.

    Usage:
        1. `path` should be the path to the CSV file that needs to be cleaned.
        2. The function interactively prompts the user to choose between getting
           the results as a variable or saving them into a file.
        3. If the user selects to save the results into a file, they are further
           prompted to choose between saving only keys or in Fasta format.
        4. If the user selects to save in Fasta format, the data is saved in a
           '.fasta' file with each entry represented as a Fasta sequence.
        5. If the user selects to save only keys, the data is saved in a '.txt'
           file with each key on a separate line.

    Notes:
        - If the user does not provide a valid input for any prompt, they are
          repeatedly prompted until a valid input is provided.
        - If the user does not provide a file name when prompted for the output
          file name, a default name "output_seq" is used.
        - The function utilizes the Pandas library to read and manipulate CSV data.
        - The function utilizes Python's built-in file handling capabilities to
          save the data into text files.
    """
    break_line = '\n------------------------------\n'
    print('Reading the file...')
    df = pd.read_csv(path)
    print('Cleaning the CSV file...')
    df = df.dropna(subset=['Entity ID'])
    df = df.drop(columns=['Unnamed: 3'])
    df['Entity ID'] = df['Entity ID'].str.split('_').str[0] + ':' + df['Auth Asym ID']
    df = df.drop(columns=['Auth Asym ID'])
    df = df.reset_index(drop=True)
    print('Done!', break_line)

    while True:
        which_output = input('Do you want to get the results as variable(1) or file(2)?\nOnly enter the corresponding number[1, 2]: ')
        if which_output in ['1', '2']:
            which_output = int(which_output)
            break
        else:
            print('Invalid input. Please enter either 1 or 2.' + break_line)

    # Save the output into a variable
    if which_output == 1:
        return '\n'.join(df['Entity ID'].values)

    # Save the output into a file
    elif which_output == 2:
        print(break_line)
        output_file_name = input('Enter your output file name without extension (Press Enter for default "output_seq"): ')
        output_file_name = output_file_name.strip()
        if output_file_name == "":
            output_file_name = "output_seq"

        while True:
            with_fasta = input('[1] Only keys \\ [2] As Fasta format: ')
            if with_fasta in ['1', '2']:
                with_fasta = int(with_fasta)
                break
            else:
                print('Invalid input. Please enter either 1 or 2.' + break_line)
        # Save as .fasta
        if with_fasta == 2:
            with open(output_file_name + '.fasta', 'w') as file:
                for idx, row in df.iterrows():
                    file.write(f"> {row['Entity ID']}\n{row['Sequence']}\n")
            print(break_line, 'Data saved to', output_file_name + '.fasta')
        # Save as .keys
        elif with_fasta == 1:
            with open(output_file_name + '.txt', 'w') as f:
                f.write('\n'.join(df['Entity ID'].values))
            print(break_line, 'Data saved to', output_file_name + '.keys')


In [27]:
!wget -O pdb_report.csv "https://github.com/heispv/bioinformatics/raw/master/lab-of-bioinformatics/rcsb_pdb_custom_report_20240411062134.csv"

--2024-05-07 14:44:49--  https://github.com/heispv/bioinformatics/raw/master/lab-of-bioinformatics/rcsb_pdb_custom_report_20240411062134.csv
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/heispv/bioinformatics/master/lab-of-bioinformatics/rcsb_pdb_custom_report_20240411062134.csv [following]
--2024-05-07 14:44:49--  https://raw.githubusercontent.com/heispv/bioinformatics/master/lab-of-bioinformatics/rcsb_pdb_custom_report_20240411062134.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1611 (1.6K) [text/plain]
Saving to: ‘pdb_report.csv’


2024-05-07 14:44:49 (15.8 MB/s) - ‘pdb_report.csv’ sa

In [28]:
path = "/content/pdb_report.csv"

In [30]:
# for default -> select (2 / default / 2)
clean_csv_file(path)

Reading the file...
Cleaning the CSV file...
Done! 
------------------------------

Do you want to get the results as variable(1) or file(2)?
Only enter the corresponding number[1, 2]: 2

------------------------------

Enter your output file name without extension (Press Enter for default "output_seq"): 
[1] Only keys \ [2] As Fasta format: 2

------------------------------
 Data saved to output_seq.fasta


In [31]:
!cat output_seq.fasta | head -n 30

> 1AAP:A
VREVCSEQAETGPCRAMISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCGSA
> 1BUN:B
RKRHPDCDKPPDTKICQTVVRAFYYKPSAKRCVQFRYGGCNGNGNHFKSDHLCRCECLEYR
> 1DTX:A
QPRRKLCILHRNPGRCYDKIPAFYYNQKKKQCERFDWSGCGGNSNRFKTIEECRRTCIG
> 1FAK:I
APDFCLEPPYDGPCRALHLRYFYNAKAGLCQTFYYGGCLAKRNNFESAEDCMRTC
> 1KTH:A
ETDICKLPKDEGTCRDFILKWYYDPNTKSCARFWYGGCGGNENKFGSQKECEKVCAPV
> 1ZR0:B
PTGNNAEICLLPLDYGPCRALLLRYYYDRYTQSCRQFLYGGCEGNANNFYTWEACDDACWRIE
> 3BYB:A
KDRPDFCELPADTGPCRVRFPSFYYNPDEKKCLEFIYGGCEGNANNFITKEECESTCAA
> 3M7Q:B
EAEASICSEPKKVGRCKGYFPRFYFDSETGKCTPFIYGGCGGNGNNFETLHQCRAICRALG
> 4BQD:A
DSEEDEEHTIITDTELPPLKLMHSFCAFKADDGPCKAIMKRFFFNIFTRQCEEFIYGGCEGNQNRFESLEECKKMCTRD
> 4DTG:K
QEKPDFCFLEEDPGICRGYITRYFYNNQTKQCERFKYGGCLGNMNNFETLEECKNICEDGHHHHHH
> 4ISO:B
QTEDYCLASNKVGRCRGSFPRWYYDPTEQICKSFVYGGCLGNKNNYLREEECILACRGVQ
> 4NTW:B
QIRPAFCYEDPPFFQKCGAFVDSYYFNRSRITCVHFFYGQCDVNQNHFTTMSECNRVCHG
> 4U30:X
TVAACANLPIVRGPCRAFIQLWAFDAVKGKCVLFPYGGCQGNGNKFYSEKECREYCGVP
> 4U32:X
IHDFCLVSKVVGRCRASMPRWWYNVTDGSCQLFVYGGCDGNSNNYLTKEECLKKC
> 5M4V:

## Get MSA

In [32]:
# Getting the multiple sequence alignment is done in the PDBe website
!wget -O fasta.txt "https://github.com/heispv/bioinformatics/raw/master/lab-of-bioinformatics/fasta.txt"

--2024-05-07 14:48:31--  https://github.com/heispv/bioinformatics/raw/master/lab-of-bioinformatics/fasta.txt
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/heispv/bioinformatics/master/lab-of-bioinformatics/fasta.txt [following]
--2024-05-07 14:48:31--  https://raw.githubusercontent.com/heispv/bioinformatics/master/lab-of-bioinformatics/fasta.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2888 (2.8K) [text/plain]
Saving to: ‘fasta.txt’


2024-05-07 14:48:31 (23.2 MB/s) - ‘fasta.txt’ saved [2888/2888]



### Build HMM based on the raw MSA

In [54]:
# Create an HMM model based on the fasta.txt file
!hmmbuild msa_notclean.hmm fasta.txt

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             fasta.txt
# output HMM file:                  msa_notclean.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     fasta                   20    79    59     2.32  0.946 

# CPU time: 0.03u 0.00s 00:00:00.03 Elapsed: 00:00:00.04


In [59]:
!cat msa_notclean.hmm | head -n 30

HMMER3/f [3.3.2 | Nov 2020]
NAME  fasta
LENG  59
ALPH  amino
RF    no
MM    no
CONS  yes
CS    no
MAP   yes
DATE  Tue May  7 15:03:18 2024
NSEQ  20
EFFN  2.319336
CKSUM 1296990217
STATS LOCAL MSV       -9.1071  0.71896
STATS LOCAL VITERBI   -9.4408  0.71896
STATS LOCAL FORWARD   -4.0823  0.71896
HMM          A        C        D        E        F        G        H        I        K        L        M        N        P        Q        R        S        T        V        W        Y   
            m->m     m->i     m->d     i->m     i->i     d->m     d->d
  COMPO   2.71462  2.61835  3.05921  2.65138  2.83864  2.72225  3.80116  3.43208  2.58786  3.03883  4.06383  2.72587  3.50073  3.03111  2.84719  2.76261  2.91405  3.19865  4.56720  2.92438
          2.68653  4.42260  2.77518  2.73006  3.46389  2.40534  3.72482  3.29276  2.67747  2.69356  4.24724  2.90381  2.73721  3.18181  2.89808  2.37909  2.77480  2.98553  4.58391  3.61538
          0.80054  1.11950  1.49401  1.18203  0.36623  0.00000   

Based on the file above, we can observe that the `hmmbuild` command, applied to the `fasta.txt` file, cuts the first 20 characters in the sequence. This action is taken because there are not enough amino acids to build the Hidden Markov Model (HMM) for that part of the sequence. Therefore, we will trim the first 20 characters of each sequence and then reapply the `hmmbuild` command.

## Clean raw MSA

In [61]:
def clean_fasta(path: str, first_clipping_num, output_file_name: str) -> None:
    """
    Clean FASTA file by removing specified number of characters from the beginning of each sequence.

    Args:
        path (str): Path to the input FASTA file.
        first_clipping_num (int): Number of characters to remove from the beginning of each sequence.
        output_file_name (str): Name of the output file.

    Returns:
        None

    This function reads a FASTA file, extracts the sequence IDs and sequences, removes the specified
    number of characters from the beginning of each sequence, and writes the cleaned sequences to a new file.
    """
    with open(path) as f:
        fastas = f.read().split('\n\n')

    clean_list = []
    for fasta in fastas:
        id = fasta.split()[0]
        sequence = ''.join(fasta.split('\n')[1:])
        clean_list.append((id, sequence))

    with open(output_file_name + '.txt', 'w') as f:
        for item in clean_list:
            f.write(f"{item[0]}\n{item[1][first_clipping_num-1:]}\n")

    print(f'Output saved in {output_file_name}.txt')

In [66]:
clean_fasta("/content/fasta.txt", 20, "clean_fasta")

Output saved in clean_fasta.txt


In [63]:
# Check the fasta file before the cleaning
!cat fasta.txt | head -n 20

>PDB:1aap:A X-RAY CRYSTAL STRUCTURE OF THE PROTEASE INHIBITOR 
--------------------vrevcseqaetgpcrAmISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEY
CMAVCg---

>PDB:1bun:B STRUCTURE OF BETA2-BUNGAROTOXIN: POTASSIUM CHANNEL
------------------rkrhpdcdkppdtkicqTvVRAFYYKPSAKRCVQFRYGGCNGNGNHFKSDHL
CRCECleyr

>PDB:1dtx:A CRYSTAL STRUCTURE OF ALPHA-DENDROTOXIN FROM THE GR
------------------eprrklcilhrnpgrcyDkIPAFYYNQKKKQCERFDWSGCGGNSNRFKTIEE
CRRTCig--

>PDB:1fak:I HUMAN TISSUE FACTOR COMPLEXED WITH COAGULATION FAC
--------------------apdfcleppydgpcrAlHLRYFYNAKAGLCQTFYYGGCLAKRNNFESAED
CMRTC----

>PDB:1kth:A THE ANISOTROPIC REFINEMENT OF KUNITZ TYPE DOMAIN C
-------------------etdicklpkdegtcrdF-ILKWYYDPNTKSCARFWYGGCGGNENKFGSQKE
CEKVCapv-



In [64]:
# Check the clean_fasta.txt file
!cat clean_fasta.txt | head -n 20

>PDB:1aap:A
-vrevcseqaetgpcrAmISRWYFDVTEGKCAPFFYGGCGGNRNNFDTEEYCMAVCg---
>PDB:1bun:B
krhpdcdkppdtkicqTvVRAFYYKPSAKRCVQFRYGGCNGNGNHFKSDHLCRCECleyr
>PDB:1dtx:A
prrklcilhrnpgrcyDkIPAFYYNQKKKQCERFDWSGCGGNSNRFKTIEECRRTCig--
>PDB:1fak:I
-apdfcleppydgpcrAlHLRYFYNAKAGLCQTFYYGGCLAKRNNFESAEDCMRTC----
>PDB:1kth:A
etdicklpkdegtcrdF-ILKWYYDPNTKSCARFWYGGCGGNENKFGSQKECEKVCapv-
>PDB:1zr0:B
nnaeicllpldygpcrAlLLRYYYDRYTQSCRQFLYGGCEGNANNFYTWEACDDACwrie
>PDB:3byb:A
drpdfcelpadtgpcrVrFPSFYYNPDEKKCLEFIYGGCEGNANNFITKEECESTCa---
>PDB:3m7q:B
aeasicsepkkvgrckGyFPRFYFDSETGKCTPFIYGGCGGNGNNFETLHQCRAICralg
>PDB:4bqd:A
lmhsfcafkaddgpckAiMKRFFFNIFTRQCEEFIYGGCEGNQNRFESLEECKKMCtrd-
>PDB:4dtg:K
ekpdfcfleedpgicrGyITRYFYNNQTKQCERFKYGGCLGNMNNFETLEECKNICedgh


### Build HMM based on clean MSA

In [68]:
# Create an HMM model based on the clean_fasta.txt file
!hmmbuild msa.hmm clean_fasta.txt

# hmmbuild :: profile HMM construction from multiple sequence alignments
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# input alignment file:             clean_fasta.txt
# output HMM file:                  msa.hmm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# idx name                  nseq  alen  mlen eff_nseq re/pos description
#---- -------------------- ----- ----- ----- -------- ------ -----------
1     clean_fasta             20    60    59     2.32  0.946 

# CPU time: 0.03u 0.00s 00:00:00.03 Elapsed: 00:00:00.04


In [69]:
!cat msa.hmm | head -n 30

HMMER3/f [3.3.2 | Nov 2020]
NAME  clean_fasta
LENG  59
ALPH  amino
RF    no
MM    no
CONS  yes
CS    no
MAP   yes
DATE  Tue May  7 15:13:33 2024
NSEQ  20
EFFN  2.319336
CKSUM 3245897423
STATS LOCAL MSV       -9.1071  0.71896
STATS LOCAL VITERBI   -9.4408  0.71896
STATS LOCAL FORWARD   -4.0823  0.71896
HMM          A        C        D        E        F        G        H        I        K        L        M        N        P        Q        R        S        T        V        W        Y   
            m->m     m->i     m->d     i->m     i->i     d->m     d->d
  COMPO   2.71513  2.60360  3.06507  2.65003  2.83041  2.72890  3.80257  3.43475  2.58634  3.04619  4.06086  2.72298  3.52132  3.02863  2.84631  2.77096  2.91671  3.20289  4.56690  2.91555
          2.68618  4.42225  2.77519  2.73123  3.46354  2.40513  3.72494  3.29354  2.67741  2.69355  4.24690  2.90347  2.73739  3.18146  2.89801  2.37887  2.77519  2.98518  4.58477  3.61503
          0.26564  4.73156  1.49401  0.61958  0.77255  0.00

* In this new file, we can observe that the probabilities start from the first amino acid (AA), indicating that no cutting is performed by the `hmmbuild` command itself.

## Get the negative and postive data from NCBI

In [71]:
!wget -O negative.fasta.gz "https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=%28%28reviewed%3Atrue%29+NOT+%28xref%3Apfam-PF00014%29%29"
!zcat -f negative.fasta.gz > negative.fasta
!rm negative.fasta.gz

--2024-05-07 15:17:04--  https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=%28%28reviewed%3Atrue%29+NOT+%28xref%3Apfam-PF00014%29%29
Resolving rest.uniprot.org (rest.uniprot.org)... 193.62.193.81
Connecting to rest.uniprot.org (rest.uniprot.org)|193.62.193.81|:443... connected.
HTTP request sent, awaiting response... 200 
Length: unspecified [text/plain]
Saving to: ‘negative.fasta.gz’

negative.fasta.gz       [    <=>             ] 134.34M   422KB/s    in 5m 13s  

2024-05-07 15:22:18 (440 KB/s) - ‘negative.fasta.gz’ saved [140864598]



In [72]:
!wget -O bpti_reviewd.fasta.gz "https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=%28%28xref%3Apfam-PF00014%29+AND+%28reviewed%3Atrue%29%29"
!zcat bpti_reviewd.fasta.gz > bpti_reviewd.fasta
!rm bpti_reviewd.fasta.gz

--2024-05-07 15:25:57--  https://rest.uniprot.org/uniprotkb/stream?compressed=true&format=fasta&query=%28%28xref%3Apfam-PF00014%29+AND+%28reviewed%3Atrue%29%29
Resolving rest.uniprot.org (rest.uniprot.org)... 193.62.193.81
Connecting to rest.uniprot.org (rest.uniprot.org)|193.62.193.81|:443... connected.
HTTP request sent, awaiting response... 200 
Length: unspecified [text/plain]
Saving to: ‘bpti_reviewd.fasta.gz’

bpti_reviewd.fasta.     [ <=>                ]  41.28K  --.-KB/s    in 0.08s   

2024-05-07 15:25:57 (509 KB/s) - ‘bpti_reviewd.fasta.gz’ saved [42270]



In [76]:
# Make blast dataset for the bpti_reviewd.fasta
!makeblastdb -in bpti_reviewd.fasta -dbtype prot



Building a new DB, current time: 05/07/2024 15:27:30
New DB name:   /content/bpti_reviewd.fasta
New DB title:  bpti_reviewd.fasta
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 391 sequences in 0.00923395 seconds.


