In [185]:
import pandas as pd
import re
import numpy as np

from Bio import SeqIO

In [186]:
def _mhc_name_from_description(description):
    """Build a compact two-field MHC name (e.g. ``HLAA0101``) from a FASTA header.

    The header is expected to look like ``HLA:HLA00001 A*01:01:01:01 365 bp``:
    the allele name is the second whitespace-separated token. Only its first
    two colon-separated fields are kept, then ``*``, ``:`` and ``-`` are removed
    and the result is prefixed with ``HLA``.

    Raises:
        ValueError: if the header has no second token to read the allele from.
    """
    tokens = description.split()
    if len(tokens) < 2:
        raise ValueError(
            "Cannot find an allele name in FASTA header: {!r}".format(description)
        )
    # Split on ':' rather than slicing a fixed 8-character window, so names such
    # as 'A*02:1000' are not truncated and do not depend on the accession width.
    two_fields = ":".join(tokens[1].split(":")[:2])
    return "HLA" + re.sub(r"[*:-]", "", two_fields)


def get_proteins_for_embedding(data_path):
    """Parse protein sequences from a FASTA file into an (mhc, sequence) table.

    Arguments:

    data_path - path to the FASTA file to read

    Returns:

    pandas.DataFrame with two columns: "mhc" (compact two-field allele name,
    e.g. "HLAA0101") and "sequence" (the record's sequence as a string).
    Records that map to the same "mhc" name are collapsed into one row; the
    sequence of the last such record in the file is kept.
    """

    dict_of_sequences = {}
    for seq_record in SeqIO.parse(data_path, "fasta"):
        mhc = _mhc_name_from_description(seq_record.description)
        # Later records with the same name overwrite earlier ones on purpose:
        # one row per two-field allele.
        dict_of_sequences[mhc] = str(seq_record.seq)

    return pd.DataFrame(
        {
            "mhc": list(dict_of_sequences.keys()),
            "sequence": list(dict_of_sequences.values()),
        }
    )

In [187]:
# Parse one protein FASTA per locus (A, B, C, E), in that order, into its own
# (mhc, sequence) table.
A_gen, B_gen, C_gen, E_gen = (
    get_proteins_for_embedding(fasta_path)
    for fasta_path in (
        "/home/rude_mhc/IMGTHLA/fasta/A_prot.fasta",
        "/home/rude_mhc/IMGTHLA/fasta/B_prot.fasta",
        "/home/rude_mhc/IMGTHLA/fasta/C_prot.fasta",
        "/home/rude_mhc/IMGTHLA/fasta/E_prot.fasta",
    )
)

# Table whose `mhc` column is later matched against the parsed allele names.
bdata = pd.read_csv("/home/rude_mhc/mhc/bdata.csv")

In [188]:
# Stack the four per-locus tables into one frame with a fresh 0..n-1 index.
ABCE_gen = pd.concat([A_gen, B_gen, C_gen, E_gen], axis=0, ignore_index=True)

# Allele names that occur both in bdata and in the parsed FASTA tables.
common = set(bdata["mhc"].unique()) & set(ABCE_gen["mhc"].unique())

In [189]:
# Keep only the rows whose allele name is in `common`, preserving row order and
# renumbering the index. A vectorised boolean mask replaces the per-row lambda
# and the np.where/iloc round trip.
common_ABCE = ABCE_gen[ABCE_gen.mhc.isin(common)].reset_index(drop=True)

In [191]:
# Write the filtered (mhc, sequence) table to the current working directory,
# without the index column.
# NOTE(review): "imghtla" in the filename looks like a typo for "imgthla";
# left unchanged because other code may read this exact file name.
common_ABCE.to_csv("mhc_seq_imghtla.csv", index=False)