In [23]:
import os
import pandas as pd
import re

# Input and output locations, relative to the notebook's working directory.
data_dir = '../data'
raw_data_file = os.path.join(data_dir, 'enzyme.dat')
dest = os.path.join(data_dir, 'processed')

# exist_ok=True creates the output directory only when it is missing, without
# the separate exists() check (which could race with another process), and
# keeps the cell safe to re-run.
os.makedirs(dest, exist_ok=True)

# Parse the raw flat file into two dicts keyed by the entry ID:
#   ID_and_uniprot[ID] -> list of tokens collected from the entry's DR lines
#   descriptions[ID]   -> {"DE": ..., "AN": ..., ...}; the text of every line
#                         with that code, concatenated and space-separated
ID_and_uniprot = {}
descriptions = {}
ID = None  # ID of the entry currently being read; None until the first ID line

# Two-letter line codes whose text is accumulated per entry.
description_codes = ("DE", "AN", "CA", "CF", "CC", "PR")

with open(raw_data_file, 'r') as f:
    for line in f:
        code = line[:2]
        # Payload after the 5-character line prefix, without the newline.
        content = line[5:].replace('\n', '')

        if code == 'ID':
            # A new entry starts: (re)initialise its containers.
            ID = content
            ID_and_uniprot[ID] = []
            descriptions[ID] = {desc: "" for desc in description_codes}
            continue

        if ID is None:
            # Lines seen before the first ID line belong to no entry. Skipping
            # them here guards the DR branch too, which previously indexed
            # ID_and_uniprot[None] and raised KeyError.
            continue

        if code == 'DR':
            # Capture every word that is preceded by whitespace and directly
            # followed by a comma. Matching on the full line (not `content`)
            # keeps the whitespace in front of the first token available.
            for accession in re.findall(r"\s(\w+),", line):
                # Tokens of 2 characters or fewer are dropped; presumably too
                # short to be real accessions -- TODO confirm the threshold.
                if len(accession) > 2:
                    ID_and_uniprot[ID].append(accession)
        elif code in description_codes:
            descriptions[ID][code] += content + " "

# Flatten the parsed entries into one (protein, EC) pair per protein listed
# under each entry. Entries with an empty protein list contribute no pairs.
protein_ec_pairs = [
    (protein, ec_number)
    for ec_number, proteins in ID_and_uniprot.items()
    for protein in proteins
]

# Parallel column lists, one element per pair. The module-level names are kept
# as they were in case later cells refer to them.
IDs = [protein for protein, _ in protein_ec_pairs]
ECs = [ec_number for _, ec_number in protein_ec_pairs]
DEs = [descriptions[ec_number]["DE"] for ec_number in ECs]
ANs = [descriptions[ec_number]["AN"] for ec_number in ECs]
CAs = [descriptions[ec_number]["CA"] for ec_number in ECs]
CFs = [descriptions[ec_number]["CF"] for ec_number in ECs]
CCs = [descriptions[ec_number]["CC"] for ec_number in ECs]
PRs = [descriptions[ec_number]["PR"] for ec_number in ECs]

# One row per (protein, EC) pair with the entry's description fields repeated
# on every row; exact duplicate rows are removed.
data = pd.DataFrame({
    'protein': IDs,
    'EC': ECs,
    'DE': DEs,
    'AN': ANs,
    'CA': CAs,
    'CF': CFs,
    'CC': CCs,
    'PR': PRs
}).drop_duplicates()

# Build the output path with os.path.join, like the other paths in this
# notebook; index=False leaves the row index out of the CSV.
data.to_csv(os.path.join(dest, '01_uniprotID_and_EC_raw.csv'), index=False, sep=',')
data

Unnamed: 0,protein,EC,DE,AN,CA,CF,CC,PR
0,P07327,1.1.1.1,Alcohol dehydrogenase.,Aldehyde reductase.,(1) A primary alcohol + NAD(+) = an aldehyde +...,Zn(2+) or Fe cation.,-!- Acts on primary or secondary alcohols or h...,PROSITE; PDOC00058; PROSITE; PDOC00059; PROSIT...
1,P28469,1.1.1.1,Alcohol dehydrogenase.,Aldehyde reductase.,(1) A primary alcohol + NAD(+) = an aldehyde +...,Zn(2+) or Fe cation.,-!- Acts on primary or secondary alcohols or h...,PROSITE; PDOC00058; PROSITE; PDOC00059; PROSIT...
2,Q5RBP7,1.1.1.1,Alcohol dehydrogenase.,Aldehyde reductase.,(1) A primary alcohol + NAD(+) = an aldehyde +...,Zn(2+) or Fe cation.,-!- Acts on primary or secondary alcohols or h...,PROSITE; PDOC00058; PROSITE; PDOC00059; PROSIT...
3,P25405,1.1.1.1,Alcohol dehydrogenase.,Aldehyde reductase.,(1) A primary alcohol + NAD(+) = an aldehyde +...,Zn(2+) or Fe cation.,-!- Acts on primary or secondary alcohols or h...,PROSITE; PDOC00058; PROSITE; PDOC00059; PROSIT...
4,P25406,1.1.1.1,Alcohol dehydrogenase.,Aldehyde reductase.,(1) A primary alcohol + NAD(+) = an aldehyde +...,Zn(2+) or Fe cation.,-!- Acts on primary or secondary alcohols or h...,PROSITE; PDOC00058; PROSITE; PDOC00059; PROSIT...
...,...,...,...,...,...,...,...,...
251865,Q66EN1,7.6.2.15,ABC-type thiamine transporter.,Thiamine ABC transporter. Thiamine transportin...,ATP + H(2)O + thiamine-[thiamine-binding prote...,,-!- ATP-binding cassette (ABC) type transporte...,
251866,Q9H7F0,7.6.2.16,ABC-type putrescine transporter.,Putrescine ABC transporter. Putrescine transpo...,ATP + H(2)O + putrescine-[putrescine-binding p...,,-!- ATP-binding cassette (ABC) type transporte...,
251867,Q95JN5,7.6.2.16,ABC-type putrescine transporter.,Putrescine ABC transporter. Putrescine transpo...,ATP + H(2)O + putrescine-[putrescine-binding p...,,-!- ATP-binding cassette (ABC) type transporte...,
251868,Q5XF89,7.6.2.16,ABC-type putrescine transporter.,Putrescine ABC transporter. Putrescine transpo...,ATP + H(2)O + putrescine-[putrescine-binding p...,,-!- ATP-binding cassette (ABC) type transporte...,


In [21]:
from transformers import GPT2Tokenizer, GPT2Model

# Load the pretrained 'gpt2-large' tokenizer and model by name.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
model = GPT2Model.from_pretrained('gpt2-large')

# Serialise the first row of `data` as "column: value" pairs joined by "; ".
row = data.iloc[0]
text = '; '.join(f'{key}: {value}' for key, value in row.items())

# Tokenise the text into PyTorch tensors. `encoded_input` was previously never
# defined in the notebook, so this cell raised NameError on a fresh kernel and
# `text` went unused.
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [22]:
# Display the model's last_hidden_state for the encoded row. The rendered
# output is a 3-D tensor; presumably (batch, tokens, hidden size) -- TODO
# confirm against the Hugging Face model-output documentation.
output.last_hidden_state

tensor([[[ 0.3854, -0.1242, -0.5728,  ..., -0.2460, -0.0125, -1.8204],
         [ 0.2592,  0.9438, -0.2834,  ...,  0.0692,  0.2932, -0.4385],
         [-0.1204,  0.6497, -0.0799,  ...,  0.2835,  0.8455, -0.8077],
         ...,
         [ 0.6821,  0.4803, -1.1737,  ..., -1.1457,  0.1732,  0.2047],
         [ 0.2706, -0.8184, -0.6003,  ..., -0.3832,  0.1619, -0.3772],
         [-0.1422,  0.5846,  0.4182,  ...,  0.3965, -0.6783, -1.2184]]],
       grad_fn=<ViewBackward0>)