In [92]:
import pandas as pd

# Read the tab-separated UniProt export, discard every row that has a missing
# value in any column, and renumber the remaining rows from 0.
df = (
    pd.read_csv("data/uniprotkb_reviewed_true_2023_09_25.tsv", sep="\t")
    .dropna()
    .reset_index(drop=True)
)

In [99]:
import re

def remove_pubmed_pattern_1(text):
    """Delete parenthesised PubMed citation groups from ``text``.

    A citation group is one or more ``PubMed:<digits>`` entries separated by
    ``", "`` and wrapped in parentheses, e.g. ``(PubMed:123, PubMed:456)``.
    The run of whitespace directly in front of the opening parenthesis is
    removed together with the group; a group that is not preceded by
    whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matching citation group removed.
    """
    citation_regex = r'\s+\(PubMed:\d+(?:, PubMed:\d+)*\)'
    return re.sub(citation_regex, '', text)

# One evidence tag: "{ECO:<digits>", any payload, "}", then a "." or ";".
# The payload is matched with [^{}]* so a match can never extend past its own
# closing brace, and no assumption is made about the characters used in the
# source names inside the tag (e.g. hyphens in "HAMAP-Rule").
_ECO_TAG_RE = re.compile(r'\{ECO:\d+[^{}]*\}[.;]')


def remove_pubmed_pattern_2(text):
    """Delete ``{ECO:...}`` evidence tags and all semicolons from ``text``.

    Each tag of the form ``{ECO:<digits>...}`` that is immediately followed by
    ``.`` or ``;`` is removed together with that one trailing character.
    Afterwards every remaining ``;`` in the string is dropped as well.
    Surrounding whitespace is not touched, so removing a tag can leave
    doubled or trailing spaces behind.

    The previous pattern matched the tag payload with ``[^,]+``, which is
    allowed to cross the closing brace: when two tags had no comma between
    them, one match ran from the first ``{`` to the last ``}.`` and silently
    deleted the text in between. It also could not match tags whose source
    name contains a hyphen.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without evidence tags and without semicolons.
    """
    result = _ECO_TAG_RE.sub('', text)
    return result.replace(";", "")

def preproc_text(x):
    """Convert ``x`` to a string and run both cleanup passes over it.

    The value is passed through ``str`` first, then through
    ``remove_pubmed_pattern_1`` and finally through
    ``remove_pubmed_pattern_2``, in that order.

    Args:
        x: Any value; it is stringified before cleaning.

    Returns:
        The cleaned string.
    """
    return remove_pubmed_pattern_2(remove_pubmed_pattern_1(str(x)))

# Example usage:
# original_text = df.loc[414, "function"]
# original_text = 'Component of the nascent polypeptide-associated complex (NAC), a dynamic component of the ribosomal exit tunnel, protecting the emerging polypeptides from interaction with other cytoplasmic proteins to ensure appropriate nascent protein targeting. The NAC complex also promotes mitochondrial protein import by enhancing productive ribosome interactions with the outer mitochondrial membrane and blocks the inappropriate interaction of ribosomes translating non-secretory nascent polypeptides with translocation sites in the membrane of the endoplasmic reticulum. EGD1 may act as a transcription factor that exert a negative effect on the expression of several genes that are transcribed by RNA polymerase II. {ECO:0000250}.'
# modified_text = preproc_text(original_text)
# modified_text = remove_pubmed_pattern_1(original_text)
# modified_text = remove_pubmed_pattern_2(modified_text)

# print("Original Text:", original_text)

# print()
# print("Modified Text:", modified_text)

# Replace the "function" column with its cleaned version, drop any row that
# still contains a missing value, renumber the rows, and export the result.
df = (
    df.assign(function=df["function"].map(preproc_text))
    .dropna()
    .reset_index(drop=True)
)
df.to_csv("data/prepared_uniprot.csv", index=False)

In [100]:
from datasets import load_dataset

# Load the prepared CSV as a single "train" split, keep only the columns that
# are not listed below, then split 80/20 into train/test with a fixed seed.
custom_dataset = (
    load_dataset("csv", data_files="data/prepared_uniprot.csv", split="train")
    .remove_columns(["entry", "entry_name", "protein_name", "sequence"])
    .train_test_split(test_size=0.2, seed=42)
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [102]:
from transformers import AutoTokenizer
# Load the tokenizer registered under the "bert-base-cased" checkpoint name;
# tokenize_function reads this module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the ``"function"`` field of ``examples``.

    The texts are handed to the module-level ``tokenizer`` with
    ``padding="max_length"`` and ``truncation=True``.

    Args:
        examples: Mapping-like object with a ``"function"`` entry holding the
            text(s) to tokenize.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    function_texts = examples["function"]
    return tokenizer(function_texts, padding="max_length", truncation=True)

# Run tokenize_function over custom_dataset with batched=True (the function is
# handed batches of rows rather than single rows — see the datasets `map`
# docs). The result is bound to a new name; custom_dataset is not reassigned.
tokenized_datasets = custom_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/371516 [00:00<?, ? examples/s]

Map:   0%|          | 0/92879 [00:00<?, ? examples/s]

In [103]:
# Write the tokenized dataset to the local "prepared_uniprot" directory.
tokenized_datasets.save_to_disk("prepared_uniprot")

Saving the dataset (0/3 shards):   0%|          | 0/371516 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/92879 [00:00<?, ? examples/s]