Author: Irsyad Adam

In [None]:
import pandas as pd
from tqdm import tqdm
import requests

In [37]:
def extract_pmid_from_unid(unid, timeout = 30) -> "list[str] | None":
    """
    grabs all pmids cited in the uniprot flat-file (.txt) entry of one uniprot id
    @param unid is the uniprot id that is going to be grabbed
    @param timeout is the number of seconds to wait for uniprot before requests gives up
    @return is a list with all of the pmids associated EXCLUDING THE NUCLEOTIDE SEQ references,
    or None (after printing the status code) when the request does not come back with HTTP 200
    """
    #get the url
    url = 'https://www.uniprot.org/uniprot/' + unid + '.txt'

    #a timeout keeps a single stalled request from hanging a long batch run
    response = requests.get(url=url, timeout=timeout)

    #if not successful, report the status code and bail out
    if response.status_code != 200:
        print('Error, Status Code: %s' % response.status_code)
        return None

    #new line delimiter
    lines = response.text.splitlines()
    pmid_list = []
    #search
    for i, line in enumerate(lines):
        tokens = line.split()
        #RX is the line code for reference cross-references; compare the leading line code only,
        #so an 'RX' inside a gene name or the sequence is not mistaken for a reference line
        if not tokens or tokens[0] != 'RX':
            continue
        #want to exclude the references whose three preceding lines say nucleotide sequence
        #(same three-line window as before; clamped at 0 so the first lines of the entry
        #do not wrap around and read the last lines instead)
        preceding = lines[max(0, i - 3):i]
        if any('NUCLEOTIDE SEQUENCE' in previous for previous in preceding):
            continue
        #an RX line can carry several identifiers (PubMed=...; DOI=...;), keep only PubMed
        for token in tokens[1:]:
            if token.startswith('PubMed='):
                pmid_list.append(token[len('PubMed='):].rstrip(';'))
    return pmid_list

def unid_pmid_to_df(io = "edge_list.csv") -> pd.DataFrame:
    """
    takes a csv file, gets the uniprot id column, and gets every pmid from that uniprotid, excluding the 'nucleotide
    sequences'
    @param io is the csv file to get parsed (anything pd.read_csv accepts); it must have a UNIPROT_ID column
    @return df is the df with uniprotid-pmid: one row per unique uniprot id, in order of first appearance,
    with the PMID column holding whatever extract_pmid_from_unid returned for that id
    """
    df = pd.read_csv(io)
    #get all unique identifiers: unique() keeps first-appearance order so the rows come out the same
    #on every run, and dropna() skips blank ids that cannot be turned into a uniprot url
    unid = df["UNIPROT_ID"].dropna().unique().tolist()
    print("---Import Completed---", flush = True)
    #iterate, one lookup per identifier
    pmid_list = [extract_pmid_from_unid(element) for element in tqdm(unid, desc = "Extracting PMIDs: ")]
    #print
    print("----Extraction Done---", flush = True)
    return pd.DataFrame({"UNIPROT_ID" : unid, "PMID" : pmid_list})

In [36]:
# Build the uniprot-id -> pmid table from the default "edge_list.csv";
# this performs one uniprot request per unique id, so the cell is slow to run.
df = unid_pmid_to_df()
# Bare last expression so the notebook renders the resulting DataFrame.
df

---Import Completed---


Extracting PMIDs:   2%|▏         | 20/828 [00:08<05:57,  2.26it/s]


KeyboardInterrupt: 