In [2]:
import pandas as pd
import numpy as np
import gzip
from tqdm.notebook import tqdm
import re

In [3]:
import torch

In [104]:
# Input: path to the UniProt flat file to parse. The ".dat.gz" file is gzip-compressed
# and is opened with gzip.open in the parsing cell further down.
uniprot_dat_file = "/projects/deepgreen/jlaw/inputs/uniprot/2022-03/uniprot_sprot.dat.gz"
# Alternative input, presumably a single-entry file (P09884) for quick testing.
# NOTE(review): it has no .gz suffix -- confirm the gzip.open call can read it before switching.
# uniprot_dat_file = "/projects/deepgreen/jlaw/inputs/uniprot/2022-03/P09884.txt"

In [119]:
# these are a few example features lines:
# FT   CHAIN           1..1339
# FT                   /note="DNA polymerase alpha catalytic subunit"
# FT                   /id="PRO_0000046433"
# FT   ZN_FING         1179..1216
# FT                   /note="CysA-type"
# FT   REGION          1..90
# FT                   /note="Disordered"
# FT                   /evidence="ECO:0000256|SAM:MobiDB-lite"
def parse_feature_lines(feature_lines):
    """Parse the FT (feature table) lines of one UniProt entry.

    The input is the text of the FT lines with the leading line code already
    removed and the lines joined by newlines.  A line that starts in the first
    column is a feature header ("<feature key> <location>"); indented lines are
    qualifiers ('/name="value"') or the wrapped continuation of the previous
    qualifier's value.

    Every feature header produces its own row, so several features of the same
    type at the same location (e.g. multiple VARIANT lines for one residue)
    are all kept; the returned index is therefore not necessarily unique.

    Args:
        feature_lines: str, newline-separated FT line contents.

    Returns:
        pandas.DataFrame indexed by ('feature', 'sites') with one column per
        qualifier name.  When an 'evidence' qualifier is present it is
        rewritten as a comma-joined list of evidence codes and a parallel
        comma-joined 'sources' column is added (an empty string stands for an
        evidence code without a source).  Blank input gives an empty frame.

    Raises:
        ValueError: if a header does not consist of exactly two
            whitespace-separated tokens, or if a qualifier/continuation line
            appears before anything it could belong to.
    """
    # one (feature_id, seq_pos, qualifiers) record per header line; a list is
    # used instead of a dict so repeated (feature, location) pairs are kept
    records = []
    qualifiers = None  # qualifier dict of the feature currently being read
    note_id = None     # name of the qualifier a continuation line extends
    for field_text in feature_lines.split('\n'):
        if not field_text.strip():
            # nothing to parse on a blank line
            continue
        if field_text[0] != ' ':
            # feature header, e.g. "ZN_FING         1179..1216"
            feature_id, seq_pos = field_text.split()
            qualifiers = {}
            note_id = None
            records.append((feature_id, seq_pos, qualifiers))
            continue
        if qualifiers is None:
            raise ValueError(
                f"qualifier line found before any feature header: {field_text!r}")
        # remove the leading spaces, and remove quotes
        text = field_text.lstrip().replace('"', '')
        if not text:
            continue
        if text[0] == '/' and '=' in text.split()[0]:
            # start of a new qualifier: /name=value
            note_id, note = text.split('=', maxsplit=1)
            note_id = note_id[1:]
            qualifiers[note_id] = note
        elif note_id is None:
            raise ValueError(
                f"continuation line found before any qualifier: {field_text!r}")
        else:
            # wrapped value: glue it onto the qualifier being read
            qualifiers[note_id] += ' ' + text

    # split each evidence item "CODE|SOURCE" into parallel code / source lists;
    # sources are optional for a few evidence codes:
    # https://www.uniprot.org/help/evidence_table
    for _, _, notes in records:
        if 'evidence' not in notes:
            continue
        ev_codes = []
        sources = []
        for ev in notes['evidence'].split(', '):
            ev_code, _, source = ev.partition('|')
            ev_codes.append(ev_code)
            sources.append(source)
        notes['evidence'] = ','.join(ev_codes)
        notes['sources'] = ','.join(sources)

    index = pd.MultiIndex.from_arrays(
        [[rec[0] for rec in records], [rec[1] for rec in records]],
        names=['feature', 'sites'],
    )
    rows = [rec[2] for rec in records]
    if not rows:
        # no features at all: still return the expected two-level index
        return pd.DataFrame(index=index)
    return pd.DataFrame(rows, index=index)
    

In [52]:
def parse_fields(fields):
    """Build the feature table of one UniProt entry, labelled with its IDs.

    Args:
        fields: dict mapping a line code ('AC', 'OX', 'FT', ...) to the text
            of those lines for a single entry.

    Returns:
        The DataFrame produced by parse_feature_lines for the entry's 'FT'
        text, with two extra outer index levels added: 'taxon_id' (outermost)
        and 'uniprot_id'.

    Raises:
        KeyError: if 'AC', 'OX' or 'FT' is missing from ``fields``.
        IndexError: if the 'OX' text contains no "NCBI_TaxID=".
    """
    # example line:
    # AC   P09884; Q86UQ7;
    # the first accession is used as the entry ID
    uniprot_id = fields['AC'].split(';')[0].strip()
    # example line:
    # OX   NCBI_TaxID=9606;
    taxon_field = fields['OX'].split('NCBI_TaxID=')[1].split(';')[0]
    # the ID can be followed by an evidence tag, e.g. "NCBI_TaxID=1234 {ECO:...};"
    # keep only the part before it so the index holds the bare taxon ID
    taxon_id = taxon_field.split('{')[0].strip()

    feat_df = parse_feature_lines(fields['FT'])
    # wrapping the frame in a one-item dict makes pd.concat add the dict key
    # as a new outer index level
    df = pd.concat({uniprot_id: feat_df}, names=["uniprot_id"])
    df = pd.concat({taxon_id: df}, names=["taxon_id"])

    return df

In [120]:
# one feature DataFrame per UniProt entry; they are combined after the loop
uniprot_data = []
with gzip.open(uniprot_dat_file, 'r') as dat_file:
    # line code -> accumulated text for the entry currently being read
    fields = {}
    for line in tqdm(dat_file, total=72687654):
        line = line.decode().rstrip()
        # the first two characters are the line code, the content starts at the sixth
        field_id, field_text = line[:2], line[5:]

        # "//" closes the entry: parse what was collected and start a new one
        if field_id == "//":
            uniprot_data.append(parse_fields(fields))
            fields = {}
            continue

        # a line code seen before continues that field on a new line
        if field_id in fields:
            fields[field_id] = fields[field_id] + '\n' + field_text
        else:
            fields[field_id] = field_text

uniprot_data = pd.concat(uniprot_data)
uniprot_data.head()

  0%|          | 0/72687654 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,id,note,evidence,sources
taxon_id,uniprot_id,feature,sites,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
654924,Q6GZX4,CHAIN,1..256,PRO_0000410512,Putative transcription factor 001R,,
654924,Q6GZX3,CHAIN,1..320,PRO_0000410509,Uncharacterized protein 002L,,
654924,Q6GZX3,TRANSMEM,301..318,,Helical,ECO:0000255,
654924,Q6GZX3,REGION,261..294,,Disordered,ECO:0000256,SAM:MobiDB-lite
345201,Q197F8,CHAIN,1..458,PRO_0000377938,Uncharacterized protein 002R,,


In [122]:
import os
# Write the table next to the input file.  os.path.join is used instead of
# string concatenation so that an input path without a directory part gives a
# relative output path rather than one at the filesystem root.
out_file = os.path.join(os.path.dirname(uniprot_dat_file), 'uniprot_seq_features.csv.gz')
print(f"writing: {out_file}")
# pandas infers gzip compression from the ".gz" extension
uniprot_data.to_csv(out_file)

writing: /projects/deepgreen/jlaw/inputs/uniprot/2022-03/uniprot_seq_features.csv.gz


In [103]:
# turn the index levels into ordinary columns so they can be filtered on
df = uniprot_data.reset_index()
# keep only the MOD_RES features
df.loc[df['feature'] == "MOD_RES"]

Unnamed: 0,taxon_id,uniprot_id,feature,sites,note,id,evidence,sources
19,9606,P09884,MOD_RES,174,Phosphothreonine,,ECO:0007744,"PubMed:19690332,PubMed:20068231"
20,9606,P09884,MOD_RES,186,Phosphoserine,,ECO:0007744,"PubMed:18669648,PubMed:19690332,PubMed:2006823..."
21,9606,P09884,MOD_RES,190,Phosphoserine,,ECO:0007744,"PubMed:18669648,PubMed:20068231,PubMed:23186163"
22,9606,P09884,MOD_RES,209,Phosphoserine,,ECO:0007744,"PubMed:18669648,PubMed:20068231,PubMed:2140669..."
23,9606,P09884,MOD_RES,224,N6-acetyllysine,,ECO:0000250,UniProtKB:P33609
24,9606,P09884,MOD_RES,406,Phosphothreonine,,ECO:0007744,PubMed:23186163
25,9606,P09884,MOD_RES,970,N6-succinyllysine,,ECO:0000250,UniProtKB:P33609


In [84]:
# turn the index levels into ordinary columns
df = uniprot_data.reset_index()
# distinct entry IDs present in the table
df.loc[:, 'uniprot_id'].unique()

array(['P09884', 'O00874'], dtype=object)

In [85]:
# display the flattened table built in the previous cell (the rendered output is truncated)
df

Unnamed: 0,taxon_id,uniprot_id,feature,sites,note,id,evidence
0,9606,P09884,CHAIN,1..1462,"""DNA polymerase alpha catalytic subunit""","""PRO_0000046428""",
1,9606,P09884,ZN_FING,1283..1318,"""CysA-type""",,"""ECO:0000250|UniProtKB:P15436"""
2,9606,P09884,REGION,1..33,"""Disordered""",,"""ECO:0000256|SAM:MobiDB-lite"""
3,9606,P09884,REGION,98..123,"""Disordered""",,"""ECO:0000256|SAM:MobiDB-lite"""
4,9606,P09884,REGION,232..251,"""Disordered""",,"""ECO:0000256|SAM:MobiDB-lite"""
...,...,...,...,...,...,...,...
137,5661,O00874,METAL,1216,"""Zinc""",,"""ECO:0000250"""
138,5661,O00874,METAL,1233,"""Iron-sulfur (4Fe-4S)""",,"""ECO:0000250"""
139,5661,O00874,METAL,1243,"""Iron-sulfur (4Fe-4S)""",,"""ECO:0000250"""
140,5661,O00874,METAL,1271,"""Iron-sulfur (4Fe-4S)""",,"""ECO:0000250"""
