In [215]:
import pandas as pd
import re
import os

In [218]:
# Load the raw UniProt GFF export into memory.
with open("uniprot-reviewed_yes+AND+organism__Homo+sapiens+(Human)+[9606]_.gff", "r") as gff_in:
    lines = gff_in.readlines()

# Write a cleaned temporary copy of the export:
#   * lines starting with '##' (e.g. '##gff-version', '##sequence-region') are dropped;
#   * every kept line has its trailing whitespace stripped (the original note
#     mentions a trailing tab) and is terminated with a single newline.
with open("uniprot-all_temp.txt", "w") as tmp_out:
    tmp_out.writelines(
        raw.rstrip() + '\n'
        for raw in lines
        if not raw.startswith('##')
    )

In [219]:
# Column labels applied to the header-less, tab-separated temp file.
gff_columns = ['sequence', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
df = pd.read_csv(
    'uniprot-all_temp.txt',
    sep='\t',
    header=None,
    names=gff_columns,
)

In [220]:
# Keep only the rows whose feature column is exactly 'Modified residue'.
is_modified_residue = df['feature'].eq('Modified residue')
df = df.loc[is_modified_residue]


In [221]:
# Empty output frame; the column order here is the column order of the written file.
output_columns = [
    "uniprot_entry",
    "uniprot_accession",
    "position",
    "type",
    "pubmed_ids",
    "sequence",
]
df_output = pd.DataFrame(columns=output_columns)

In [222]:
# find type name by "Note=..."
def get_type_value(attributes):
    """Extract the value of the ``Note=`` field from a GFF attributes string.

    Parameters
    ----------
    attributes : str
        The attributes text of one GFF row, a ';'-separated list of
        ``key=value`` items.

    Returns
    -------
    str
        The text that follows the first ``Note=`` up to the next ';'
        (or the next ``Note=``), or '' when there is no ``Note=`` field or
        when ``attributes`` is not a string.
    """
    # A missing cell is not a string (e.g. the float NaN pandas uses for
    # missing values); treat it as "no note" instead of raising AttributeError.
    if not isinstance(attributes, str):
        return ''
    split_by_note = attributes.split('Note=')
    if len(split_by_note) < 2:
        return ''
    # Only the first ';'-delimited piece after 'Note=' is the note itself.
    return split_by_note[1].split(';', 1)[0]

In [223]:
def get_pubmed_ids(attributes):
    """Collect the PubMed identifiers referenced in a GFF attributes string.

    Every run of digits that directly follows ``PubMed:`` or ``PMID:`` is
    picked up. Duplicates are removed and the ids are returned in order of
    first appearance, so the result is identical from run to run.

    Parameters
    ----------
    attributes : str
        The attributes text of one GFF row.

    Returns
    -------
    str
        The unique ids joined with ';', or '' when none are found or when
        ``attributes`` is not a string.
    """
    # A missing cell is not a string (e.g. the float NaN pandas uses for
    # missing values); re.findall would raise TypeError on it.
    if not isinstance(attributes, str):
        return ''
    # One raw-string pattern covers both prefixes and keeps matches in
    # left-to-right order.
    found_ids = re.findall(r'(?:PubMed|PMID):(\d+)', attributes)
    # dict.fromkeys de-duplicates while preserving insertion order; joining a
    # set of strings would give an order that can change between interpreter
    # runs because of string-hash randomization.
    return ';'.join(dict.fromkeys(found_ids))

In [224]:
# The accession is taken directly from the GFF 'sequence' column.
df_output['uniprot_accession'] = df['sequence']

# Position: a feature whose start equals its end keeps the bare start value;
# otherwise it becomes "start;end".
# Column-wise where()/map() replace the row-wise apply(axis=1) calls, which are
# slow and, when df has no rows, return a DataFrame instead of a Series and so
# cannot be assigned to a single column.
position_span = df['start'].astype(str) + ';' + df['end'].astype(str)
df_output['position'] = df['start'].astype(object).where(df['start'] == df['end'], position_span)

# Both helpers only need the attributes text, so map them over that column.
df_output['type'] = df['attributes'].map(get_type_value)
df_output['pubmed_ids'] = df['attributes'].map(get_pubmed_ids)


In [225]:
# The GFF file provides neither a uniprot_entry nor a sequence value, so both
# output columns are filled with empty strings.
for blank_column in ("uniprot_entry", "sequence"):
    df_output[blank_column] = ''

In [229]:
# Delete the intermediate file. The existence check keeps this cell re-runnable:
# an unguarded os.remove raises FileNotFoundError on a second execution, before
# the output file gets written.
if os.path.exists("uniprot-all_temp.txt"):
    os.remove("uniprot-all_temp.txt")

# Write the result as tab-separated text with neither a header row nor the index.
df_output.to_csv('uniprot_output.txt', index=False, sep='\t', header=False)