In [4]:
import pandas as pd 

def readPlatform(filename:str) -> pd.DataFrame:
    """Load the platform data table of an annotation file into a DataFrame.

    Every line up to and including the first one that contains
    ``!platform_table_begin`` is skipped; the remainder of the file is parsed
    as tab-separated text whose first row is the column header.

    Parameters
    ----------
    filename : str
        Path of the annotation file (e.g. ``'GPL3154.annot'``).

    Returns
    -------
    pandas.DataFrame
        The parsed table, one row per table line.

    Raises
    ------
    ValueError
        If the file contains no ``!platform_table_begin`` line.
    """
    begin_marker = '!platform_table_begin'
    end_marker = '!platform_table_end'

    # The context manager closes the handle even if parsing raises.
    with open(filename, 'r') as f:
        # readline() returns '' at end of file; using it as the sentinel stops
        # the scan there instead of looping forever when the marker is missing.
        for line in iter(f.readline, ''):
            if begin_marker in line:
                break
        else:
            raise ValueError(
                f"'{begin_marker}' not found in {filename!r}"
            )
        # The handle now points at the header row of the table.
        df = pd.read_csv(f, delimiter='\t')

    # SOFT-style tables are normally closed by a '!platform_table_end' line,
    # which read_csv would otherwise keep as a bogus last row. Dropping it is
    # a no-op for files that do not contain the terminator.
    first_column = df.iloc[:, 0].astype(str).str.strip()
    return df[first_column != end_marker]

In [5]:
# Parse the data table of the GPL3154 annotation file into a DataFrame.
# NOTE(review): a later cell defines a function that is also named `platform`;
# once that cell has run, this name no longer refers to the DataFrame.
platform = readPlatform('GPL3154.annot')
# Export the table as tab-separated text. `index` is left at its default, so
# the DataFrame index is written as the first column.
platform.to_csv('platform.csv', sep='\t')

In [35]:
import re

def platflocation(locstr:str) -> list:
    """Parse a chromosome-annotation string into a list of location records.

    The string may hold several locations separated by ``///``. Judging from
    the parsing below and the notebook output, each one looks like
    ``NC_000913.3 (4586949..4587863)`` or, for the reverse strand,
    ``NC_004431.1 (4495623..4497122, complement)``.

    Parameters
    ----------
    locstr : str
        Raw annotation text. ``'nan'``, an empty/blank string, or a non-string
        value (e.g. ``None`` or a float NaN) is treated as "no location".

    Returns
    -------
    list of dict
        One dict per location with the keys ``'_id'`` (sequence accession with
        ``.`` replaced by ``_``), ``'_start'`` and ``'_end'`` (kept as the
        strings found in the text) and ``'_strand'`` (``'-'`` when the entry
        contains ``complement``, otherwise ``'+'``).

    Raises
    ------
    ValueError
        If a non-blank entry does not contain a ``(start..end`` range.
    """
    # Missing values: nothing to parse.
    if not isinstance(locstr, str):
        return []
    locstr = locstr.strip()
    if not locstr or locstr == 'nan':
        return []

    loclist = []
    for fragment in locstr.split('///'):
        if not fragment.strip():
            # e.g. a trailing '///' separator
            continue

        # Accession is everything before the opening parenthesis.
        seq_id = fragment.split('(')[0].strip().replace('.', '_')
        on_complement = 'complement' in fragment

        # Raw strings keep the regex escapes from being read as (invalid)
        # string escapes. The start sits between '(' and '..'.
        start_match = re.search(r'(?<=\().*(?=\.\.)', fragment)
        # The end is closed by ',' on the complement strand, by ')' otherwise.
        if on_complement:
            end_match = re.search(r'(?<=\.\.).*(?=,)', fragment)
        else:
            end_match = re.search(r'(?<=\.\.).*(?=\))', fragment)

        if start_match is None or end_match is None:
            raise ValueError(
                f"cannot parse chromosome location {fragment!r}"
            )

        loclist.append(
            {
                '_id': seq_id,
                '_start': start_match.group(0),
                '_end': end_match.group(0),
                '_strand': '-' if on_complement else '+'
            }
        )
    return loclist

In [38]:
import pandas as pd 

def platform(fn:str) -> dict:
    """Build a nested dictionary of probe annotations from an annotation file.

    Lines up to and including the first one containing
    ``!platform_table_begin`` are skipped; the rest is read as a tab-separated
    table that must provide the columns ``ID``, ``Gene symbol``,
    ``Platform_ORF`` and ``Chromosome annotation``.

    Parameters
    ----------
    fn : str
        Path of the annotation file (e.g. ``'GPL3154.annot'``).

    Returns
    -------
    dict
        ``{accession: [entry, ...]}`` where ``accession`` is the file's base
        name up to its first ``.`` and every entry is a dict with the keys
        ``'id'``, ``'gene_symbol'`` (list obtained by splitting on ``///``),
        ``'orf'`` and ``'location'`` (result of ``platflocation``). Missing
        cells come out as the string ``'nan'`` because values are passed
        through ``str()``.

    Raises
    ------
    ValueError
        If the file contains no ``!platform_table_begin`` line.
    """
    import os  # local import keeps this cell runnable on its own

    begin_marker = '!platform_table_begin'
    end_marker = '!platform_table_end'

    # The context manager closes the handle even if parsing raises.
    with open(fn, 'r') as f:
        # readline() yields '' at end of file; using it as the sentinel avoids
        # an endless loop when the marker is absent.
        for line in iter(f.readline, ''):
            if begin_marker in line:
                break
        else:
            raise ValueError(f"'{begin_marker}' not found in {fn!r}")
        df = pd.read_csv(f, delimiter='\t')

    # Use only the file name, so a directory component (or a dot inside one)
    # cannot leak into the accession key.
    platfAcc = os.path.basename(fn).split('.')[0]

    entries = []
    for _, r in df.iterrows():
        probe_id = str(r['ID']).strip()
        # SOFT-style tables normally end with a '!platform_table_end' line,
        # which read_csv returns as a final pseudo-row; it is not a probe.
        if probe_id == end_marker:
            continue
        entries.append({
            'id': probe_id,
            'gene_symbol': str(r['Gene symbol']).strip().split('///'),
            'orf': str(r['Platform_ORF']).strip(),
            'location': platflocation(str(r['Chromosome annotation']).strip())
        })
    return {platfAcc: entries}

In [39]:
# Build the nested {accession: [probe entries]} dictionary for GPL3154;
# the query cells below search this object.
p = platform('GPL3154.annot')
# Bare expression: the notebook renders the whole dictionary as the cell output.
p

{'GPL3154': [{'id': '1759068_at',
   'gene_symbol': ['c4728'],
   'orf': 'c4728',
   'location': [{'_id': 'NC_004431_1',
     '_start': '4495623',
     '_end': '4497122',
     '_strand': '-'}]},
  {'id': '1759069_at',
   'gene_symbol': ['mrr'],
   'orf': 'b4351',
   'location': [{'_id': 'NC_000913_3',
     '_start': '4586949',
     '_end': '4587863',
     '_strand': '+'}]},
  {'id': '1759070_s_at', 'gene_symbol': ['nan'], 'orf': 'nan', 'location': []},
  {'id': '1759071_s_at',
   'gene_symbol': ['yfgF', 'Z3766', 'yfgF', 'ECs3365'],
   'orf': 'b2503',
   'location': [{'_id': 'NC_004431_1',
     '_start': '2883659',
     '_end': '2885902',
     '_strand': '-'},
    {'_id': 'NC_002655_2',
     '_start': '3412673',
     '_end': '3414916',
     '_strand': '-'},
    {'_id': 'NC_000913_3',
     '_start': '2626695',
     '_end': '2628938',
     '_strand': '-'},
    {'_id': 'NC_002695_1',
     '_start': '3342837',
     '_end': '3345080',
     '_strand': '-'}]},
  {'id': '1759072_s_at',
   'gene

In [53]:
from jsonpath_ng.ext import parse

# Probe entries under GPL3154 whose gene_symbol list contains "yfgF".
[
    hit.value
    for hit in parse('$.GPL3154[?(@.gene_symbol[*]="yfgF")]').find(p)
]

[{'id': '1759071_s_at',
  'gene_symbol': ['yfgF', 'Z3766', 'yfgF', 'ECs3365'],
  'orf': 'b2503',
  'location': [{'_id': 'NC_004431_1',
    '_start': '2883659',
    '_end': '2885902',
    '_strand': '-'},
   {'_id': 'NC_002655_2',
    '_start': '3412673',
    '_end': '3414916',
    '_strand': '-'},
   {'_id': 'NC_000913_3',
    '_start': '2626695',
    '_end': '2628938',
    '_strand': '-'},
   {'_id': 'NC_002695_1',
    '_start': '3342837',
    '_end': '3345080',
    '_strand': '-'}]}]

In [48]:
# Probe entries under GPL3154 with at least one location on NC_000913_3.
[
    hit.value
    for hit in parse('$.GPL3154[?@.location[*]._id="NC_000913_3"]').find(p)
]

[{'id': '1759069_at',
  'gene_symbol': ['mrr'],
  'orf': 'b4351',
  'location': [{'_id': 'NC_000913_3',
    '_start': '4586949',
    '_end': '4587863',
    '_strand': '+'}]},
 {'id': '1759071_s_at',
  'gene_symbol': ['yfgF', 'Z3766', 'yfgF', 'ECs3365'],
  'orf': 'b2503',
  'location': [{'_id': 'NC_004431_1',
    '_start': '2883659',
    '_end': '2885902',
    '_strand': '-'},
   {'_id': 'NC_002655_2',
    '_start': '3412673',
    '_end': '3414916',
    '_strand': '-'},
   {'_id': 'NC_000913_3',
    '_start': '2626695',
    '_end': '2628938',
    '_strand': '-'},
   {'_id': 'NC_002695_1',
    '_start': '3342837',
    '_end': '3345080',
    '_strand': '-'}]},
 {'id': '1759073_at',
  'gene_symbol': ['stfP'],
  'orf': 'b1154',
  'location': [{'_id': 'NC_000913_3',
    '_start': '1207501',
    '_end': '1208130',
    '_strand': '+'}]},
 {'id': '1759074_at',
  'gene_symbol': ['yfbN'],
  'orf': 'b2273',
  'location': [{'_id': 'NC_000913_3',
    '_start': '2387710',
    '_end': '2388426',
    '