## BGC training samples on S3:
`aws s3 ls s3://share.jgi-ga.org/satria/for_bgc_gpt/ --recursive`

In [1]:
import json
import pickle
import pandas as pd
import numpy as np

# Read the raw records from the JSON file (indexed below as a list of dicts,
# each with a 'seq' string and a 'swiss' mapping).
with open('../miBIG/S3/data-1.json', 'r') as json_file:
    json_data = json.load(json_file)

# Re-key every record under a new uid of the form 'BGC<first swiss key>-<index>'.
# The positional index suffix keeps uids distinct even if two records share
# the same original swiss key.
json_dict = {}
for idx, record in enumerate(json_data):
    # Spaces inside the sequence are replaced by the letter 'B'.
    # NOTE(review): presumably ' ' separates proteins and 'B' is the separator
    # token expected downstream — confirm.
    record['seq'] = record['seq'].replace(' ', 'B')
    # list(...)[0] raises IndexError when 'swiss' is empty.
    first_swiss_key = list(record['swiss'])[0]
    uid = 'BGC' + first_swiss_key + '-' + str(idx)
    # zip stops after one pair, so only the first swiss entry is kept, re-keyed by uid.
    record['swiss'] = dict(zip([uid], record['swiss'].values()))
    json_dict[uid] = record

In [2]:
# Load the space-separated, header-less vocabulary file.
# The first column is always read as text and pandas' default NA parsing is
# disabled, so literal tokens such as 'NA', 'NaN' or 'null' are kept as
# strings instead of becoming missing values, and numeric-looking tokens
# ('472', ...) are not inferred as integers.
vocab_df = pd.read_csv(
    'mapping_files/vocab.txt',
    sep=' ',
    header=None,
    dtype={0: str},
    keep_default_na=False,
)
# Assigning names after the read still fails loudly if the file does not
# have exactly two columns.
vocab_df.columns = ['key', 'value']
vocab_df

Unnamed: 0,key,value
0,472,0
1,812,1
2,1133,2
3,808,3
4,175,4
...,...,...
129402,W,129402
129403,X,129403
129404,Y,129404
129405,Z,129405


In [3]:
# Pair each record's keyword list with the uid it belongs to.
kw = [
    (record['swiss'][uid]['kw'], uid)
    for uid, record in json_dict.items()
]
# Distinct values of the first keyword of every record (np.unique returns them sorted).
kw_types = np.unique([kw_list[0] for kw_list, _ in kw])
kw_types

array(['GCF-244878', 'GCF-245373', 'GCF-245671', 'GCF-245936',
       'GCF-246515', 'GCF-247890'], dtype='<U10')

In [4]:
# For every keyword type, remember the uid of the first record carrying it.
kw_types_uid = {}
for kw_list, uid in kw:
    # setdefault only stores the uid the first time a type is seen.
    kw_types_uid.setdefault(kw_list[0], uid)

# Assign each keyword type (in kw_types order) one of the leading vocab keys:
# the i-th type is mapped to the i-th row of vocab_df['key'].
kw_map = {
    kw_type: vocab_key
    for kw_type, vocab_key in zip(kw_types, vocab_df['key'][:len(kw_types)])
}
kw_map

{'GCF-244878': '472',
 'GCF-245373': '812',
 'GCF-245671': '1133',
 'GCF-245936': '808',
 'GCF-246515': '175',
 'GCF-247890': '732'}

## Before replacement

In [5]:
# Print one example record per keyword type.
for example_uid in kw_types_uid.values():
    example_record = json_dict[example_uid]
    print(example_record)

{'swiss': {'BGC82598-0': {'ex': 1, 'kw': ['GCF-244878', 'RiPP/bacteriocin'], 'taxa': []}}, 'seq': 'MNGKRNIFTCISIIGIGLASFSSFSFAANVTDNSVQNSIPVVNQQVAAAKEMKPFPQQVNYAGVIKPTHVTQESLNASVRSYYDNWKKKYLKNDLSSLPGGYYVKGEITGDADGFKPLGTSEGQGYGMIITVLMAGYDSNAQKIYDGLFKTARTFKSSQNPNLMGWVVADSKKAQGHFDSATDGDLDIAYSLLLAHKQWGSNGTVNYLKEAQDMITKGIKASNVTNNSRLNLGDWDSKNSLDTRPSDWMMSHLRAFYEFTGDKTWLTVINNLYDVYTQFSNKYSPNTGLISDFVVKNPPQPAPKDFLEESEYTNAYYYNASRVPLRIVMDYAMYGEKRSKVISDKVSSWIQNKTNGNPSKIVDGYQLNGSNIGSYSTAVFVSPFIAASITSSNNQKWVNSGWDWMKNKRESYFSDSYNLLTMLFITGNWWKPVPDDKKIQNQINDAIYEGYDNBMEKVLFFGDPGIDDSFAIMYGLLHPEIEIVGIVTGYGNVEHIHAAHNAAYILQLANRQ'}
{'swiss': {'BGC248950-282': {'ex': 1, 'kw': ['GCF-245936', 'Other/butyrolactone'], 'taxa': []}}, 'seq': 'MEILKPQPTSKAPSDWFTGDVWWDVIYAGQEPSRMRANMVRFAPCARTDWHSHALGQTLHIVSGTALVQARGGEIVEVHPGETVYTPAGEEHWHGAAPDRFMSHLALWEGPGDGGTETTWGDKVTDEEYGGPRARRRBMTDYDDLDMFGGLDASSLPLRQQQILMTIRDLVAANGCTPSTRQIGDAVGLRSTSTVSKHLKSLEEKGFLRRGAAMARQLDVRPFLVGAKEQSSSNTVTVPVVGDIAAGAPILAEEHADEVLALPRELVGSGTVFGLRVRGES

## Replacing `kw` and `taxa`

In [6]:
# Taxonomy id written to every record.
# homo sapiens taxonomy id from NCBI: https://www.ncbi.nlm.nih.gov/taxonomy
HOMO_SAPIENS_TAXID = 9606

# Iterate over the (original keyword list, uid) pairs collected in `kw`
# instead of reading the current 'kw' value back from json_dict. `kw` still
# references the original keyword lists, so re-running this cell produces the
# same result rather than failing with a KeyError on already-mapped tokens.
for original_kw, uid in kw:
    entry = json_dict[uid]['swiss'][uid]
    entry['taxa'] = [HOMO_SAPIENS_TAXID]
    # Only the first keyword is kept, replaced by its mapped vocab key.
    entry['kw'] = [kw_map[original_kw[0]]]

## After replacement

In [7]:
# Print one example record per keyword type.
for example_uid in kw_types_uid.values():
    example_record = json_dict[example_uid]
    print(example_record)

{'swiss': {'BGC82598-0': {'ex': 1, 'kw': ['472'], 'taxa': [9606]}}, 'seq': 'MNGKRNIFTCISIIGIGLASFSSFSFAANVTDNSVQNSIPVVNQQVAAAKEMKPFPQQVNYAGVIKPTHVTQESLNASVRSYYDNWKKKYLKNDLSSLPGGYYVKGEITGDADGFKPLGTSEGQGYGMIITVLMAGYDSNAQKIYDGLFKTARTFKSSQNPNLMGWVVADSKKAQGHFDSATDGDLDIAYSLLLAHKQWGSNGTVNYLKEAQDMITKGIKASNVTNNSRLNLGDWDSKNSLDTRPSDWMMSHLRAFYEFTGDKTWLTVINNLYDVYTQFSNKYSPNTGLISDFVVKNPPQPAPKDFLEESEYTNAYYYNASRVPLRIVMDYAMYGEKRSKVISDKVSSWIQNKTNGNPSKIVDGYQLNGSNIGSYSTAVFVSPFIAASITSSNNQKWVNSGWDWMKNKRESYFSDSYNLLTMLFITGNWWKPVPDDKKIQNQINDAIYEGYDNBMEKVLFFGDPGIDDSFAIMYGLLHPEIEIVGIVTGYGNVEHIHAAHNAAYILQLANRQ'}
{'swiss': {'BGC248950-282': {'ex': 1, 'kw': ['808'], 'taxa': [9606]}}, 'seq': 'MEILKPQPTSKAPSDWFTGDVWWDVIYAGQEPSRMRANMVRFAPCARTDWHSHALGQTLHIVSGTALVQARGGEIVEVHPGETVYTPAGEEHWHGAAPDRFMSHLALWEGPGDGGTETTWGDKVTDEEYGGPRARRRBMTDYDDLDMFGGLDASSLPLRQQQILMTIRDLVAANGCTPSTRQIGDAVGLRSTSTVSKHLKSLEEKGFLRRGAAMARQLDVRPFLVGAKEQSSSNTVTVPVVGDIAAGAPILAEEHADEVLALPRELVGSGTVFGLRVRGESMVDAAICDGDVVVVRRQDEAHSGEIVAAMIDGEATVKVLRRRDGHVYLE

In [8]:
# Serialize the uid-keyed record dictionary to a pickle file.
pickle_path = '../miBIG/S3_pickle/data-1.pickle'
with open(pickle_path, mode='wb') as out_handle:
    pickle.dump(json_dict, out_handle)