In [39]:
import pandas as pd
import json

In [40]:
# Read the exported JSON and keep only what is stored under its 'records' key.
# The 'utf-8-sig' codec skips a leading byte-order mark if the file has one.
with open('starling_complete_data.json', 'r', encoding='utf-8-sig') as f:
    data = json.load(f)['records']
print(len(data))

2211


In [41]:
# Language name -> three-letter prefix used when building row IDs.
# The insertion order of this dict is significant: LANGUAGES is derived from
# it below, and the first language in that order with a form in an entry is
# treated as the entry's head.
LANGUAGE_ABBREV = {
    # Reconstructed proto-languages.
    'Proto-Dravidian': 'PDR',
    'Proto-South Dravidian': 'PSD',
    # Both spellings of Proto-North Dravidian are listed and share one prefix.
    'Proto-North Dravidian': 'PND',
    'Proto-North-Dravidian': 'PND',
    'Proto-Telugu': 'PTE',
    'Proto-Gondi-Kui': 'PGK',
    'Proto-Kolami-Gadba': 'PKG',
    'Proto-Nilgiri': 'PNI',
    'Proto-Kui-Kuwi': 'PKK',
    'Proto-Gondi': 'PGO',
    'Proto-Pengo-Manda': 'PPM',
    # Individual languages and varieties.
    'Tamil': 'TAM',
    'Malayalam': 'MAL',
    'Kannada': 'KAN',
    'Telugu': 'TEL',
    'Tulu': 'TUL',
    'Kodagu': 'KOD',
    'Kota': 'KOT',
    'Toda': 'TOD',
    'Konda': 'KON',
    'Parji': 'PAR',
    'Kui': 'KUI',
    'Kurukh': 'KUR',
    'Kolami': 'KOL',
    'Malto': 'MLT',
    'Naikri': 'NAI',
    'Pengo': 'PEN',
    'Brahui': 'BRA',
    'Koya Gondi': 'KGO',
    'Kuwi (Schulze)': 'KWS',
    'Kuwi (Fitzgerald)': 'KWF',
    'Kuwi (Israel)': 'KWI',
    'Muria Gondi': 'MGO',
    'Maria Gondi': 'MRG',
    'Betul Gondi': 'BGO',
    'Sunkarametta Kuwi': 'SKW',
    'Adilabad Gondi': 'AGO',
    'Salur Gadba': 'SGA',
    'Manda': 'MAN',
    'Ollari Gadba': 'OGA',
    'Naiki': 'NAK',
    'Mandla Gondi (Phailbus)': 'MGP',
    'Maria Gondi (Mitchell)': 'MGM',
    'Mandla Gondi (Williamson)': 'MGW',
    'Seoni Gondi': 'SGO',
    'Telugu (Krishnamurti)': 'TEK',
    'Kondekor Gadba': 'KGA',
    'Gommu Gondi': 'GMG',
    'Kinwat Kolami': 'KKO',
    'Kolami (Setumadhava Rao)': 'KSR',
    'Parja Kuwi': 'PKW',
    'Yeotmal Gondi': 'YGO',
    'Maria Gondi (Lind)': 'MGL',
    'Poya Gadba': 'PGA',
    'Chindwara Gondi': 'CGO',
    'Khuttia Kui': 'KKU',
    'Konda (Burrow/Bhattacharya)': 'KBB',
    'Maria Gondi (Smith)': 'MGS',
    'Kuwi (Mahanti)': 'KWM',
    'Tekriya Kuwi': 'TKW',
    'Inscriptional Telugu': 'ITE',
    'Durg Gondi': 'DGO',
    'Chanda Gondi': 'CHG',
    'Dongriya Kuwi': 'DKW',
    'Mandla Gondi': 'MDG',
    'Irula': 'IRU',
    'Merolu Telugu': 'MTE',
    'Kasaba': 'KAS',
}

# Ordered list of every language name; same names, same order as the keys of
# LANGUAGE_ABBREV, so the two can no longer drift apart.
LANGUAGES = list(LANGUAGE_ABBREV)

# Entry keys checked, in this order, when filling a head row's 'Etymology'
# column; the name of the first key present in the entry is what gets stored.
ETYMOLOGY_FIELDS = [
    f'{branch} etymology'
    for branch in ('Dravidian', 'Gondwan', 'South Dravidian')
]

# Entry keys copied verbatim onto a head row as extra columns when present.
# Both capitalisations of "Additional forms" are listed, so each becomes its
# own column.
METADATA_FIELDS = (
    ['Additional forms', 'Additional Forms']
    + [f'Dialectal forms ({n})' for n in range(1, 5)]
    + ['Miscellaneous', 'Notes on correspondences', 'Stems']
)


In [43]:
# Output accumulator: extract_head_and_reflexes appends one dict per row here.
rows = list()
# Running counters that generate_id uses to number IDs, seeded with 0 for
# every known language. Re-running this cell resets both objects.
language_counters = dict.fromkeys(LANGUAGES, 0)

def generate_id(language):
    """Return the next sequential row ID for ``language``, e.g. ``'PDR001'``.

    The ID is the language's prefix from ``LANGUAGE_ABBREV`` (``'UNK'`` when
    the language has no entry there) followed by a running number zero-padded
    to at least three digits.

    The running number is tracked per *prefix*, not per language name, so that
    IDs stay unique:

    * 'Proto-North Dravidian' and 'Proto-North-Dravidian' both map to 'PND';
      counting per name made each spelling start at 'PND001'.
    * A language missing from ``language_counters`` used to raise ``KeyError``
      before the 'UNK' fallback could apply; all such languages now share the
      'UNK' sequence.

    Args:
        language: Language name as it appears in ``LANGUAGES``.

    Returns:
        The ID string, prefix plus counter.

    Side effects:
        Stores the new count in ``language_counters`` under the prefix. Entries
        keyed by language name that were seeded at start-up are left untouched.
    """
    abbrev = LANGUAGE_ABBREV.get(language, 'UNK')
    next_number = language_counters.get(abbrev, 0) + 1
    language_counters[abbrev] = next_number
    return f"{abbrev}{next_number:03d}"

def extract_head_and_reflexes(entry, parent_id=None, parent_word=None, parent_lang=None):
    """Flatten one etymology entry, and its sub-entries, into ``rows``.

    The *head* of an entry is its form in the first language of ``LANGUAGES``
    for which ``entry`` holds a non-empty value. One row is appended for the
    head, then one row for every later language in ``LANGUAGES`` that also has
    a non-empty value (a *reflex*, whose parent is the head). Finally every
    item of ``entry['_sub_entries']``, when that key exists, is processed
    recursively with this entry's head as its parent.

    Args:
        entry: Mapping for one record; it is read with ``in``, ``[]`` and
            ``.get``.
        parent_id: ID of the parent head row, or None for a top-level call.
        parent_word: Headword of the parent row, or None.
        parent_lang: Language of the parent row, or None.

    Returns:
        None. Rows are appended to the module-level ``rows`` list and IDs are
        drawn from ``generate_id``. When no language in ``LANGUAGES`` has a
        non-empty value, nothing is appended and sub-entries are not visited.
    """
    # Position of the first language that has a form in this entry.
    head_index = next(
        (pos for pos, name in enumerate(LANGUAGES) if name in entry and entry[name]),
        None,
    )
    if head_index is None:
        return

    head_lang = LANGUAGES[head_index]
    head_word = entry[head_lang]
    head_meaning = entry.get('Meaning', '')
    current_id = generate_id(head_lang)

    # Values shared by the head row and all of its reflex rows.
    ded_number = entry.get('Number in DED', '')
    source_url = entry.get('_url', '')
    depth = entry.get('_depth', 0)

    head_row = {
        'ID': current_id,
        'Headword': head_word,
        'Meaning': head_meaning,
        'Language': head_lang,
        'Parent Word ID': parent_id,
        'Parent Word': parent_word,
        'Parent Language': parent_lang,
        'Notes': entry.get('Notes', ''),
        'Number in DED': ded_number,
        'Number in CVOTGD': entry.get('Number in CVOTGD', ''),
        'URL': source_url,
        'Depth': depth,
    }

    # The column stores the *name* of the first etymology key found, not the
    # value held under it.
    etymology = next((name for name in ETYMOLOGY_FIELDS if name in entry), '')
    if etymology:
        head_row['Etymology'] = etymology

    # Optional columns are only added when the entry actually carries them.
    head_row.update({name: entry[name] for name in METADATA_FIELDS if name in entry})

    head_derivates_key = f'{head_lang} derivates'
    if head_derivates_key in entry:
        head_row['Derivates'] = entry[head_derivates_key]

    rows.append(head_row)

    # Reflexes: every language listed after the head that has a form here.
    for lang in LANGUAGES[head_index + 1:]:
        if lang == head_lang or lang not in entry or not entry[lang]:
            continue

        reflex_row = {
            'ID': generate_id(lang),
            'Headword': entry[lang],
            # A '<language> meaning' key wins; otherwise reuse the head's meaning.
            'Meaning': entry.get(f'{lang} meaning', head_meaning),
            'Language': lang,
            'Parent Word ID': current_id,
            'Parent Word': head_word,
            'Parent Language': head_lang,
            # Notes and the CVOTGD number are kept on the head row only.
            'Notes': '',
            'Number in DED': ded_number,
            'Number in CVOTGD': '',
            'URL': source_url,
            'Depth': depth,
        }

        reflex_derivates_key = f'{lang} derivates'
        if reflex_derivates_key in entry:
            reflex_row['Derivates'] = entry[reflex_derivates_key]

        rows.append(reflex_row)

    # NOTE(review): a language can show up both as a reflex row above and as
    # the head of a sub-entry below, giving two rows for what looks like the
    # same word -- confirm that this duplication is intended.
    for sub_entry in entry.get('_sub_entries', ()):
        extract_head_and_reflexes(sub_entry, current_id, head_word, head_lang)
            
# Full export over every record, currently disabled while the extraction is
# being checked on a single record.
# NOTE(review): as written, a record that has '_sub_entries' is never passed to
# extract_head_and_reflexes itself -- only its sub-entries are, and without any
# parent arguments. The function already recurses into '_sub_entries', so
# calling it once per record is probably what is wanted; confirm before
# re-enabling.
# for record in data:
#     if '_sub_entries' in record:
#         for sub_entry in record['_sub_entries']:
#             extract_head_and_reflexes(sub_entry)
#     else:
#         extract_head_and_reflexes(record)

# df = pd.DataFrame(rows)
# df.to_excel('output.xlsx', index=False, engine='openpyxl')
# Trial run: flatten only the second record (data[1]) and display the rows.
extract_head_and_reflexes(data[1])
df = pd.DataFrame(rows)
df

Unnamed: 0,ID,Headword,Meaning,Language,Parent Word ID,Parent Word,Parent Language,Notes,Number in DED,Number in CVOTGD,URL,Depth,Etymology,Additional forms
0,PDR001,*ac-,thorn; to cut,Proto-Dravidian,,,,Dubious (we are dealing essentially with a goo...,,,,0,,
1,PND001,*ac,thorn; to cut,Proto-North Dravidian,PDR001,*ac-,Proto-Dravidian,,,,,0,,
2,PGK001,*ac-,thorn; to cut,Proto-Gondi-Kui,PDR001,*ac-,Proto-Dravidian,,,,,0,,
3,PGK002,*ac-,to cut (hand; meat),Proto-Gondi-Kui,PDR001,*ac-,Proto-Dravidian,,,,https://starlingdb.org/cgi-bin/response.cgi?si...,1,Dravidian etymology,
4,PGO001,*ac- (pl. action*ask-),to cut (hand; meat),Proto-Gondi,PGK002,*ac-,Proto-Gondi-Kui,,,,https://starlingdb.org/cgi-bin/response.cgi?si...,1,,
5,PGO002,*ac- (pl. action*ask-),to cut,Proto-Gondi,PGK002,*ac-,Proto-Gondi-Kui,,46.0,17.0,https://starlingdb.org/cgi-bin/response.cgi?si...,2,Gondwan etymology,"Also Gondi_Traskānā, Gondi_Muask-to cut (meat)..."
6,MGO001,"acc- ""to split, saw""",to cut,Muria Gondi,PGO002,*ac- (pl. action*ask-),Proto-Gondi,,46.0,,https://starlingdb.org/cgi-bin/response.cgi?si...,2,,
7,MRG001,"ask- ""to cut (meat), carve""",to cut,Maria Gondi,PGO002,*ac- (pl. action*ask-),Proto-Gondi,,46.0,,https://starlingdb.org/cgi-bin/response.cgi?si...,2,,
8,BGO001,"achchānā ""to be cut (of one's foot on a stump,...",to cut,Betul Gondi,PGO002,*ac- (pl. action*ask-),Proto-Gondi,,46.0,,https://starlingdb.org/cgi-bin/response.cgi?si...,2,,
9,AGO001,"ask- ""to cut (meat), carve""",to cut,Adilabad Gondi,PGO002,*ac- (pl. action*ask-),Proto-Gondi,,46.0,,https://starlingdb.org/cgi-bin/response.cgi?si...,2,,
