In [5]:
import pandas as pd
import json

In [6]:
# Read the scraped dump and keep only its list of records. The 'utf-8-sig'
# codec transparently drops a leading byte-order mark if the file has one.
with open('starling_complete_data.json', encoding='utf-8-sig') as source_file:
    data = json.load(source_file)['records']
print(len(data))

2211


In [7]:
# Language/dialect label -> three-letter prefix used when building row IDs.
#
# The insertion order of this mapping is significant: LANGUAGES (below) is
# derived from it, and extract_head_and_reflexes treats the first label in
# LANGUAGES that has a value in an entry as that entry's head.
LANGUAGE_ABBREV = {
    # Labels prefixed 'Proto-'. Note that both spellings of Proto-North
    # Dravidian are listed and both map to 'PND'.
    'Proto-Dravidian': 'PDR',
    'Proto-South Dravidian': 'PSD',
    'Proto-North Dravidian': 'PND',
    'Proto-North-Dravidian': 'PND',
    'Proto-Telugu': 'PTE',
    'Proto-Gondi-Kui': 'PGK',
    'Proto-Kolami-Gadba': 'PKG',
    'Proto-Nilgiri': 'PNI',
    'Proto-Kui-Kuwi': 'PKK',
    'Proto-Gondi': 'PGO',
    'Proto-Pengo-Manda': 'PPM',
    # Remaining labels, in priority order.
    'Tamil': 'TAM',
    'Malayalam': 'MAL',
    'Kannada': 'KAN',
    'Telugu': 'TEL',
    'Tulu': 'TUL',
    'Kodagu': 'KOD',
    'Kota': 'KOT',
    'Toda': 'TOD',
    'Konda': 'KON',
    'Parji': 'PAR',
    'Kui': 'KUI',
    'Kurukh': 'KUR',
    'Kolami': 'KOL',
    'Malto': 'MLT',
    'Naikri': 'NAI',
    'Pengo': 'PEN',
    'Brahui': 'BRA',
    'Koya Gondi': 'KGO',
    'Kuwi (Schulze)': 'KWS',
    'Kuwi (Fitzgerald)': 'KWF',
    'Kuwi (Israel)': 'KWI',
    'Muria Gondi': 'MGO',
    'Maria Gondi': 'MRG',
    'Betul Gondi': 'BGO',
    'Sunkarametta Kuwi': 'SKW',
    'Adilabad Gondi': 'AGO',
    'Salur Gadba': 'SGA',
    'Manda': 'MAN',
    'Ollari Gadba': 'OGA',
    'Naiki': 'NAK',
    'Mandla Gondi (Phailbus)': 'MGP',
    'Maria Gondi (Mitchell)': 'MGM',
    'Mandla Gondi (Williamson)': 'MGW',
    'Seoni Gondi': 'SGO',
    'Telugu (Krishnamurti)': 'TEK',
    'Kondekor Gadba': 'KGA',
    'Gommu Gondi': 'GMG',
    'Kinwat Kolami': 'KKO',
    'Kolami (Setumadhava Rao)': 'KSR',
    'Parja Kuwi': 'PKW',
    'Yeotmal Gondi': 'YGO',
    'Maria Gondi (Lind)': 'MGL',
    'Poya Gadba': 'PGA',
    'Chindwara Gondi': 'CGO',
    'Khuttia Kui': 'KKU',
    'Konda (Burrow/Bhattacharya)': 'KBB',
    'Maria Gondi (Smith)': 'MGS',
    'Kuwi (Mahanti)': 'KWM',
    'Tekriya Kuwi': 'TKW',
    'Inscriptional Telugu': 'ITE',
    'Durg Gondi': 'DGO',
    'Chanda Gondi': 'CHG',
    'Dongriya Kuwi': 'DKW',
    'Mandla Gondi': 'MDG',
    'Irula': 'IRU',
    'Merolu Telugu': 'MTE',
    'Kasaba': 'KAS',
}

# Ordered list of every known label. Derived from the mapping above (dicts
# preserve insertion order) so the two can never drift apart.
LANGUAGES = list(LANGUAGE_ABBREV)

# Keys checked, in this order, by extract_head_and_reflexes; the *name* of the
# first one present in an entry is stored in that row's 'Etymology' column.
ETYMOLOGY_FIELDS = [
    f'{branch} etymology'
    for branch in ('Dravidian', 'Gondwan', 'South Dravidian')
]

# Keys copied verbatim from an entry onto its head row whenever they are
# present. Both capitalisations of "Additional forms" are listed, and each
# becomes its own column.
METADATA_FIELDS = [
    'Additional forms',
    'Additional Forms',
    *(f'Dialectal forms ({number})' for number in range(1, 5)),
    'Miscellaneous',
    'Notes on correspondences',
    'Stems',
]


In [8]:
# Flat list of output rows (one dict per extracted form); filled in by
# extract_head_and_reflexes and turned into a DataFrame at the end of the cell.
rows = list()
# Running tallies that generate_id uses to number IDs; every known language
# starts at zero. Re-created here so re-running the cell restarts numbering.
language_counters = dict.fromkeys(LANGUAGES, 0)

def generate_id(language):
    """Return the next sequential ID for ``language``, e.g. ``'TAM001'``.

    The ID is the language's prefix from ``LANGUAGE_ABBREV`` (``'UNK'`` when
    the language has no entry there) followed by a running number that is
    zero-padded to at least three digits.

    Side effect: advances the matching tally in the module-level
    ``language_counters`` dict.

    Args:
        language: Language label, normally one of ``LANGUAGES``.

    Returns:
        str: The newly allocated ID.
    """
    abbrev = LANGUAGE_ABBREV.get(language, 'UNK')
    # Count per prefix rather than per language label. Two labels can share a
    # prefix ('Proto-North Dravidian' and 'Proto-North-Dravidian' are both
    # 'PND'); with per-label counters each would start at 001 and produce
    # colliding IDs. Using .get() also lets labels that are missing from the
    # counters fall through to the 'UNK' prefix instead of raising KeyError.
    language_counters[abbrev] = language_counters.get(abbrev, 0) + 1
    return f"{abbrev}{language_counters[abbrev]:03d}"

def _first_attested_form(record):
    """Return ``(language, form)`` for the first label in ``LANGUAGES`` that
    ``record`` holds a non-empty value for, or ``None`` if there is none."""
    return next(
        (
            (name, record[name])
            for name in LANGUAGES
            if name in record and record[name]
        ),
        None,
    )


def extract_head_and_reflexes(entry, parent_id=None, parent_word=None, parent_lang=None):
    """Flatten one entry (and, recursively, its sub-entries) into ``rows``.

    The head of the entry is the first language in ``LANGUAGES`` order that
    has a non-empty value; entries without one are ignored entirely, including
    their sub-entries. Rows are appended to the module-level ``rows`` list in
    this order:

    1. one row for the head, linked to the parent passed in by the caller;
    2. one row per other language present in the entry (a "reflex"), linked to
       the head, except forms that are themselves the head of a sub-entry;
    3. whatever each item of ``entry['_sub_entries']`` produces when processed
       recursively with this entry's head as its parent.

    IDs are obtained from ``generate_id`` in exactly that order.

    Args:
        entry: Mapping of field name to value for one record.
        parent_id: ID of the parent row, or None for a top-level record.
        parent_word: Head word of the parent row, or None.
        parent_lang: Language of the parent row, or None.

    Returns:
        None. Results are delivered by appending dicts to ``rows``.
    """
    head = _first_attested_form(entry)
    if head is None:
        return
    head_lang, head_word = head

    head_meaning = entry.get('Meaning', '')
    current_id = generate_id(head_lang)

    children = entry['_sub_entries'] if '_sub_entries' in entry else ()

    # (language, form) pairs that head a sub-entry; those forms get a full row
    # of their own during the recursion, so they must not also become reflexes.
    claimed_by_children = {
        child_head
        for child_head in map(_first_attested_form, children)
        if child_head is not None
    }

    # Values shared by the head row and every reflex row of this entry.
    ded_number = entry.get('Number in DED', '')
    source_url = entry.get('_url', '')
    depth = entry.get('_depth', 0)

    # Key insertion order is kept as-is: it decides the DataFrame column order.
    head_row = {
        'ID': current_id,
        'Headword': head_word,
        'Meaning': head_meaning,
        'Language': head_lang,
        'Parent Word ID': parent_id,
        'Parent Word': parent_word,
        'Parent Language': parent_lang,
        'Notes': entry.get('Notes', ''),
        'Number in DED': ded_number,
        'Number in CVOTGD': entry.get('Number in CVOTGD', ''),
        'URL': source_url,
        'Depth': depth,
    }

    # Only the name of the first etymology field present is recorded, not the
    # value stored under it.
    etymology = next((name for name in ETYMOLOGY_FIELDS if name in entry), '')
    if etymology:
        head_row['Etymology'] = etymology

    head_row.update(
        {name: entry[name] for name in METADATA_FIELDS if name in entry}
    )

    head_derivates_key = f'{head_lang} derivates'
    if head_derivates_key in entry:
        head_row['Derivates'] = entry[head_derivates_key]

    rows.append(head_row)

    # The head is the first attested language, so every language listed before
    # it is known to be absent; walking the whole list and skipping the head
    # therefore visits exactly the languages that follow it.
    for lang in LANGUAGES:
        if lang == head_lang or not (lang in entry and entry[lang]):
            continue

        form = entry[lang]
        if (lang, form) in claimed_by_children:
            continue

        reflex_row = {
            'ID': generate_id(lang),
            'Headword': form,
            # Fall back to the head's meaning when no language-specific
            # meaning field exists.
            'Meaning': entry.get(f'{lang} meaning', head_meaning),
            'Language': lang,
            'Parent Word ID': current_id,
            'Parent Word': head_word,
            'Parent Language': head_lang,
            'Notes': '',
            'Number in DED': ded_number,
            'Number in CVOTGD': '',
            'URL': source_url,
            'Depth': depth,
        }

        derivates_key = f'{lang} derivates'
        if derivates_key in entry:
            reflex_row['Derivates'] = entry[derivates_key]

        rows.append(reflex_row)

    for child in children:
        extract_head_and_reflexes(child, current_id, head_word, head_lang)
            
# Flatten every top-level record; each call appends its rows (and those of its
# sub-entries) to `rows`.
for record in data:
    extract_head_and_reflexes(record)

# One spreadsheet row per extracted form. The positional index is not written.
df = pd.DataFrame(rows)
df.to_excel(excel_writer='output.xlsx', engine='openpyxl', index=False)