Setup and Imports

In [166]:
import json
import pandas as pd
from pathlib import Path
from collections import Counter, defaultdict
from typing import Dict, List, Set, Any
import re
from pprint import pprint

# Configure how pandas renders DataFrames: no column limit, at most 100 rows,
# and at most 100 characters per cell
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100


In [167]:
# Path of the scraped dump, relative to the notebook's working directory
JSON_FILE = 'starling_complete_data.json'

print(f"Loading: {JSON_FILE}")

# Read the whole file as UTF-8 text and parse it in one go
with open(JSON_FILE, encoding='utf-8') as f:
    data = json.loads(f.read())

print(f"\nTop-level keys: {list(data.keys())}")

Loading: starling_complete_data.json

Top-level keys: ['metadata', 'records']


In [168]:
# Banner, title, banner - one per line
print("=" * 70, "METADATA", "=" * 70, sep="\n")

# Show each metadata item with the key padded by dots to 30 characters
metadata = data.get('metadata', {})
for key, value in metadata.items():
    print(f"{key:.<30} {value}")

# `records` is the list every later cell works on
records = data.get('records', [])
print(f"\nTotal records in JSON: {len(records)}")

METADATA
start_url..................... https://starlingdb.org/cgi-bin/response.cgi?root=config&basename=\data\drav\dravet
total_pages_scraped........... 111
total_records................. 2211
unique_entries................ 11303

Total records in JSON: 2211


In [169]:
def analyze_all_keys(records: List[Dict]) -> Dict:
    """
    Walk every record, including anything nested inside it, and tally the keys seen.

    Returns a dict with:
      - 'all_keys': Counter mapping each key to the number of times it appears
        as a dict key anywhere in the data
      - 'metadata_keys': sorted list of the keys that start with an underscore
      - 'language_keys': sorted list of all other keys except the generic
        'Meaning', 'Number in DED' and 'Notes' fields
    """
    key_counts = Counter()
    underscore_keys = set()
    other_keys = set()
    generic_fields = ('Meaning', 'Number in DED', 'Notes')

    def walk(node):
        """Visit dicts and lists depth-first, in document order."""
        if isinstance(node, dict):
            for name, child in node.items():
                key_counts[name] += 1

                # Sort the key into its bucket
                if name.startswith('_'):
                    underscore_keys.add(name)
                elif name not in generic_fields:
                    other_keys.add(name)

                # Descend right away so nested keys are counted in document order
                if isinstance(child, (dict, list)):
                    walk(child)
            return

        if isinstance(node, list):
            for element in node:
                walk(element)

    for record in records:
        walk(record)

    return {
        'all_keys': key_counts,
        'metadata_keys': sorted(underscore_keys),
        'language_keys': sorted(other_keys)
    }

# Tally every key across all loaded records; key_analysis["all_keys"] is the
# Counter consumed by the language-structure analysis further down
key_analysis = analyze_all_keys(records)

## Exporting the JSON data to separate per-language CSV files

Identify Base Language Names and Their Variants

In [170]:
from typing import Dict, List, DefaultDict, TypedDict
from collections import defaultdict, Counter


class LanguageInfo(TypedDict):
    """Per-language summary built from the dataset's key counts."""
    # Whether a "<language> meaning" key exists for this language
    has_meaning: bool
    # Whether a "<language> derivates" key exists for this language
    has_derivates: bool
    # Whether the language has an associated etymology field
    has_etymology: bool
    # Number of occurrences of the bare language key
    count: int


class DisplayLanguageInfo(LanguageInfo):
    """LanguageInfo for a display name, plus the raw spellings merged into it."""
    # Original language names (as found in the data) folded into this entry
    variants: List[str]


# --- Step 1: Extract structure from all_keys ---

def extract_language_structure(all_keys: Counter[str]) -> Dict[str, LanguageInfo]:
    """
    Identify base languages and their associated fields (meaning, derivates, etymology).

    Args:
        all_keys: Counter mapping every key found in the dataset to its
            number of occurrences.

    Returns:
        Dict keyed by base language name. For each language:
          - 'count' is the occurrence count of the bare language key,
          - 'has_meaning' / 'has_derivates' tell whether a
            "<language> meaning" / "<language> derivates" key exists,
          - 'has_etymology' tells whether a "<language> etymology" key exists.

    A "<name> etymology" key never creates a language entry on its own: it
    only flags a language that is already known through its bare key or its
    meaning/derivates keys. Names that appear solely as etymology references
    therefore stay out of the result.
    """
    excluded_keys = {
        '_page', '_record_num', '_sub_entries', '_url', '_depth', '_content_hash', '_error',
        'Meaning', 'Notes', 'Number in DED', 'Number in CVOTGD',
        'Miscellaneous', 'Notes on correspondences', 'Stems',
        'Dialectal forms', 'Dialectal forms (1)', 'Dialectal forms (2)',
        'Dialectal forms (3)', 'Dialectal forms (4)',
        "Additional forms", "Additional Forms"
    }

    language_info: DefaultDict[str, LanguageInfo] = defaultdict(lambda: {
        'has_meaning': False,
        'has_derivates': False,
        'has_etymology': False,
        'count': 0
    })

    # Collected first and applied after the loop, so the result does not
    # depend on the order in which keys appear in all_keys
    etymology_bases = set()

    for key, count in all_keys.items():
        if key in excluded_keys:
            continue

        if key.endswith(' etymology'):
            etymology_bases.add(key.removesuffix(' etymology'))
            continue

        if key.endswith(' meaning'):
            base_lang = key.removesuffix(' meaning')
            language_info[base_lang]['has_meaning'] = True
        elif key.endswith(' derivates'):
            base_lang = key.removesuffix(' derivates')
            language_info[base_lang]['has_derivates'] = True
        else:
            language_info[key]['count'] = count

    # Previously this flag was never set, although it is part of LanguageInfo.
    # The membership test does not trigger the defaultdict factory, so no
    # etymology-only entries are created.
    for base_lang in etymology_bases:
        if base_lang in language_info:
            language_info[base_lang]['has_etymology'] = True

    return dict(language_info)


# --- Step 2: Normalize for display (merge variants, fix naming inconsistencies) ---

def normalize_for_display(language_structure: Dict[str, LanguageInfo]) -> Dict[str, DisplayLanguageInfo]:
    """
    Group language names that differ only by dashes vs. spaces under one display name.

    Counts are summed, the boolean flags are OR-ed together, and every raw
    name that was folded into a display name is kept in its 'variants' list.
    """
    merged: Dict[str, DisplayLanguageInfo] = {}
    flag_names = ('has_meaning', 'has_derivates', 'has_etymology')

    for raw_name, raw_info in language_structure.items():
        # Display name: dashes become spaces, surrounding whitespace removed
        display_name = raw_name.replace('-', ' ').strip()

        bucket = merged.setdefault(display_name, {
            'has_meaning': False,
            'has_derivates': False,
            'has_etymology': False,
            'count': 0,
            'variants': []
        })

        bucket['count'] += raw_info['count']
        for flag in flag_names:
            bucket[flag] = bucket[flag] or raw_info[flag]
        bucket['variants'].append(raw_name)

    return merged


# --- Step 3: Run analysis and print summary ---

# Example usage:
# Build the raw per-language structure, then merge dash/space spellings
language_structure = extract_language_structure(key_analysis["all_keys"])
language_structure_display = normalize_for_display(language_structure)

print("=" * 70)
print("LANGUAGE STRUCTURE ANALYSIS")
print("=" * 70)
print(f"\nTotal base languages identified: {len(language_structure_display)}\n")

# Most frequent languages first (stable sort, ties keep insertion order)
sorted_languages = list(language_structure_display.items())
sorted_languages.sort(key=lambda x: x[1]["count"], reverse=True)

print("Language Name                                   | Count  |")
print("-" * 60)
for lang, info in sorted_languages:
    # Mark display names that absorbed more than one raw spelling
    if len(info["variants"]) > 1:
        merged_indicator = " [merged]"
    else:
        merged_indicator = ""
    print(f"{lang:<47} | {info['count']:>6,}{merged_indicator}")


LANGUAGE STRUCTURE ANALYSIS

Total base languages identified: 67

Language Name                                   | Count  |
------------------------------------------------------------
Proto South Dravidian                           |  3,563
Proto Telugu                                    |  3,011
Proto Dravidian                                 |  2,211
Proto Gondi Kui                                 |  2,154
Proto Kolami Gadba                              |  2,026
Proto Nilgiri                                   |  1,792
Proto Kui Kuwi                                  |  1,426
Tamil                                           |  1,425
Proto Gondi                                     |  1,414
Kannada                                         |  1,407
Telugu                                          |  1,339
Konda                                           |  1,266
Malayalam                                       |  1,207
Tulu                                            |  1,174
Proto North Drav

Define Common Fields and Metadata

In [171]:
# Record-level fields copied into every exported entry
COMMON_FIELDS = [
    'Meaning', 'Notes',
    'Number in DED', 'Number in CVOTGD',
    'Miscellaneous', 'Notes on correspondences', 'Stems',
    # Both capitalisations are listed
    'Additional forms', 'Additional Forms',
]

# Dialectal forms: the plain field plus its numbered variants (1) to (4)
DIALECTAL_FIELDS = ['Dialectal forms'] + [
    f'Dialectal forms ({number})' for number in range(1, 5)
]

# Underscore-prefixed bookkeeping fields, copied into entries unchanged
METADATA_FIELDS = ['_depth', '_error', '_page', '_record_num', '_url']

print("Common fields to extract:", COMMON_FIELDS)
print("\nDialectal fields to extract:", DIALECTAL_FIELDS)
print("\nMetadata fields to extract:", METADATA_FIELDS)

Common fields to extract: ['Meaning', 'Notes', 'Number in DED', 'Number in CVOTGD', 'Miscellaneous', 'Notes on correspondences', 'Stems', 'Additional forms', 'Additional Forms']

Dialectal fields to extract: ['Dialectal forms', 'Dialectal forms (1)', 'Dialectal forms (2)', 'Dialectal forms (3)', 'Dialectal forms (4)']

Metadata fields to extract: ['_depth', '_error', '_page', '_record_num', '_url']


Function to Flatten Records for a Specific Language

In [172]:

def get_language_variants(language_name: str, language_structure_display: Dict = None) -> List[str]:
    """
    List the key spellings under which `language_name` may appear in a record.

    If the display structure is given and contains the language, its stored
    'variants' list is returned as-is. Otherwise the result is the name
    itself, followed by its space->dash and dash->space alternates where
    the name contains the respective character.
    """
    known_languages = language_structure_display or {}
    if language_name in known_languages:
        return known_languages[language_name]['variants']

    # Fallback: derive the alternate spellings from the name itself
    candidates = [language_name]
    for separator, replacement in ((' ', '-'), ('-', ' ')):
        if separator in language_name:
            candidates.append(language_name.replace(separator, replacement))
    return candidates
    
# Cell: Fixed Extract Function - Handle Non-String Values
def extract_language_entries(records: List[Dict], language_name: str, language_structure_display: Dict = None) -> List[Dict]:
    """
    Extract all entries for a language, checking variant names if needed.

    Records are walked recursively through their '_sub_entries'. A record
    yields an entry when one of the language's variant keys holds a
    non-empty string; records flagged '_circular_reference' are skipped
    together with their sub-entries.

    Each entry contains:
      - 'Word': the stripped word form.
      - 'Meaning': the first non-empty of "<variant> meaning", the record's
        own 'Meaning', and the meaning inherited from the nearest ancestor.
      - 'Derivates': the first non-empty "<variant> derivates" value.
      - 'Etymology': the first non-empty "<variant> etymology" value, else
        the first non-empty value of ANY key ending in ' etymology'.
      - every other field of COMMON_FIELDS and DIALECTAL_FIELDS as text.
      - every field of METADATA_FIELDS, unchanged (may be int).

    Args:
        records: Top-level record dicts.
        language_name: Display name of the language to extract.
        language_structure_display: Optional display structure, passed to
            get_language_variants to look up the stored variant spellings.

    Returns:
        List of flat entry dicts, in traversal order.
    """
    variant_names = get_language_variants(language_name, language_structure_display)
    entries = []

    def as_text(value) -> str:
        """Strip strings, stringify other truthy values, map empty/None to ''."""
        if isinstance(value, str):
            return value.strip()
        return str(value) if value else ""

    def first_variant_text(record: Dict, suffix: str) -> str:
        """Return the first non-empty '<variant><suffix>' value as text."""
        for variant in variant_names:
            text = as_text(record.get(f"{variant}{suffix}", ""))
            if text:
                return text
        return ""

    def process_record(record: Dict, parent_meaning: str = ""):
        if record.get('_circular_reference'):
            return

        # Meaning visible at this level: the record's own, else the inherited one.
        # as_text keeps a null 'Meaning' from turning into the string "None".
        own_meaning = as_text(record.get("Meaning", ""))
        inherited_meaning = own_meaning or parent_meaning

        # Only string values count as a word form
        word = ""
        for variant in variant_names:
            candidate = record.get(variant, "")
            if isinstance(candidate, str) and candidate.strip():
                word = candidate.strip()
                break

        if word:
            entry = {'Word': word}
            entry['Meaning'] = first_variant_text(record, " meaning") or inherited_meaning
            entry['Derivates'] = first_variant_text(record, " derivates")

            # Etymology: language-specific first, then ANY '... etymology' field
            etymology = first_variant_text(record, " etymology")
            if not etymology:
                for key, value in record.items():
                    if key.endswith(' etymology'):
                        etymology = as_text(value)
                        if etymology:
                            break
            entry['Etymology'] = etymology

            # Common and dialectal fields as text. Fields already present are
            # skipped: COMMON_FIELDS contains 'Meaning', and copying it here
            # would overwrite the resolved meaning with the raw general one.
            for field in (*COMMON_FIELDS, *DIALECTAL_FIELDS):
                if field in entry:
                    continue
                entry[field] = as_text(record.get(field, ""))

            # Metadata - keep as-is (might be int)
            for field in METADATA_FIELDS:
                entry[field] = record.get(field, "")

            entries.append(entry)

        # Recurse; tolerate a missing or null '_sub_entries'
        sub_entries = record.get('_sub_entries')
        if isinstance(sub_entries, list):
            for sub_entry in sub_entries:
                if isinstance(sub_entry, dict):
                    process_record(sub_entry, inherited_meaning)

    for record in records:
        process_record(record)

    return entries


Get List of All Languages to Export

In [173]:
# Export only languages whose bare key actually occurs in the data (count > 0),
# in alphabetical order
languages_to_export = sorted(
    lang_name
    for lang_name, info in language_structure_display.items()
    if info['count'] > 0
)

print("=" * 70)
print(f"LANGUAGES TO EXPORT: {len(languages_to_export)}")
print("=" * 70)

for i, lang in enumerate(languages_to_export, 1):
    # Counts and flags come from the merged display structure
    info = language_structure_display[lang]
    count = info['count']

    # One letter per available field type, "-" when it is absent
    has_meaning, has_derivates, has_etymology = (
        letter if info[flag] else "-"
        for letter, flag in (("M", 'has_meaning'), ("D", 'has_derivates'), ("E", 'has_etymology'))
    )

    if len(info['variants']) > 1:
        merged_indicator = " [merged]"
    else:
        merged_indicator = ""
    print(f"{i:>3}. {lang:<45} [{has_meaning}{has_derivates}{has_etymology}] {count:>6,} entries{merged_indicator}")

LANGUAGES TO EXPORT: 67
  1. Adilabad Gondi                                [---]    305 entries
  2. Betul Gondi                                   [---]    343 entries
  3. Brahui                                        [---]    371 entries
  4. Chanda Gondi                                  [---]     36 entries
  5. Chindwara Gondi                               [---]     90 entries
  6. Dongriya Kuwi                                 [---]     32 entries
  7. Durg Gondi                                    [---]     38 entries
  8. Gommu Gondi                                   [---]    160 entries
  9. Inscriptional Telugu                          [---]     45 entries
 10. Irula                                         [MD-]     12 entries
 11. Kannada                                       [MD-]  1,407 entries
 12. Kasaba                                        [MD-]      2 entries
 13. Khuttia Kui                                   [---]     86 entries
 14. Kinwat Kolami                      

Export Function

In [174]:
def export_language_to_csv(records: List[Dict],
                          language_name: str,
                          output_dir: Path,
                          language_structure_display: Dict = None) -> Dict:
    """
    Write all entries of one language to a CSV file inside `output_dir`.

    The entries come from extract_language_entries. Columns are ordered as
    'Word', 'Meaning', 'Derivates', 'Etymology', followed by the remaining
    columns in sorted order. The file name is the language name with every
    character outside [a-zA-Z0-9_()-] replaced by '_' (runs of '_' collapsed)
    plus '.csv'. The file is written as UTF-8 with a BOM ('utf-8-sig').

    Args:
        records: Top-level record dicts.
        language_name: Display name of the language to export.
        output_dir: Target directory (Path or str); created if it is missing.
        language_structure_display: Optional display structure used to
            resolve variant spellings.

    Returns:
        Summary dict with the keys 'language', 'filename', 'entries',
        'unique_words', 'with_meaning' and 'status'. 'status' is 'success',
        or 'no_entries' when nothing was found; in that case no file is
        written, 'filename' is None and the counts are 0.
    """
    entries = extract_language_entries(records, language_name, language_structure_display)

    if not entries:
        # Same key set as the success case so the summary table keeps one schema
        return {
            'language': language_name,
            'filename': None,
            'entries': 0,
            'unique_words': 0,
            'with_meaning': 0,
            'status': 'no_entries'
        }

    df = pd.DataFrame(entries)

    # Reorder columns
    priority_cols = ['Word', 'Meaning', 'Derivates', 'Etymology']
    other_cols = sorted(col for col in df.columns if col not in priority_cols)
    df = df[priority_cols + other_cols]

    # Filename
    safe_name = re.sub(r'[^a-zA-Z0-9_()-]', '_', language_name)
    safe_name = re.sub(r'_+', '_', safe_name)
    filename = f"{safe_name}.csv"

    # Accept str paths and do not rely on the caller having created the directory
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    filepath = output_dir / filename

    df.to_csv(filepath, index=False, encoding='utf-8-sig')

    return {
        'language': language_name,
        'filename': filename,
        'entries': len(entries),
        'unique_words': int(df['Word'].nunique()),
        # Plain int instead of a numpy scalar
        'with_meaning': int((df['Meaning'] != '').sum()),
        'status': 'success'
    }


Full Export

In [175]:
# Export every language in languages_to_export to its own CSV file, then
# write a summary table of the results next to them

output_dir = Path('dravidlex_csv_output')
output_dir.mkdir(parents=True, exist_ok=True)

print("=" * 70)
print("FULL EXPORT - ALL LANGUAGES")
print("=" * 70)
print(f"Exporting {len(languages_to_export)} languages...\n")

export_results = []

for i, lang in enumerate(languages_to_export, 1):
    # Progress prefix and result share one line
    print(f"[{i}/{len(languages_to_export)}] Exporting {lang}...", end=' ')

    result = export_language_to_csv(records, lang, output_dir, language_structure_display)
    export_results.append(result)

    if result['status'] == 'success':
        print(f"OK ({result['entries']} entries)")
    else:
        print("SKIP (no entries)")

# Create summary, largest languages first
summary_df = pd.DataFrame(export_results)
summary_df = summary_df.sort_values('entries', ascending=False)
summary_path = output_dir / '_EXPORT_SUMMARY.csv'
summary_df.to_csv(summary_path, index=False, encoding='utf-8-sig')

print("\n" + "=" * 70)
print("EXPORT COMPLETE")
print("=" * 70)
print(f"Total files created: {(summary_df['status'] == 'success').sum()}")
print(f"Total entries exported: {summary_df['entries'].sum():,}")
print(f"Summary saved to: {summary_path}")

FULL EXPORT - ALL LANGUAGES
Exporting 67 languages...

[1/67] Exporting Adilabad Gondi... OK (305 entries)
[2/67] Exporting Betul Gondi... OK (343 entries)
[3/67] Exporting Brahui... OK (371 entries)
[4/67] Exporting Chanda Gondi... OK (36 entries)
[5/67] Exporting Chindwara Gondi... OK (90 entries)
[6/67] Exporting Dongriya Kuwi... OK (32 entries)
[7/67] Exporting Durg Gondi... OK (38 entries)
[8/67] Exporting Gommu Gondi... OK (160 entries)
[9/67] Exporting Inscriptional Telugu... OK (45 entries)
[10/67] Exporting Irula... OK (12 entries)
[11/67] Exporting Kannada... OK (1407 entries)
[12/67] Exporting Kasaba... OK (2 entries)
[13/67] Exporting Khuttia Kui... OK (86 entries)
[14/67] Exporting Kinwat Kolami... OK (159 entries)
[15/67] Exporting Kodagu... OK (660 entries)
[16/67] Exporting Kolami... OK (417 entries)
[17/67] Exporting Kolami (Setumadhava Rao)... OK (146 entries)
[18/67] Exporting Konda... OK (1266 entries)
[19/67] Exporting Konda (Burrow/Bhattacharya)... OK (67 entries)