# Process a language tab to yaml v. 2.0

This notebook collects language tabs from the SAPhon [Tupian Nasal Typology Input](https://docs.google.com/spreadsheets/d/1dvXFvLIV4y84CglgjAl-ZVb09IuGazs1SzFO_UJpmnI/edit#gid=1164878023) spreadsheet and creates version 2.0 yaml output.

In [None]:
import spreadsheet
import os, re, sys
import requests
from pathlib import Path
import pandas as pd
import yaml
import json

# Download folder of the current user (not referenced by the code visible in this notebook).
downloads = Path.home().joinpath('Downloads')

# Working directory for the cached per-tab `.tsv` files; created when missing.
langdir = Path('./newlangs/')
langdir.mkdir(parents=True, exist_ok=True)

# Directory that is searched for existing version 1 `.yaml` files.
yamldir = Path('../langs')

## Get Tupian input spreadsheet lang tabs

Collect the language tabs from the input spreadsheet into a dataframe, one row per lang tab.

In [None]:
# One record per lang tab, built from the `langsheets` mapping of the
# project-local `spreadsheet` module; the mapping keys become `tabname`.
ssdf = pd.DataFrame.from_records(list(spreadsheet.langsheets.values()))
ssdf['tabname'] = list(spreadsheet.langsheets.keys())
ssdf['yaml'] = ssdf['short'] + '.yaml'
# Keep only the rows flagged in `include`, then drop that column.
ssdf = ssdf[ssdf['include']].reset_index(drop=True).drop('include', axis='columns')
# Use `not` rather than `~`: bitwise inversion of a plain Python bool gives
# -1 or -2, both truthy, so the check could never fail should `.any()` ever
# hand back a built-in bool.
assert not ssdf['gid'].duplicated().any()
assert not ssdf['tabname'].duplicated().any()
ssdf

## Download `.tsv` files (optional)

The next cell optionally downloads all lang tabs from the spreadsheet. Set `do_download` to `True` and execute the cell to do this. For active work on a single lang tab this step is unnecessary and time-consuming.

In [None]:
do_download = False
if do_download:
    # `ssdf` already holds only the included tabs and no longer has an
    # `include` column, so iterate over it directly.
    for row in ssdf.itertuples():
        r = requests.get(f'{spreadsheet.puburl}/pub?gid={row.gid}&single=true&output=tsv')
        # Stop on an HTTP error instead of caching the error body as a lang tab.
        r.raise_for_status()
        r.encoding = 'utf-8'
        with open(langdir / f'{row.short}.tsv', 'w', encoding='utf-8') as out:
            # Replace Windows CRLF with Unix LF, as the single-tab download
            # further down in this notebook does for the same cache files.
            out.write(r.text.replace('\r\n', '\n'))

## Function definitions

Functions used to create version 2.0 yaml, work in progress.

In [None]:
name = 'lowering'
# Everything after the first hyphen; a name without a hyphen is kept whole.
procname = name.split('-', 1)[-1]
procname

In [None]:
# Matches a `lang` value: a `name` group (one or more characters other than
# `[`) followed by an optional `iso` group (a bracketed, non-empty span).
langre = re.compile(
    r'''
    (?P<name>[^\[]+)
    (?P<iso>\[[^\]]+\])?
    ''',
    flags=re.X,
)

def _clean(s):
    '''
    Clean string of extraneous markup.
    '''
    if s != None:
        s = s.strip().strip('{').strip('}').strip('[').strip(']').strip()
    return s

def alloprocs(allos, procs, phoneme):
    '''
    Return zipped allophones and processes extracted from allophone list.

    `allos` and `procs` are comma-separated strings; every item is passed
    through `_clean`. When the two lists differ in length, a `None` process
    is inserted at the position of the allophone equal to `phoneme`, and each
    remaining process is matched against `spreadsheet.procre` to check that
    its `phone` group names the paired allophone. Mismatches and unparsable
    processes are reported on stderr rather than raised.

    Returns an iterator of `(allophone, process)` pairs; `zip` stops at the
    shorter list if the lengths still differ.
    '''
    allolist = [_clean(a) for a in allos.split(',')]
    procitems = [_clean(p) for p in procs.split(',')]
    if len(procitems) != len(allolist):
        try:
            procitems.insert(allolist.index(phoneme), None)
        except ValueError:
            sys.stderr.write(f'Cannot find identity phone {phoneme} in allophone list {allolist} for proc list {procitems}\n')
        for allo, proc in zip(allolist, procitems):
            if proc is None:
                continue
            m = re.match(spreadsheet.procre, proc)
            if m is None:
                # Without a match object there is no groupdict() to report;
                # handle this case apart so the report itself cannot raise.
                sys.stderr.write(f"Problem parsing proc '{proc}': no match for spreadsheet.procre\n")
                continue
            try:
                phone = m.group('phone').strip('-')
            except (IndexError, AttributeError) as e:
                # IndexError: the pattern has no `phone` group;
                # AttributeError: the group did not take part in the match.
                sys.stderr.write(f"Problem parsing proc '{proc}': {e}\nGot {m.groupdict()}")
                continue
            # Plain comparison instead of `assert`, which is removed under -O.
            if phone != allo:
                sys.stderr.write(f'Proc phone {phone} does not match allophone {allo}\n')
    return zip(allolist, procitems)

def tsv2newyaml(tsvfile):
    '''
    Make a new YAML dict from a Tupian input spreadsheet tab.

    Reads `tsvfile` with `spreadsheet.read_lang`, runs the `spreadsheet.check_*`
    functions over the result and builds the version 2 synthesis doc from the
    `synthesis` section of the tab.

    Returns a `(sdoc, tsvlang, allophones)` tuple: the new synthesis dict, the
    value returned by `spreadsheet.read_lang`, and the allophones returned by
    `spreadsheet.check_allophones`.
    '''
    tsvlang = spreadsheet.read_lang(tsvfile)
    natclasses, flatnatclasses, catsymb = spreadsheet.check_natclasses(tsvlang)
    # Local name chosen so that the module-level `alloprocs` function is not shadowed.
    allophones, allophone_procs = spreadsheet.check_allophones(tsvlang, flatnatclasses)
    morph_id_map = spreadsheet.check_morpheme_ids(tsvlang)
    spreadsheet.check_procs(tsvlang, flatnatclasses, morph_id_map, catsymb, allophone_procs)
    # TODO: remainder should be per-doc (synthesis, ref)
    doc = tsvlang['synthesis']
    langparts = re.match(langre, doc['lang']).groupdict()
    # The `iso` group is None when the lang value has no bracketed code list.
    iso = langparts['iso']
    iso_codes = [] if iso is None else [_clean(c) for c in iso.split(',')]
    try:
        notes = doc['notes']
    except KeyError:
        notes = 'None'
    sdoc = {
        'doctype': 'synthesis',
        'name': _clean(langparts['name']),
#        'glottolog_name': v1['name'], # TODO: new, check by hand
#        'short_name': v1['short_name'],
#        'alternate_names': v1['alternate_names'],
        'iso_codes': iso_codes,
        'synthesis': doc['synthesis'],
        'natural_classes': [{'symbol': nc[0], 'members': nc[1:]} for nc in natclasses['synthesis']],
#        'glottolog_codes': [], # TODO: new, need to be added by hand
#        'family': v1['family'],
#        'countries': v1['countries'],
#        'coordinates': v1['coordinates'],
#!        'natural_classes': [], # TODO: new
#!        'morphemes': [], # TODO: new?
        'phonemes': phonlist(allophones['synthesis']),
        'processes': proclist(doc['processes']), # TODO: new?
#!        'triggers': [], # TODO: new?
#!        'transparent': [], # TODO: new?, include?
#!        'opaque': [], # TODO: new?, include?
         # TODO: following from v1 and not mentioned in new YAML draft
#!        'allophones': v1['allophones'],
#!        'nasal_harmony': v1['nasal_harmony'],
#!        'tone': v1['tone'],
#!        'laryngeal_harmony': v1['laryngeal_harmony'],
        'notes': notes
    }
    # Filter None values out of list values.
#!    listflds = (
#!        'alternate_names', 'iso_codes', 'glottolog_codes', 
#!        'countries', 'coordinates', 'natural_classes',
#!        'morphemes', 'phonemes', 'processes',
#!        'triggers', 'notes'
#!    )
#!    for fld in listflds:
#!        sdoc[fld] = [v for v in sdoc[fld] if v is not None]
    return (sdoc, tsvlang, allophones)

def proclist(processes):
    '''
    Return a `processdetails` list from spreadsheet `processes` section.

    Each item of `processes` is read by key (`proc_name`, `proc_type`,
    `description`, `optionality`, `directionality`, `alternation_type`,
    `undergoers`, `triggers`, `transparencies`, `opacities`) and converted
    into one detail dict. The part of `proc_name` up to and including its
    first hyphen is dropped. An `undergoers`/`triggers` value whose members
    cannot be split is reported with `print` and left out of the detail dict.
    '''
    deets = []
    for proc in processes:
        procname = proc['proc_name'].split('-', 1)[-1]
        deet = {
            'processname': procname,
            'processtype': proc['proc_type'],
            'description': proc['description'],
            'optionality': proc['optionality'],
            'directionality': proc['directionality'],
            'alternation_type': proc['alternation_type']
        }
        for fld in 'undergoers', 'triggers':
            if proc[fld] == 'NA':
                deet[fld] = [['NA'], {'type': 'TODO', 'positional_restrictions': 'TODO'}]
                # Nothing to split here. Without this `continue` the code
                # below would run with `data` from an earlier field or process
                # and overwrite the 'NA' entry.
                continue
            if isinstance(proc[fld], dict):
                data = proc[fld][fld]
            else:
                data = proc[fld][0][fld]
            try:
                members = [_clean(f) for f in data.split(',')]
            except (AttributeError, TypeError):
                # `data` is not a comma-separated string.
                print(f'failed for {fld}: "{proc[fld]}"')
                continue
            deet[fld] = [
                members,
                {
                    'type': 'TODO',
                    'positional_restrictions': 'TODO'
                }
            ]
        for new, old in ('transparent', 'transparencies'), ('opaque', 'opacities'):
            # Read each new field from its own old key; 'opaque' must not be
            # filled from 'transparencies'.
            deet[new] = [[proc[old], {
                'type': 'segmental',
                'positional_restrictions': 'None'
            }]]
        deets.append(deet)
    return deets

def _clean_procname(p):
    if p is None:
        return 'TODO: got None'
    else:
        return _clean(p if '-' not in p else p[p.index('-')+1:])

def phonlist(allophones):
    '''
    Return a `phonemes` list from an `allophone` set.

    Each item of `allophones` is handled by its length:

    * 2 items `(phoneme, allos)`: one environment with empty context that
      lists every allophone without process names.
    * 4 items `(phoneme, allos, envs, procs)`: one environment per
      comma-separated entry of `envs`, each listing the allophone/process
      pairs produced by `alloprocs`.
    * 5 items: not converted yet; a placeholder entry whose `phoneme` value
      starts with `STRING MAPPING:` is emitted so the item stays visible.
    * any other length: reported on stderr and skipped.

    Environments of items that share a phoneme are collected in one entry.
    '''
    phonemes = {}
    for pset in allophones:
        if len(pset) == 5:
            # This branch binds neither a phoneme nor environments, so it must
            # not reach the merge step below, which would otherwise reuse the
            # values of the previous item (or fail on the first one).
            placeholder = f'STRING MAPPING: "{pset}"'
            phonemes[placeholder] = {'phoneme': placeholder}
            continue
        if len(pset) == 4:
            phoneme, allos, envspec, proc = pset
            # The pairing does not depend on the environment: compute it once
            # so its stderr diagnostics are not repeated for every environment.
            pairs = list(alloprocs(allos, proc, phoneme))
            envs = []
            for env in envspec.split(','):
                if env == '@':
                    prec, foll = 'TODO:@', 'TODO:@' # TODO: check meaning of '@'
                else:
                    try:
                        prec, foll = env.split('_')
                    except ValueError:
                        # Not exactly one '_' in the environment.
                        # TODO: resolve how to handle this error
                        prec, foll = f'ERROR: env value: "{env}"', ''
                envs.append({
                    'preceding': prec,
                    'following': foll,
                    # Fresh dicts per environment: no objects are shared
                    # between the entries of the output.
                    'processes': [
                        {
                            'processnames': [_clean_procname(p)],
                            'allophone': _clean(a)
                        } for a, p in pairs
                    ]
                })
        elif len(pset) == 2:
            phoneme, allos = pset
            envs = [{
                'preceding': '', # TODO: NA or other empty value here?
                'following': '', # TODO: NA or other empty value here?
                'processes': [
                    {
                        'processnames': [],
                        'allophone': _clean(a)
                    } for a in allos.split(',')
                ]
            }]
        else:
            sys.stderr.write(f'Unexpected phoneme set length for "{pset}".')
            continue
        if phoneme in phonemes:
            phonemes[phoneme]['environments'] += envs
        else:
            phonemes[phoneme] = {'phoneme': phoneme, 'environments': envs}
    # TODO: sort phonemes
    return list(phonemes.values())

def yaml2newyaml(v1):
    '''
    Copy a version 1 YAML dict into version 2.

    `v1` must provide the keys `name`, `short_name`, `alternate_names`,
    `iso_codes`, `family`, `countries`, `coordinates`, `phonemes`,
    `allophones`, `nasal_harmony`, `tone`, `laryngeal_harmony` and `notes`;
    a missing key raises `KeyError`. Fields that are new in version 2 are
    initialised empty. `None` items are removed from the list-valued fields,
    and a list-valued field that is itself `None` becomes an empty list.
    '''
    sdoc = {
        'doctype': 'synthesis',
        'name': v1['name'],
        'glottolog_name': v1['name'], # TODO: new, check by hand
        'short_name': v1['short_name'],
        'alternate_names': v1['alternate_names'],
        'iso_codes': v1['iso_codes'],
        'glottolog_codes': [], # TODO: new, need to be added by hand
        'family': v1['family'],
        'countries': v1['countries'],
        'coordinates': v1['coordinates'],
        'natural_classes': [], # TODO: new
        'morphemes': [], # TODO: new?
        # `or []` tolerates a `phonemes` value of None.
        'phonemes': [{'phoneme': p} for p in (v1['phonemes'] or [])],
        'processdetails': [], # TODO: new?
        'triggers': [], # TODO: new?
        'transparent': [], # TODO: new?, include?
        'opaque': [], # TODO: new?, include?
         # TODO: following from v1 and not mentioned in new YAML draft
        'allophones': v1['allophones'],
        'nasal_harmony': v1['nasal_harmony'],
        'tone': v1['tone'],
        'laryngeal_harmony': v1['laryngeal_harmony'],
        'notes': v1['notes'], # TODO: not mentioned in new YAML draft
    }
    # Filter None values out of list values.
    listflds = (
        'alternate_names', 'iso_codes', 'glottolog_codes',
        'countries', 'coordinates', 'natural_classes',
        'morphemes', 'phonemes', 'processdetails',
        'triggers', 'notes'
    )
    for fld in listflds:
        values = sdoc[fld]
        # A field that is None as a whole cannot be iterated; treat it as empty.
        sdoc[fld] = [] if values is None else [v for v in values if v is not None]
    return sdoc

# Placeholder value for doc fields that have not been filled in yet.
TODO = 'TODO'

def ss2refdoc(lang):
    '''
    Stub that is not implemented yet: ignores `lang` and returns `None`.
    '''
    return None

def ss2synthdoc(lang):
    '''
    Produce a synthesis yaml doc from a lang from the input spreadsheet.

    Only header fields are filled in: `name` and `iso_codes` come from
    matching `langre` against the `lang` value of the synthesis section
    (`iso_codes` is the raw `iso` group), the remaining fields hold the
    `TODO` placeholder.
    '''
    langm = langre.match(lang['synthesis']['lang'])
    yd = {'doctype': 'synthesis'}
    yd['name'] = langm['name'].strip()
    yd['short_name'] = TODO
    yd['alternate_names'] = TODO
    yd['iso_codes'] = langm['iso']
    return yd

## Read `.tsv` and v1 `.yaml` files

Download, read, and process lang tabs (and existing v. 1 yaml files, if they exist). Set one or more tab indexes in `rng` to be checked for errors. Set `use_cached` to `True` if you want to use a previously-downloaded `.tsv` file instead of downloading from the input spreadsheet.

In [None]:
use_cached = True
langs = {}
#rng = [0, 2, 3, 5, 20, 29, 37, 40] # Indexes that have no errors so far
rng = [22]  # List of one or more tab indexes to process
for row in ssdf.iloc[rng].itertuples():
    # Convert the first doc of the existing version 1 yaml file, if there is one.
    if (yamldir / row.yaml).exists():
        with open(yamldir / row.yaml, 'r', encoding='utf-8') as fh:
            v1docs = list(yaml.safe_load_all(fh))
        v1synth = yaml2newyaml(v1docs[0])
    else:
        v1synth = {}

    tsvfile = langdir / f'{row.short}.tsv'
    if use_cached is not True or not tsvfile.exists():
        print(f"Requesting '{row.tabname}' lang tab from index {row.Index} and caching at {tsvfile}")
        r = requests.get(f'{spreadsheet.puburl}/pub?gid={row.gid}&single=true&output=tsv')
        # Validate and decode before the cache file is opened: a failed request
        # must not leave an error page or an empty file behind, because
        # `use_cached` would pick that file up on every later run.
        r.raise_for_status()
        # Replace Windows CRLF with Unix LF
        text = r.content.replace(b'\r\n', b'\n').decode('utf8')
        with open(tsvfile, 'w', encoding='utf-8') as out:
            out.write(text)
    try:
        v2synth, tsvlang, allophones = tsv2newyaml(tsvfile)
    except Exception as e:
        sys.stderr.write(f'ERROR: spreadsheet tab {row.tabname} failed.\n{e}')
        # Re-raise the active exception unchanged.
        raise
    langs[row.short] = {
        'v1synth': v1synth,
        'v2synth': v2synth,
        'tsv': tsvlang['synthesis']
    }


## Sample json dump

In [None]:
# Write the version 2 synthesis doc of one processed lang as indented JSON,
# keeping non-ASCII characters as they are.
lang = 'Nhandeva'
with open(f'{lang}.json', 'w', encoding='utf8') as out:
    out.write(json.dumps(langs[lang]['v2synth'], indent=2, ensure_ascii=False))
!cat Nhandeva.json