# Chibchan LingPy attempts

In [3]:
from __future__ import unicode_literals, print_function, division
from lingpy import *
from collections import defaultdict
from lingpy.sequence.sound_classes import check_tokens
import os


## Loading the wordlist

In [21]:
# Read the full word list and report its dimensions
# (width -> languages, height -> concepts, len -> rows).
wl = Wordlist("../wordlist.tsv")
summary = "Wordlist has {0} languages and {1} concepts across {2} rows."
print(summary.format(wl.width, wl.height, len(wl)))



Wordlist has 25 languages and 110 concepts across 2631 rows.


## Error checking

In [22]:
# Count every item reported by check_tokens, keyed by the item's second
# element (presumably the offending token -- confirm against the LingPy docs).
# The printed number is how many distinct keys were collected; 0 means
# check_tokens reported nothing for any row.
errors = defaultdict(int)
for row_id, tokens in wl.iter_rows('tokens'):
    for problem in check_tokens(tokens):
        key = problem[1]
        errors[key] += 1
print(len(errors))



0


## Mutual coverage

In [23]:
from lingpy.compare.util import (
    mutual_coverage_check, mutual_coverage_subset)

# Find the largest threshold that still passes mutual_coverage_check by
# walking downwards and stopping at the first success.
# The search starts at the number of concepts in the word list (wl.height)
# instead of a hard-coded constant, so the cell stays correct when the
# data set grows or shrinks.
for i in range(wl.height, 0, -1):
    if mutual_coverage_check(wl, i):
        print(
            "Minimal mutual coverage is at {0} concept pairs.".format(i))
        break
else:
    # No threshold passed: say so explicitly instead of printing nothing.
    print("No threshold >= 1 passes the mutual coverage check.")

Minimal mutual coverage is at 2 concept pairs.


## Average Mutual Coverage

In [26]:
from lingpy.compare.sanity import average_coverage

# Average mutual coverage of the word list, shown with two decimals.
coverage = average_coverage(wl)
print('{0:.2f}'.format(coverage))

0.91


In [18]:
# Doculects written to the Chibchan subset file below.
chibchan = [
    'Arhuaco', 'Teribe', 'SanBlasKuna', 'Rama', 'Pech', 'Ngabere',
    'MalekuJaika', 'Malayo', 'Dorasque', 'Cogui', 'Chimila', 'Chibcha',
    'CentralTunebo', 'Cabecar', 'Buglere', 'Bribri', 'Boruca', 'Bari',
]
# Second doculect list; defined here but not used by the export in this cell.
misumalpam = [
    'LencaSalvador', 'LencaHonduras', 'Cacaopera',
    'Mayangna', 'Ulwa', 'Miskito',
]
# `atanque_` is left out of both lists: not enough data.

# The row filter is a condition string built from the list above
# ("in [...]") and applied to the `doculect` column.
doculect_filter = 'in ' + str(chibchan)
wl.output('tsv', filename='chibchan', subset=True,
          rows={'doculect': doculect_filter})

2021-02-27 20:20:24,813 [INFO] Data has been written to file <chibchan.tsv>.


## Cognate identification
This process automatically finds cognates and calculates the alignments. Comparison with the judgement of CU2005 will be interesting.

In [20]:
# Load the Chibchan subset; check=True runs LingPy's error check on load.
lex = LexStat('chibchan.tsv', check=True, segments='tokens')

# Compute the scorer (10000 runs), then save a copy with nothing ignored
# (ignore=[]) before clustering.
lex.get_scorer(runs=10000)
lex.output('tsv', filename='chibchan.bin', ignore=[])

# Cluster into cognate sets and store them in the `infomap` column.
# override=True replaces an existing `infomap` column without the interactive
# "[y/N]" prompt, so the notebook can run unattended (Restart & Run All).
lex.cluster(method="lexstat", threshold=0.55, ref="infomap",
            cluster_method='infomap', override=True)
lex.output('tsv', filename='chibchan-lexstat')

2021-02-27 20:21:02,227 [INFO] No obvious errors found in the data.
CORRESPONDENCE CALCULATION:   0%|          | 0/162.0 [00:00<?, ?it/s]2021-02-27 20:21:02,442 [INFO] Calculating alignments for pair Arhuaco / Arhuaco.
2021-02-27 20:21:02,454 [INFO] Calculating alignments for pair Arhuaco / Bari.
2021-02-27 20:21:02,465 [INFO] Calculating alignments for pair Arhuaco / Boruca.
2021-02-27 20:21:02,484 [INFO] Calculating alignments for pair Arhuaco / Bribri.
2021-02-27 20:21:02,494 [INFO] Calculating alignments for pair Arhuaco / Buglere.
2021-02-27 20:21:02,505 [INFO] Calculating alignments for pair Arhuaco / Cabecar.
2021-02-27 20:21:02,516 [INFO] Calculating alignments for pair Arhuaco / CentralTunebo.
2021-02-27 20:21:02,528 [INFO] Calculating alignments for pair Arhuaco / Chibcha.
2021-02-27 20:21:02,540 [INFO] Calculating alignments for pair Arhuaco / Chimila.
CORRESPONDENCE CALCULATION:   6%|▌         | 10/162.0 [00:00<00:01, 89.29it/s]2021-02-27 20:21:02,555 [INFO] Calculating ali

Column <infomap> already exists, do you want to override? [y/N]  y


2021-02-27 20:25:18,800 [INFO] Data has been written to file <chibchan-lexstat.tsv>.


## Alignments

In [21]:
# Align the clustered word list; `ref` names the column holding the cognate sets.
alm = Alignments(
    'chibchan-lexstat.tsv',
    ref='infomap',
    segments='tokens',
)
# Progressive alignment, scored with the scorer computed by `lex`.
alm.align(scoredict=lex.cscorer, method='progressive')
alm.output(
    'tsv',
    filename='../chibchan-aligned',
    ignore='all',
    prettify=False,
)

2021-02-27 20:27:27,629 [INFO] Data has been written to file <../chibchan-aligned.tsv>.


In [22]:
# Export the cognate sets in the `infomap` column as a BEAST nexus file.
# Original author's note: the nexus export seems not to be able to find the
# infomap column.
# The word list is loaded from the clustered file instead of reusing `wl`,
# whose contents depend on which cells ran before this one. This way the
# export always uses the cognate sets written by the clustering step.
from lingpy.convert.strings import write_nexus
wl_clustered = Wordlist('chibchan-lexstat.tsv')
nexus = write_nexus(wl_clustered, ref="infomap", mode="beast", filename='chibchan-beast.nex')

2021-02-27 20:27:46,868 [INFO] Data has been written to file <chibchan-beast.nex>.


In [23]:
import io
from lingpy.convert.strings import matrix2dst

# Reload the clustered word list (this rebinds `wl`), compute distances from
# the `infomap` cognate sets, and serialise them together with the taxa names.
wl = Wordlist('chibchan-lexstat.tsv')
distances = wl.get_distances(ref='infomap', mode='swadesh')
dst = matrix2dst(distances, wl.taxa)

# Write the serialised matrix as UTF-8 text.
with io.open('chibchan.dst', mode='w', encoding='utf8') as fp:
    fp.write(dst)

2021-02-27 20:28:17,894 [INFO] Successfully calculated dst.
