# Chibchan Ling-Py intents

In [1]:
from __future__ import unicode_literals, print_function, division
from lingpy import *
from collections import defaultdict
from lingpy.sequence.sound_classes import check_tokens
import os


## Loading the wordlist

In [2]:
# Read the wordlist from disk and report its dimensions:
# wl.width is printed as the language count, wl.height as the concept count,
# and len(wl) as the number of rows.
wl = Wordlist("./AutoCognates/wordlist.tsv")
summary = "Wordlist has {0} languages and {1} concepts across {2} rows."
print(summary.format(wl.width, wl.height, len(wl)))

Wordlist has 25 languages and 110 concepts across 2631 rows.


## Error checking

In [3]:
# Tally the problems reported by check_tokens over every row's 'tokens' value,
# keyed by the second field of each reported item, then print how many
# distinct keys were seen (0 means nothing was reported).
errors = defaultdict(int)
for _, tokens in wl.iter_rows('tokens'):
    for problem in check_tokens(tokens):
        kind = problem[1]
        errors[kind] = errors[kind] + 1
print(len(errors))

0


## Mutual coverage

In [4]:
from lingpy.compare.util import (
    mutual_coverage_check, mutual_coverage_subset)

# Walk the threshold downwards and report the first (i.e. largest) value for
# which mutual_coverage_check passes on the whole wordlist.
# The search starts at wl.height (the number of concepts in this wordlist)
# rather than a hard-coded 210: the ceiling then follows the data set instead
# of being too high for small wordlists or too low for large ones.
# NOTE(review): this assumes the check cannot pass for a threshold above the
# concept count -- confirm against the LingPy documentation.
for i in range(wl.height, 0, -1):
    if mutual_coverage_check(wl, i):
        print(
            "Minimal mutual coverage is at {0} concept pairs.".format(i))
        break
else:
    # No threshold passed, not even 1; say so instead of printing nothing.
    print("No mutual coverage found: the check fails even at a threshold of 1.")

Minimal mutual coverage is at 2 concept pairs.


## Average Mutual Coverage

In [5]:
from lingpy.compare.sanity import average_coverage

# Compute the average coverage of the wordlist and show it with two decimals.
coverage = average_coverage(wl)
print('{0:.2f}'.format(coverage))

0.91


In [None]:
# chibchan = ['Arhuaco', 'Teribe', 'SanBlasKuna', 'Rama', 'Pech', 'Ngabere', 
            #'MalekuJaika', 'Malayo', 'Dorasque', 'Cogui', 'Chimila', 'Chibcha',
            #'CentralTunebo', 'Cabecar', 'Buglere', 'Bribri', 'Boruca', 'Bari']
# misumalpam = ['LencaSalvador', 'LencaHonduras', 'Cacaopera', 'Mayangna', 'Ulwa', 'Miskito']
# Atanque is left out of both lists: not enough data

# wl.output('tsv', filename='chibchan', subset=True,
            #rows=dict(doculect = 'in '+str(chibchan)))

## Cognate identification
This process automatically finds cognates and calculates the alignments. Comparison with the judgement of CU2005 will be interesting.

In [6]:
# Build the LexStat object from the wordlist, reading segmented forms from the
# 'tokens' column; check=True turns on LingPy's error check of the data.
lex = LexStat(
    './AutoCognates/wordlist.tsv',
    segments='tokens',
    check=True,
)

# Compute the scorer with runs=10000 and write the object to disk.
# NOTE(review): ignore=[] presumably keeps all auxiliary data (such as the
# scorer) in the written file -- confirm against the LingPy documentation.
lex.get_scorer(runs=10000)
lex.output('tsv', filename='./AutoCognates/chibchan.bin', ignore=[])

# Cluster with the lexstat method (threshold 0.55, infomap clustering); the
# resulting cognate IDs go into the 'infomap' column, which later cells use
# as `ref`. Then write the clustered wordlist to disk.
lex.cluster(
    method="lexstat",
    threshold=0.55,
    cluster_method='infomap',
    ref="infomap",
)
lex.output('tsv', filename='./AutoCognates/chibchan-lexstat')

2021-04-06 14:27:15,608 [INFO] No obvious errors found in the data.
CORRESPONDENCE CALCULATION:   0%|          | 0/312.5 [00:00<?, ?it/s]2021-04-06 14:27:16,294 [INFO] Calculating alignments for pair Arhuaco / Arhuaco.
2021-04-06 14:27:16,312 [INFO] Calculating alignments for pair Arhuaco / Atanque.
2021-04-06 14:27:16,313 [INFO] Calculating alignments for pair Arhuaco / Bari.
2021-04-06 14:27:16,326 [INFO] Calculating alignments for pair Arhuaco / Boruca.
2021-04-06 14:27:16,341 [INFO] Calculating alignments for pair Arhuaco / Bribri.
2021-04-06 14:27:16,356 [INFO] Calculating alignments for pair Arhuaco / Buglere.
2021-04-06 14:27:16,371 [INFO] Calculating alignments for pair Arhuaco / Cabecar.
2021-04-06 14:27:16,385 [INFO] Calculating alignments for pair Arhuaco / Cacaopera.
CORRESPONDENCE CALCULATION:   3%|▎         | 9/312.5 [00:00<00:03, 85.07it/s]2021-04-06 14:27:16,400 [INFO] Calculating alignments for pair Arhuaco / CentralTunebo.
2021-04-06 14:27:16,416 [INFO] Calculating al

## Alignments

In [9]:
# Load the clustered wordlist; `ref` names the column holding the cognate
# sets and `segments` the column holding the segmented forms.
alm = Alignments(
    './AutoCognates/chibchan-lexstat.tsv',
    segments='tokens',
    ref='infomap',
)

# Align progressively, scoring with the scorer computed on `lex`
# (the LexStat cell must have been run first).
alm.align(scoredict=lex.cscorer, method='progressive')

# Write the aligned data to disk.
alm.output(
    'tsv',
    filename='./AutoCognates/chibchan-auto',
    prettify=False,
    ignore='all',
)

2021-04-06 15:25:20,908 [INFO] Data has been written to file <AutoCognates/chibchan-auto.tsv>.


In [12]:
from lingpy.convert.strings import write_nexus

# Reload the clustered wordlist and export it as a BEAST-mode nexus file.
# This rebinds `wl` to the LexStat output; the cells below rely on that.
# The nexus export seems not to be able to find the infomap column --
# presumably that is why `ref` is spelled in upper case here (TODO confirm).
wl = Wordlist("./AutoCognates/chibchan-lexstat.tsv")
nexus = write_nexus(
    wl,
    mode="beast",
    ref="INFOMAP",
    filename='./AutoCognates/chibchan-beast_ls.nex',
)

2021-04-06 15:28:01,368 [INFO] Data has been written to file <AutoCognates/chibchan-beast_ls.nex>.


In [13]:
import io
from lingpy.convert.strings import matrix2dst

# Compute distances between the taxa from the 'infomap' cognate sets
# (mode='swadesh') and convert the matrix, labelled with wl.taxa, to dst text.
distances = wl.get_distances(ref='infomap', mode='swadesh')
dst = matrix2dst(distances, wl.taxa)

# Write the dst text to disk as UTF-8.
with io.open('./AutoCognates/chibchan.dst', mode='w', encoding='utf8') as handle:
    handle.write(dst)

2021-04-06 15:28:16,541 [INFO] Successfully calculated dst.
