# Networks and Laplacian Spectrum: a case study on the Arawakan linguistic family 

## (1) Lenguas de las Américas
### Extraemos las lenguas de América del Sur y de América del Norte desde los datos de Glottolog
### https://glottolog.org/meta/downloads

In [17]:
## usamos pandas para leer .csv

import pandas as pd 

In [18]:
## classification by macro-area: load the languages/dialects table

macroarea = pd.read_csv(
    'languages_and_dialects_geo.csv',
    delimiter=',',
)

In [19]:
## drop every row that has missing data (NaN) in any column

macroarea = macroarea.dropna(axis=0, how='any')

In [20]:
## keep a two-column dataframe: iso code vs macro-area

kept_columns = ['isocodes', 'macroarea']
macroarea = macroarea[kept_columns]

In [21]:
## turn the data into an iso -> macro-area dictionary,
## then keep only the 'South America' and 'North America' entries.
## The dictionary is built first so that, for a repeated iso code, the last row wins
## before the filter is applied.

iso_to_area = dict(zip(macroarea['isocodes'], macroarea['macroarea']))
macroarea = {
    iso: area
    for iso, area in iso_to_area.items()
    if area in ['South America', 'North America']
}

In [22]:
## how many languages do we have?

print(f'número de lenguas: {len(macroarea)}')

número de lenguas: 1208


In [24]:
## from this set of 1208 languages, keep those present in the UDHR corpus
## https://www.unicode.org/udhr/index.html
##
## `languages` maps an iso code to the list of non-empty, stripped lines of its
## UDHR text file; codes without a file under udhr/ are skipped.

languages = {}

for language in macroarea:
    try:
        # The context manager closes the handle even if reading fails.
        # The encoding is explicit so decoding does not depend on the platform
        # default (the corpus is the "UDHR in Unicode" plain-text release,
        # presumably UTF-8 -- confirm against the downloaded files).
        with open('udhr/' + 'udhr_' + language + '.txt', 'r', encoding='utf-8') as file:
            language_text = file.read().split('\n')
    except FileNotFoundError:
        # No UDHR translation for this iso code: not an error, just skip it.
        continue
    stripped_lines = [line.strip() for line in language_text]
    languages[language] = [line for line in stripped_lines if len(line) > 0]

In [25]:
## new number of languages: those for which a UDHR text file was found

print(len(languages))

85


In [27]:
## Mapudungun (iso code 'arn')
## only show the first 10 elements of the list

languages['arn'][:10]

['Universal Declaration of Human Rights - Mapudungun',
 '© 1996 – 2009 The Office of the High Commissioner for Human Rights',
 'This plain text version prepared by the “UDHR in Unicode”',
 'project, https://www.unicode.org/udhr.',
 '---',
 'Kom Mapu Fijke Az Tañi Az Mogeleam',
 'Tuwvlzugun',
 '("Preámbulo" pi ta wigka)',
 'Kimnieel fij mapu mew tañi kimgen kvme felen kisugvnew felen xvr kvme mvlen. Tvfaci zugu ñi mvleken mvleyem yamvwvn ka xvr kvme nor felen kom pu reñmawke ce mew.',
 'Gewenonmu yamuwvn, zuamgewenonmu kvme felen, goymagenmu nor felen mvley re jazkvnkawvn: Fey mew mvley xvrvmzugu kom pu ce tañi kvme mogeleam kisuke ñi feyentun mew, kisu ñi rakizuam mew ka ñi wimtun mew ñi mvleal egvn.']

### Descripción del corpus

In [32]:
## dos funciones que permiten "limpiar" los datos con el fin de construir redes.
## la primera tokeniza strings. Definimos una palabra como la cadena de caracteres entre dos espacios en blanco.
## la segunda elimina símbolos raros, cadenas vacías y números, y remueve los encabezados

def tokenize(s):
    """Split *s* into tokens on single space characters.

    Only the plain space ' ' is a separator: consecutive spaces produce empty
    tokens, and other whitespace (tabs, newlines) stays inside the tokens.
    """
    separator = ' '
    tokens = s.split(separator)
    return tokens

# Characters deleted from every token. The backslash is written as an explicit
# escape ('\\'); the apostrophe is NOT in this set, which is why tokens made
# only of two apostrophes are dropped separately in clean().
_STRIP_TABLE = str.maketrans(dict.fromkeys('`!"#$%&\\¿()*+,-./:;<=>?@[]_{|}'))

# Number of leading lines (file header) to discard, per iso code.
# Codes not listed here use _DEFAULT_HEADER_LINES.
_HEADER_LINES = {'zro': 6, 'tca': 7, 'gyr': 9}
_DEFAULT_HEADER_LINES = 5


def clean(L):
    """Return the cleaned, tokenized text of language *L*.

    Reads the lines stored in the module-level ``languages[L]`` and, for each
    line, tokenizes it with ``tokenize``, lowercases every token, deletes the
    characters in ``_STRIP_TABLE`` and drops tokens that end up empty, equal
    to two apostrophes, equal to the lone mark compared below, or made only
    of digits.

    Lines are never dropped at this stage, even when no token survives (e.g.
    the '---' separator becomes an empty list): the header is removed
    afterwards by position, so the line count must stay unchanged.

    Args:
        L: iso code; must be a key of ``languages``.

    Returns:
        A list of sentences, each a list of token strings, with the leading
        header lines removed.

    Raises:
        KeyError: if *L* is not in ``languages``.
    """
    sentences = []
    for line in languages[L]:
        stripped_tokens = (w.lower().translate(_STRIP_TABLE) for w in tokenize(line))
        sentences.append([
            w for w in stripped_tokens
            if w and w != "''" and w != '̃' and not w.isdigit()
        ])
    return sentences[_HEADER_LINES.get(L, _DEFAULT_HEADER_LINES):]

In [33]:
## apply the cleaning function to every language: iso code -> cleaned sentences.

clean_languages = {language: clean(language) for language in languages}

In [34]:
## for each language, show what its first cleaned sentence looks like.

for language, sentences in clean_languages.items():
    print(language, sentences[0])

acu ['kintati', 'diciembre', 'nantuti', 'musachtin', 'apu', 'ainau', 'iruntrar', 'tu', 'aarmi', 'tusar', 'nunia', 'aarar', 'mash', 'nungkanam', 'pujuinau', 'angkan', 'pengker', 'pujusarti', 'tusar', 'aararmiayi']
agr ['ashi', 'aents', 'yakat', 'muun', 'aidaunmaya', 'ijunjamunum', 'tsawan', 'de', 'diciembre', 'tin', 'etsejau', 'ainawai', 'juju', 'ashi', 'aents', 'aidaunum', 'uminkatin', 'ati', 'tusa', 'nunu', 'tsawantai', 'jintiajajui', 'chichaman', 'dutika', 'ashi', 'nugkanum', 'iwainaju', 'ainawai', 'aents', 'nii', 'anentaibau', 'nii', 'wakejamu', 'yupichu', 'dutikashmin', 'chichajamunum', 'ayamkagtaun', 'tuja', 'tiaju', 'ainawai', 'juju', 'chicham', 'pegkeg', 'jintiagmawa', 'juka', 'juwig', 'nagkankashti', 'ashi', 'nugkanum', 'dapampaejati', 'yakat', 'muun', 'aidaunum', 'yakat', 'piipich', 'aidaunmashkam', 'papi', 'aujtainmash', 'ashi', 'tutinjati', 'tiajui', 'tuja', 'papinum', 'agajag', 'sujuktinme', 'tiajui', 'ashi', 'aents', 'aidau', 'aujus', 'dekatnume', 'tusa', 'makichkish', 'ug

In [35]:
## contamos palabras 

from collections import Counter

# Per-language word statistics, all keyed by iso code:
#   words_entropy: flat list of every token of the language
#   words:         [token count, type count, Counter of token frequencies]
#   mean_tokens:   token count (despite the name, a per-language total)
#   mean_types:    number of distinct tokens (likewise a per-language total)
words_entropy = {}
words = {}
mean_tokens = {}
mean_types = {}
for language, sentences in clean_languages.items():
    # Flatten the sentences once and reuse the result for every statistic.
    flat_tokens = [token for sentence in sentences for token in sentence]
    token_count = len(flat_tokens)
    type_count = len(set(flat_tokens))

    words_entropy[language] = flat_tokens
    words[language] = [token_count, type_count, Counter(flat_tokens)]
    mean_tokens[language] = token_count
    mean_types[language] = type_count
    # columns: iso code, number of sentences, tokens, types
    print(language, len(sentences), token_count, type_count)

acu 92 1676 559
agr 90 1811 827
amc 87 1496 672
amr 93 1407 421
qva 92 1715 762
arl 94 1973 530
qxu 94 1124 542
cni 92 1305 515
prq 75 626 258
kwi 202 1294 733
quy 93 1279 645
qug 239 1409 702
boa 96 1567 704
qvc 90 1153 599
qud 91 933 456
cbu 91 1290 574
cot 94 1532 715
hns 94 2221 490
cbr 92 2204 542
cbs 76 626 258
ayr 91 925 610
maz 91 1544 310
cbi 100 3120 959
cic 93 554 301
csa 96 1699 425
qxa 90 1477 738
zam 95 1409 227
quz 94 1128 664
ike 70 857 620
ese 68 763 529
cab 94 1330 476
gyr 78 1159 459
ztu 92 1342 326
qvh 90 1934 850
hus 94 1554 335
qwh 91 1326 606
mzi 99 1250 570
kal 91 1039 682
cak 90 2514 454
kek 91 1520 407
quc 90 2171 501
mam 91 1527 440
arn 91 1712 390
qvm 92 1373 642
mcf 95 2373 600
mxv 94 1336 396
ote 91 1819 317
mic 91 1330 573
miq 85 1634 394
huu 91 1294 385
nav 91 1601 779
not 91 1100 383
qxn 89 1604 668
qvn 90 1346 680
ojb 93 1126 587
chj 91 1683 423
pbb 90 1467 700
top 92 1554 571
pap 90 1878 480
gug 83 1154 576
cpu 92 2352 862
ppl 91 1856 354
tsz 91 1279 

## (2) Laplacian Spectrum
### usamos redes para extraer rasgos de lenguas de las Américas. Nuestros rasgos son valores propios 
### https://en.wikipedia.org/wiki/Eigenvalues_and_eigenvectors