# Bases de datos sobre lenguas de Sudamérica

## A. SAILS
### https://sails.clld.org/

### Leemos los datos desde https://github.com/cldf-datasets/sails

#### Lenguas en SAILS

In [1]:
## usemos pandas

import pandas as pd

In [2]:
## load the SAILS language table (comma is already read_csv's default delimiter)

languages = pd.read_csv('languages.csv')

In [3]:
## show the languages table as loaded
languages

Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code
0,NOCODE_Apolista,Apolista,,-14.830000,-68.660000,apol1242,
1,NOCODE_Maipure,Maipure,,4.800000,-67.500000,maip1246,
2,aca,Achagua,,4.386490,-72.200500,acha1250,aca
3,acu,Achuar-Shiwiar,,-2.826455,-77.264050,achu1248,acu
4,agr,Aguaruna,,-5.300440,-77.921790,agua1253,agr
...,...,...,...,...,...,...,...
162,yrl,Nhengatu,,1.045027,-66.964553,nhen1239,yrl
163,yuq,Yuqui,,-16.787520,-64.943820,yuqu1240,yuq
164,yuz,Yuracaré,,-16.747880,-65.122410,yura1255,yuz
165,yvt,Yavitero,,2.800281,-68.084219,yavi1244,yvt


In [4]:
## keep only the identifier, Glottocode, name and coordinates of each language
## (the result is still a DataFrame, not a dictionary)

glotto_geo = languages.loc[:, ['ID', 'Glottocode', 'Name', 'Latitude', 'Longitude']]

In [5]:
## build the lookup dictionary: ID -> [Glottocode, Name, Latitude, Longitude]
## The columns are read straight from `languages`: this cell rebinds `glotto_geo`
## to a dict, so reading the columns from `glotto_geo` itself would make a second
## run of the cell fail.

id_iso = languages['ID'].tolist()
glotto = languages['Glottocode'].tolist()
names = languages['Name'].tolist()
lat = languages['Latitude'].tolist()
lon = languages['Longitude'].tolist()

## one [ID, Glottocode, Name, Latitude, Longitude] record per language
glotto_geo_pairs = [list(record) for record in zip(id_iso, glotto, names, lat, lon)]

## the first field (ID) becomes the key, the remaining four fields the value
glotto_geo = {record[0]: record[1:] for record in glotto_geo_pairs}

In [6]:
## inspect the dictionary built above (prints every entry)
glotto_geo

{'NOCODE_Apolista': ['apol1242', 'Apolista', -14.83, -68.66],
 'NOCODE_Maipure': ['maip1246', 'Maipure', 4.8, -67.5],
 'aca': ['acha1250', 'Achagua', 4.38649, -72.2005],
 'acu': ['achu1248', 'Achuar-Shiwiar', -2.8264549999999997, -77.26405],
 'agr': ['agua1253', 'Aguaruna', -5.30044, -77.92179],
 'ame': ['yane1238', "Yanesha'", -10.54539, -75.31497],
 'amr': ['amar1274', 'Amarakaeri', -12.49308, -70.55333],
 'apu': ['apur1254', 'Apurinã', -8.21692, -66.77141],
 'apy': ['apal1257', 'Apalaí', 1.4979200000000001, -54.7457],
 'arh': ['arhu1242', 'Arhuaco', 10.701839999999999, -73.63629],
 'arl': ['arab1268', 'Arabela', -1.9612599999999998, -75.06116999999999],
 'arn': ['mapu1245', 'Mapudungun', -38.73917, -71.27701],
 'arr': ['karo1305', 'Karo (Brazil)', -10.33424, -61.56908000000001],
 'arw': ['araw1276', 'Lokono', 5.6429525, -56.924344999999995],
 'auc': ['waor1240', 'Waorani', -1.1231200000000001, -76.68295],
 'awe': ['awet1244', 'Awetí', -12.933580000000001, -53.585190000000004],
 'ayc

In [7]:
## number of languages in SAILS (one dictionary entry per language ID)

len(glotto_geo)

167

#### Rasgos en SAILS

In [8]:
## load the SAILS feature values (comma is already read_csv's default delimiter)

features = pd.read_csv('values.csv')

In [9]:
## narrow the features table to the three columns used below
## Language_ID: language identifier (ISO code, or a SAILS code when there is no ISO code)
## Parameter_ID: name of the feature
## Value: value of the feature

features = features.loc[:, ['Language_ID', 'Parameter_ID', 'Value']]

In [14]:
## number of distinct features in SAILS

feature_names = {parameter_id for parameter_id in features['Parameter_ID']}
print(len(feature_names))

604


In [10]:
## SAILS language identifiers, i.e. the keys of glotto_geo

languages_sails = list(glotto_geo)

In [11]:
## container for the feature data: one (initially empty) inner dictionary per language
## D = {Language_ID: {Parameter_ID: Value, ...}, ...}

D = dict((language_id, {}) for language_id in languages_sails)

In [12]:
## fill D from the features table, one (language, feature, value) row at a time
## The three columns are walked in parallel as arrays instead of doing three
## label-based .loc lookups per row, which is much slower inside a Python loop.
## A Language_ID that has no entry in D still raises KeyError, as before.

for language_id, parameter_id, value in zip(
    features['Language_ID'].to_numpy(),
    features['Parameter_ID'].to_numpy(),
    features['Value'].to_numpy(),
):
    D[language_id][parameter_id] = value

In [16]:
## average number of features per language in SAILS
import numpy as np

feature_values = list(D.values())
feature_values_mean = np.mean(list(map(len, feature_values)))
print(feature_values_mean)

190.37125748502993
