# Bases de datos sobre lenguas de Sudamérica

## A. SAILS
### https://sails.clld.org/

### Leemos los datos desde https://github.com/cldf-datasets/sails

#### Lenguas en SAILS

In [2]:
## use pandas for the tables and numpy for numeric summaries

import numpy as np
import pandas as pd

In [3]:
## load the SAILS languages table (comma is already the default separator)

languages = pd.read_csv('languages.csv')

In [4]:
## show the languages table
languages

Unnamed: 0,ID,Name,Macroarea,Latitude,Longitude,Glottocode,ISO639P3code
0,NOCODE_Apolista,Apolista,,-14.830000,-68.660000,apol1242,
1,NOCODE_Maipure,Maipure,,4.800000,-67.500000,maip1246,
2,aca,Achagua,,4.386490,-72.200500,acha1250,aca
3,acu,Achuar-Shiwiar,,-2.826455,-77.264050,achu1248,acu
4,agr,Aguaruna,,-5.300440,-77.921790,agua1253,agr
...,...,...,...,...,...,...,...
162,yrl,Nhengatu,,1.045027,-66.964553,nhen1239,yrl
163,yuq,Yuqui,,-16.787520,-64.943820,yuqu1240,yuq
164,yuz,Yuracaré,,-16.747880,-65.122410,yura1255,yuz
165,yvt,Yavitero,,2.800281,-68.084219,yavi1244,yvt


In [5]:
## keep only the identifier, Glottocode, name and coordinate columns
## (still a DataFrame here; it is turned into a dictionary in the next cell)

glotto_geo = languages.loc[:, ['ID', 'Glottocode', 'Name', 'Latitude', 'Longitude']]

In [6]:
## turn the table into a dictionary keyed by language ID:
## ID -> [Glottocode, Name, Latitude, Longitude]

# one plain list per column
id_iso, glotto, names, lat, lon = (
    list(glotto_geo[column])
    for column in ('ID', 'Glottocode', 'Name', 'Latitude', 'Longitude')
)

# one [ID, Glottocode, Name, Latitude, Longitude] record per row
glotto_geo_pairs = [list(record) for record in zip(id_iso, glotto, names, lat, lon)]

# rebinds glotto_geo: from here on it is the dictionary, not the DataFrame
glotto_geo = {record[0]: record[1:] for record in glotto_geo_pairs}

In [7]:
## number of languages in SAILS (one dictionary entry per language ID)

len(glotto_geo)

167

#### Mapa de las lenguas en SAILS

In [8]:
## mapa de las lenguas
## usemos lingtypology
## https://oneadder.github.io/lingtypology/html/index.html

!pip install lingtypology

Collecting lingtypology
  Downloading lingtypology-0.8.6-py3-none-any.whl (705 kB)
[K     |████████████████████████████████| 705 kB 12.0 MB/s eta 0:00:01     |██████████▊                     | 235 kB 12.0 MB/s eta 0:00:01
[?25hCollecting selenium
  Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)
[K     |████████████████████████████████| 904 kB 19.6 MB/s eta 0:00:01
[?25hCollecting pyglottolog
  Downloading pyglottolog-3.2.2-py2.py3-none-any.whl (703 kB)
[K     |████████████████████████████████| 703 kB 64.5 MB/s eta 0:00:01
Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 426 kB/s  eta 0:00:01
[?25hCollecting colour
  Downloading colour-0.1.5-py2.py3-none-any.whl (23 kB)
Collecting branca
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Collecting csvw>=1.5.6
  Downloading csvw-1.8.0-py2.py3-none-any.whl (31 kB)
Collecting purl
  Downloading purl-1.5-py2.py3-none-any.whl (10 kB)
Collecting who

In [9]:
## lingtypology is imported here, after the install cell above has made it available
import lingtypology

## Glottocode of every language: the first field of each glotto_geo entry
glotto_languages = tuple(entry[0] for entry in glotto_geo.values())

m = lingtypology.LingMap(glotto_languages, glottocode=True)
## disabled: colour the markers by endangerment status (unesco_aes and all_dict
## are not defined in the visible part of this notebook)
#m.add_features([unesco_aes[v] for v in list(all_dict.values())],radius=5, opacity=1,factor=('not endangered', 'threatened', 'shifting', 'moribund', 'nearly extinct', 'extinct'),colors=lingtypology.gradient(6, 'white', 'red'))
m.start_location = 'South America'
m.legend_title = 'Languages in SAILS'
m.create_map()

In [10]:
## save the map as map_sails.html
m.save('map_sails.html')

#### Rasgos en SAILS

In [11]:
## load the SAILS feature values (comma is already the default separator)

features = pd.read_csv('values.csv')

In [12]:
## keep only the three columns we need from the features dataframe
## Language_ID: language identifier (ISO code, or a SAILS code when there is no ISO code)
## Parameter_ID: name of the feature
## Value: value of the feature

features = features.loc[:, ['Language_ID', 'Parameter_ID', 'Value']]

In [13]:
## number of distinct features in SAILS

feature_names = {parameter for parameter in features['Parameter_ID']}
print(len(feature_names))

604


In [14]:
## SAILS language IDs (the keys of glotto_geo)

languages_sails = [*glotto_geo]

In [15]:
## dictionary that will hold the data
## D = {Language_ID: {Parameter_ID: Value, ...}}

# every language gets its own, initially empty, feature dictionary
D = dict((language_id, {}) for language_id in languages_sails)

In [16]:
## fill D from the features dataframe: one (language, feature, value) triple per row

## walk the three columns in parallel instead of doing three .loc lookups per row;
## a Language_ID that is not a key of D still raises KeyError
for language_id, parameter_id, value in zip(
    features['Language_ID'], features['Parameter_ID'], features['Value']
):
    D[language_id][parameter_id] = value

In [17]:
## mean number of coded features per language in SAILS
## (numpy is imported as np in the notebook's import cell)

feature_values = list(D.values())
feature_values_mean = np.mean([len(language_features) for language_features in feature_values])
print(feature_values_mean)

190.37125748502993


In [18]:
## export the data to a csv file: one column per language, one row per feature

pd.DataFrame.from_dict(D, orient='columns').to_csv('features_sails.csv')