# SAILS

## (1) Datos de Glottolog

In [1]:
## usamos pandas para leer los datos

import pandas as pd

In [2]:
## Glottolog languages-and-dialects table (comma-separated CSV), see
## https://glottolog.org/meta/downloads
## Its 'glottocode' and 'macroarea' columns are used in the following cells.

langs = pd.read_csv('languages_and_dialects_geo.csv')

In [3]:
## Keep only the glottocode and macroarea columns, dropping every row in
## which either of them is missing. The restriction to the Americas is
## applied in the next cell.

macroarea = langs[['glottocode', 'macroarea']].dropna()

In [4]:
## Build a glottocode -> macroarea dictionary restricted to the Americas
## ('South America' and 'North America').
## The mapping is derived from `langs` rather than from the previous value of
## `macroarea`: this cell rebinds `macroarea` from a DataFrame to a dict, so
## reading its own output as input would make a second run of the cell fail.

americas = ('South America', 'North America')
glottocode_macroarea = langs[['glottocode', 'macroarea']].dropna()

macroarea = {
    code: area
    for code, area in dict(
        zip(glottocode_macroarea['glottocode'], glottocode_macroarea['macroarea'])
    ).items()
    if area in americas
}

In [5]:
print(len(macroarea))

2382


In [6]:
## familias de las Américas

In [11]:
## Load the languoid table (comma-separated CSV); it is previewed in the next cell.

languoid = pd.read_csv('languoid.csv')

In [12]:
languoid

Unnamed: 0,id,family_id,parent_id,name,bookkeeping,level,latitude,longitude,iso639P3code,description,markup_description,child_family_count,child_language_count,child_dialect_count,country_ids
0,3adt1234,afro1255,nort3292,3Ad-Tekles,False,dialect,,,,,,0,0,0,
1,aala1237,aust1307,ramo1244,Aalawa,False,dialect,,,,,,0,0,0,
2,aant1238,nucl1709,nort2920,Aantantara,False,dialect,,,,,,0,0,0,
3,aari1238,sout2845,ahkk1235,Aari-Gayil,False,family,,,aiz,,,0,2,0,
4,aari1239,sout2845,aari1238,Aari,False,language,5.95034,36.5721,aiw,,,0,0,0,ET
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25434,zuti1239,tupi1275,guaj1255,Guajajára of Zutiua,False,dialect,,,,,,0,0,0,
25435,zuwa1238,koia1260,omie1241,Zuwadza,False,dialect,,,,,,0,0,0,
25436,zwal1238,atla1278,shal1242,Zwall,False,dialect,,,,,,0,0,0,
25437,zyph1238,sino1245,nucl1757,Zyphe,False,language,22.52400,93.2640,zyp,,,0,0,2,IN MM


In [13]:
## Coordinates per languoid: keep id, latitude and longitude, dropping every
## row in which any of the three is missing.

location = languoid[['id', 'latitude', 'longitude']].dropna()

In [14]:
## Two lookups keyed by languoid id: id -> latitude and id -> longitude.
location_lat, location_lon = (
    dict(zip(location['id'], location[column]))
    for column in ('latitude', 'longitude')
)

In [15]:
## Merge the two lookups into a single id -> [latitude, longitude] dictionary.
location = {code: [lat, location_lon[code]] for code, lat in location_lat.items()}

In [16]:
len(location)

8437

In [17]:
## Languoid -> family: keep id and family_id, dropping every row in which
## either of them is missing.

id_family = languoid[['id', 'family_id']].dropna()

In [18]:
## Languoid id -> ISO 639-3 code: keep id and iso639P3code, dropping every
## row in which either of them is missing.

glotto_iso = languoid[['id', 'iso639P3code']].dropna()

In [19]:
## Turn the table into an id -> ISO code dictionary, keeping only the
## languoids that are keys of `macroarea` (the ones in the Americas).

glotto_iso = {
    code: iso
    for code, iso in zip(glotto_iso['id'], glotto_iso['iso639P3code'])
    if code in macroarea
}

In [20]:
## Turn the table into an id -> family_id dictionary covering every languoid,
## and derive ID_family: the same mapping restricted to the keys of `macroarea`.

id_family = dict(zip(id_family['id'], id_family['family_id']))
ID_family = {code: family for code, family in id_family.items() if code in macroarea}

In [21]:
len(id_family)

25014

In [22]:
id_family['huil1244']

'arau1255'

## (2) Datos de SAILS

In [23]:
## SAILS feature table (comma-separated CSV). The cells below treat the first
## column as the feature names and every other column as one language.

features_sails = pd.read_csv('features_sails.csv')

In [24]:
## Give the header-less first column (read by pandas as 'Unnamed: 0') the name 'features'.
features_sails = features_sails.rename({'Unnamed: 0': 'features'}, axis='columns')

In [25]:
len(list(features_sails.columns[1:]))

167

In [26]:
## Feature names, in table order.
features = features_sails['features'].tolist()

In [27]:
## language -> {feature name -> value}, one entry per language column
## (every column after the first, which holds the feature names).
features_pred = {
    language: dict(zip(features, features_sails[language]))
    for language in features_sails.columns[1:]
}

In [28]:
## Reduce every language's feature dictionary to the values that
##   * are not missing (NaN),
##   * belong to the selected feature group (value starts with the prefix), and
##   * do not end in 'NK' or 'NA'.
## 'ARGEX5' selects the argument-structure features; other groups of the
## table (e.g. 'TAME') can be analysed by changing the prefix.
feature_prefix = 'ARGEX5'
excluded_suffixes = ('NK', 'NA')

for language in features_pred:
    # Re-assigning an existing key is safe while iterating over the dict.
    features_pred[language] = {
        feature: value
        for feature, value in features_pred[language].items()
        # pd.notna must come first: NaN cells are floats, not strings.
        if pd.notna(value)
        and not value.endswith(excluded_suffixes)
        and value.startswith(feature_prefix)
    }

In [29]:
print(len(features_pred))

167


In [30]:
pd.DataFrame(list(features_pred['mapu1245'].items()),columns=['rasgos', 'valores de rasgos'])

Unnamed: 0,rasgos,valores de rasgos
0,ARGEX5-1,ARGEX5-1-C
1,ARGEX5-2,ARGEX5-2-C
2,ARGEX5-3,ARGEX5-3-C
3,ARGEX5-4,ARGEX5-4-C
4,ARGEX5-5,ARGEX5-5-D
5,ARGEX5-6,ARGEX5-6-C
6,ARGEX5-7,ARGEX5-7-C
7,ARGEX5-8,ARGEX5-8-C
8,ARGEX5-9,ARGEX5-9-B


In [31]:
features_pred['mapu1245']

{'ARGEX5-1': 'ARGEX5-1-C',
 'ARGEX5-2': 'ARGEX5-2-C',
 'ARGEX5-3': 'ARGEX5-3-C',
 'ARGEX5-4': 'ARGEX5-4-C',
 'ARGEX5-5': 'ARGEX5-5-D',
 'ARGEX5-6': 'ARGEX5-6-C',
 'ARGEX5-7': 'ARGEX5-7-C',
 'ARGEX5-8': 'ARGEX5-8-C',
 'ARGEX5-9': 'ARGEX5-9-B'}

In [32]:
## Keep only the languages that still have exactly 9 feature values.
features_pred = {
    language: values
    for language, values in features_pred.items()
    if len(values) == 9
}

In [33]:
print(len(features_pred))

88


In [34]:
for language in features_pred.keys():
    print(language,len(features_pred[language]))

agua1253 9
yane1238 9
apur1254 9
apal1257 9
arhu1242 9
mapu1245 9
karo1305 9
araw1276 9
awet1244 9
cent2142 9
bora1263 9
boro1282 9
baur1253 9
chip1262 9
mose1249 9
cavi1250 9
chay1248 9
chol1284 9
coca1259 9
colo1256 9
cube1242 9
desa1247 9
emer1243 9
nort2972 9
guat1253 9
hixk1239 9
inga1252 9
iton1250 9
jama1261 9
hupd1244 9
juru1256 9
kama1373 9
kari1311 9
daww1239 9
awac1239 9
kano1245 9
leco1242 9
lako1248 9
sate1243 9
mats1244 9
moco1246 9
maku1278 9
wich1262 9
mund1330 9
movi1243 9
moch1259 9
pare1272 9
wari1268 9
paez1247 9
pila1245 9
puin1248 9
hual1241 9
sout2991 9
sout2990 9
sant1432 9
tena1240 9
yauy1235 9
ayac1239 9
cusc1236 9
caja1238 9
imba1240 9
nort2980 9
paca1245 9
sanm1289 9
huay1240 9
sala1272 9
jauj1238 9
cane1242 9
rikb1245 9
saba1268 9
nina1238 9
ship1254 9
saki1248 9
tari1256 9
tehu1242 9
tapi1253 9
trum1247 9
trio1238 9
ikpe1245 9
urar1246 9
uruu1244 9
wara1303 9
wayo1238 9
xava1240 9
kwaz1243 9
yami1256 9
nhen1239 9
yura1255 9


In [35]:
## One list of feature names per retained language.
features = []

for language in features_pred.keys():
    features.append(list(features_pred[language]))

In [36]:
## Feature names shared by every retained language.
first_language, other_languages = features[0], features[1:]
features = set(first_language).intersection(*other_languages)

In [37]:
len(features)

9

familias de las Américas

In [39]:
len(id_family)

25014

In [40]:
## Family of each retained language, looked up in ID_family; languages that
## are not keys of ID_family are labelled 'unknown'.
id_family = {key: ID_family.get(key, 'unknown') for key in features_pred}

In [41]:
len(id_family)

88

In [42]:
## Group the languages that share a family: family -> [languages],
## visiting the languages in sorted order.

from collections import defaultdict

v = defaultdict(list)

for key in sorted(id_family):
    value = id_family[key]
    v[value].append(key)

In [43]:
## Copy the defaultdict built above into a regular dict (family -> [languages]).
families = dict(v)

In [44]:
len(families['araw1281'])

6

In [45]:
families

{'jiva1245': ['agua1253'],
 'cari1283': ['apal1257', 'hixk1239', 'ikpe1245', 'trio1238'],
 'araw1281': ['apur1254',
  'araw1276',
  'baur1253',
  'pare1272',
  'tari1256',
  'yane1238'],
 'chib1249': ['arhu1242'],
 'barb1265': ['awac1239', 'colo1256'],
 'tupi1275': ['awet1244',
  'coca1259',
  'emer1243',
  'juru1256',
  'kama1373',
  'kari1311',
  'karo1305',
  'maku1278',
  'mund1330',
  'nhen1239',
  'saki1248',
  'sate1243',
  'tapi1253',
  'wayo1238'],
 'quec1387': ['ayac1239',
  'caja1238',
  'cusc1236',
  'hual1241',
  'huay1240',
  'imba1240',
  'inga1252',
  'jauj1238',
  'nort2980',
  'paca1245',
  'sala1272',
  'sanm1289',
  'sant1432',
  'sout2990',
  'sout2991',
  'tena1240',
  'yauy1235'],
 'bora1262': ['bora1263'],
 'boro1281': ['boro1282'],
 'nucl1710': ['cane1242', 'rikb1245', 'xava1240'],
 'pano1259': ['cavi1250', 'mats1244', 'ship1254', 'yami1256'],
 'ayma1253': ['cent2142'],
 'cahu1265': ['chay1248'],
 'uruc1242': ['chip1262', 'uruu1244'],
 'hibi1242': ['chol1284'],

In [46]:
import numpy as np

## Print the number of languages in each family, then the mean family size.
L = [len(members) for members in families.values()]
for fam in families.keys():
    print(fam, len(families[fam]))
print(np.mean(L))

jiva1245 1
cari1283 4
araw1281 6
chib1249 1
barb1265 2
tupi1275 14
quec1387 17
bora1262 1
boro1281 1
nucl1710 3
pano1259 4
ayma1253 1
cahu1265 1
uruc1242 2
hibi1242 1
tuca1253 2
nada1235 2
unknown 14
araw1282 1
namb1299 2
arau1255 1
guai1249 2
yano1268 1
choc1280 1
chon1288 1
chap1271 1
mata1289 1
3.259259259259259


## (3) entropy x feature

In [47]:
features_pred['mapu1245']

{'ARGEX5-1': 'ARGEX5-1-C',
 'ARGEX5-2': 'ARGEX5-2-C',
 'ARGEX5-3': 'ARGEX5-3-C',
 'ARGEX5-4': 'ARGEX5-4-C',
 'ARGEX5-5': 'ARGEX5-5-D',
 'ARGEX5-6': 'ARGEX5-6-C',
 'ARGEX5-7': 'ARGEX5-7-C',
 'ARGEX5-8': 'ARGEX5-8-C',
 'ARGEX5-9': 'ARGEX5-9-B'}

In [94]:
from math import log, e
def entropy(labels, base=None):
    """Return the Shannon entropy of the empirical distribution of `labels`.

    Parameters
    ----------
    labels : array-like
        Observed values; every distinct value counts as one outcome and its
        probability is its relative frequency in `labels`.
    base : float, optional
        Base of the logarithm. Defaults to e, i.e. the result is in nats.

    Returns
    -------
    float
        -sum(p * log(p)) over the distinct labels, in the requested base.
        The result is a non-negative zero (0.0, never -0.0) when `labels`
        is empty or contains a single distinct value.
    """
    _, counts = np.unique(labels, return_counts=True)
    norm_counts = counts / counts.sum()
    base = e if base is None else base
    # Negating a sum that is exactly 0.0 yields IEEE negative zero, which
    # is displayed as "-0.0"; adding 0.0 maps it to +0.0 and leaves every
    # non-zero result unchanged.
    return -(norm_counts * np.log(norm_counts) / np.log(base)).sum() + 0.0

In [104]:
#from scipy.stats import entropy

def entropy_area(feature,area):
    """Entropy of one feature's values across a group of languages.

    feature -- feature name, used as key into each language's entry of the
               module-level `features_pred` dictionary
    area    -- iterable of language identifiers (keys of `features_pred`)

    Returns `entropy(...)` of the collected values; no base is passed, so
    the default base of `entropy` applies.
    """
    observed = []
    for lang in area:
        observed.append(features_pred[lang][feature])
    return entropy(observed)

In [105]:
entropy_area('ARGEX5-1',list(features_pred.keys()))

0.10847133903591605

In [106]:
entropy_area('ARGEX5-1',families['araw1281'])

-0.0

## (4) entropy x feature x family

In [111]:
## Entropy of each feature within each family that has at least 3 languages:
## D[feature][family] = entropy of the feature's values over that family.
## The feature names are taken from the 'mapu1245' entry.
D = {}

for feature in features_pred['mapu1245'].keys():
    D[feature] = {}
    for fam in families.keys():
        if len(families[fam]) < 3:
            continue
        D[feature][fam] = entropy_area(feature, families[fam])

In [112]:
pd.DataFrame.from_dict(D)

Unnamed: 0,ARGEX5-1,ARGEX5-2,ARGEX5-3,ARGEX5-4,ARGEX5-5,ARGEX5-6,ARGEX5-7,ARGEX5-8,ARGEX5-9
cari1283,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.562335
araw1281,-0.0,-0.0,0.450561,0.636514,0.450561,0.450561,0.450561,0.693147,-0.0
tupi1275,-0.0,-0.0,0.257319,0.410116,0.257319,0.257319,0.898205,0.655976,0.257319
quec1387,-0.0,-0.0,0.223718,-0.0,0.223718,-0.0,-0.0,0.649248,-0.0
nucl1710,-0.0,0.636514,-0.0,-0.0,0.636514,0.636514,1.098612,-0.0,0.636514
pano1259,-0.0,-0.0,-0.0,-0.0,0.562335,-0.0,0.562335,-0.0,-0.0
unknown,-0.0,0.410116,0.410116,0.693147,1.028514,0.892118,0.9557,1.060944,0.655976


## (5) entropy x feature x geographical area

In [136]:
## Table assigning an area to each language (comma-separated CSV).
areas_birchall = pd.read_csv('areas_birchall.csv')

In [137]:
## Turn the table into a dictionary: 'languages' column -> 'areas' column.
areas_birchall = dict(zip(areas_birchall['languages'],areas_birchall['areas']))

In [138]:
## Restrict the mapping to the retained languages; a retained language with
## no area entry raises KeyError.
areas_birchall = {language: areas_birchall[language] for language in features_pred}

In [139]:
## Group the languages that share an area: area -> [languages],
## visiting the languages in sorted order.

from collections import defaultdict

v = defaultdict(list)

for key in sorted(areas_birchall):
    value = areas_birchall[key]
    v[value].append(key)

In [140]:
## Copy the defaultdict built above into a regular dict (area -> [languages]).
areas = dict(v)

In [141]:
## Normalised entropy of each feature within each area that has at least
## 3 languages. Every entropy is divided by entropy(range(n)), the entropy of
## n pairwise-different values, where n is the number of languages in the area.
## That normaliser depends only on the area, so it is computed once per area
## instead of once per (feature, area) pair. The feature names are taken from
## the 'mapu1245' entry.

area_max_entropy = {
    area: entropy(range(len(languages)))
    for area, languages in areas.items()
    if len(languages) >= 3
}

D_areas = {
    feature: {
        area: entropy_area(feature, areas[area]) / max_entropy
        for area, max_entropy in area_max_entropy.items()
    }
    for feature in features_pred['mapu1245'].keys()
}

In [142]:
pd.DataFrame.from_dict(D_areas)

Unnamed: 0,ARGEX5-1,ARGEX5-2,ARGEX5-3,ARGEX5-4,ARGEX5-5,ARGEX5-6,ARGEX5-7,ARGEX5-8,ARGEX5-9
4,-0.0,0.204417,0.237672,0.204417,0.477485,0.370902,0.40674,0.311693,0.074232
2,-0.0,0.210758,-0.0,-0.0,0.210758,0.210758,0.409223,-0.0,0.210758
1,0.141182,0.217322,0.217322,-0.0,0.389973,0.217322,0.277528,0.348225,-0.0
5,-0.0,0.086605,0.113992,0.209083,0.214502,0.197275,0.308722,0.326587,0.171045
3,-0.0,-0.0,0.21382,0.127844,0.265767,0.164477,0.164477,0.229156,-0.0
6,-0.0,0.15876,0.15876,0.31265,0.24108,0.311183,0.426396,0.28969,0.452856


## (6) entropy x feature x birchall area

In [129]:
## Collapse the areas into two groups: areas 1, 3, 4 and 7 -> 0, every other area -> 1.
## NOTE(review): "WE" presumably stands for West/East — confirm.
areas_birchall_WE = {}

for language in features_pred.keys():
    areas_birchall_WE[language] = 0 if areas_birchall[language] in [1,3,4,7] else 1

In [130]:
## Group the languages by their 0/1 group: group -> [languages],
## visiting the languages in sorted order.

from collections import defaultdict

v = defaultdict(list)

for key in sorted(areas_birchall_WE):
    value = areas_birchall_WE[key]
    v[value].append(key)

In [131]:
## Copy the defaultdict built above into a regular dict (group -> [languages]).
areas = dict(v)

In [132]:
## Entropy of each feature within each group of `areas` that has at least
## 3 languages: D_areas[feature][group]. The feature names are taken from the
## 'mapu1245' entry; `fam` iterates over the keys of `areas` here.
D_areas = {}

for feature in features_pred['mapu1245'].keys():
    D_areas[feature] = {}
    for fam in areas.keys():
        if len(areas[fam]) < 3:
            continue
        D_areas[feature][fam] = entropy_area(feature, areas[fam])

In [133]:
pd.DataFrame.from_dict(D_areas)

Unnamed: 0,ARGEX5-1,ARGEX5-2,ARGEX5-3,ARGEX5-4,ARGEX5-5,ARGEX5-6,ARGEX5-7,ARGEX5-8,ARGEX5-9
0,0.175975,0.456223,0.674699,0.488375,1.246212,0.874616,0.898536,0.922995,0.102967
1,-0.0,0.319691,0.319691,0.656712,0.669225,0.628813,1.027119,0.979479,0.677086
