## Libraries etc.

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import networkx as nx
import subprocess

import os
import rdata
import dill as pickle

from collections import Counter, defaultdict
from datetime import datetime
from matplotlib import pyplot as plt
from itertools import combinations
#
from multiprocessing import Pool
import rdata
import warnings
from scipy.stats import entropy
warnings.filterwarnings("ignore")
#
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# files
pickle_dir = './pickles/'
imputed_data_file = pickle_dir + 'imputed_data.xlsx'
pcaspace_dir = pickle_dir + '/pcaspaces/'
# parameters
jaccard_threshold=0.9
require_ft_present = True
min_n_tokens = 20
sampling_method = 'convenience'

## Part 2: analysis

In [3]:
df = pd.read_excel(imputed_data_file)

In [4]:
# sampling method
tot_wc = {l["doculect"] : (l['count']) for i,l in pd.read_csv('word_counts.csv').iterrows()}
doreco_metadata = pd.read_csv('doreco_languages_metadata_sep2023.csv')
doreco_metadata

Unnamed: 0,Language,Glottocode,iso-639-3,Family,fam_glottocode,Area,Creator,Latitude,Longitude,Archive,...,Audio license,DOI,Gloss,Extended speakers,Extended word tokens,Extended texts,Core speakers,core word tokens,Core texts,Years of recordings in core set
0,Anal,anal1239,anm,Sino-Tibetan,sino1245,Eurasia,"Ozerov, Pavel",24.05,94.28,ELAR,...,CC BY-NC,10.34847/nkl.0dbazp8m,none,12,13015,23,12,13015,23,2015-2016
1,Yali (Apahapsili),apah1238,na,Nuclear Trans New Guinea,nucl1709,Papunesia,"Riesberg, Sonja",-4.08,139.46,TLA,...,CC BY-NC-SA,10.34847/nkl.9d91nkq2,all,14,10191,11,10,7474,8,2013-2017
2,Arapaho,arap1274,arp,Algic,algi1248,North America,"Cowell, Andrew",43.39,-108.81,ELAR,...,CC BY,10.34847/nkl.36f5r1b6,all,8,10407,21,4,4746,11,2005
3,Baïnounk Gubëeher,bain1259,bab,Atlantic-Congo,atla1278,Africa,"Cobbinah, Alexander Yao",12.31,-16.06,ELAR,...,CC BY,10.34847/nkl.a332abw8,some,10,12425,18,9,11477,17,2009-2014
4,Beja,beja1238,bej,Afro-Asiatic,afro1255,Africa,"Vanhove, Martine",17.24,36.67,CorpoAfroas,...,CC BY-NC,10.34847/nkl.edd011t1,all,5,15439,58,5,15439,58,2003-2011
5,Bora,bora1263,boa,Boran,bora1262,South America,"Seifart, Frank",-2.0,-72.26,TLA,...,CC BY,10.34847/nkl.6eaf5laq,all,39,29578,38,6,8431,9,2004-2008
6,Cabécar,cabe1245,cjp,Chibchan,chib1249,North America,"Quesada, Juan Diego and Skopeteas, Stavros and...",9.67,-83.41,TLA,...,CC BY-NC-ND,10.34847/nkl.ebc4ra22,all,15,17961,115,10,10614,39,2011
7,Cashinahua,cash1254,cbs,Pano-Tacanan,pano1259,South America,"Reiter, Sabine",-9.72,-71.17,TLA,...,CC BY,10.34847/nkl.a8f9q2f1,some,3,10043,3,3,10043,3,2006
8,Dolgan,dolg1241,dlg,Turkic,turk1311,Eurasia,"Däbritz, Chris Lasse and Kudryakova, Nina and ...",71.11,94.29,HZSK,...,CC BY-NC,10.34847/nkl.f09eikq3,all,20,18966,25,6,8778,9,1972-2010
9,Evenki,even1259,evn,Tungusic,tung1282,Eurasia,"Kazakevich, Olga and Klyachko, Elena",61.97,94.69,Siberian Lang,...,CC BY,10.34847/nkl.5e0d27cu,all,23,8315,36,23,8315,36,2006-2016


In [5]:
area = defaultdict(lambda : [])
for f,c in Counter(doreco_metadata.Family).most_common():
    glottocodes = list(doreco_metadata[doreco_metadata.Family==f].Glottocode)
    found_glottocodes = [g for g in glottocodes if g in df.doculect.unique()]
    area_f = Counter(doreco_metadata[doreco_metadata.Family==f].Area).most_common(1)[0][0]
    #print(f,c,len(found_glottocodes),area_f)
    best_glottocode = max(found_glottocodes, key = tot_wc.get, default=None)
    # sample doculect with most words per genus
    if sampling_method in {'genus', 'genus-area'}:
        if best_glottocode != None: 
            area[area_f].append(best_glottocode)
    elif sampling_method == 'convenience':
        area[area_f].extend(found_glottocodes)
sample = []
for k,v in area.items():
    if sampling_method == 'genus-area' : 
        best_v = sorted(v, key = tot_wc.get)[-3:]
        # if method is genus-are, sample 3 doculects with most words per area
    else : best_v = v
    print(k,len(v),best_v)
    sample += best_v
print('>>>', sampling_method, len(sample), sample)
sampled_docs = set(sample)

Papunesia 9 ['nisv1234', 'nort2875', 'orko1234', 'port1286', 'sout2856', 'teop1238', 'vera1241', 'apah1238', 'komn1238']
Eurasia 16 ['anal1239', 'sadu1234', 'sumi1235', 'yong1270', 'lowe1385', 'nort2641', 'sout3282', 'dolg1241', 'urum1249', 'jeha1242', 'pnar1238', 'sanz1248', 'taba1259', 'even1259', 'kama1351', 'svan1243']
Africa 9 ['beja1238', 'goem1240', 'goro1270', 'bain1259', 'ruul1235', 'kaka1265', 'kark1256', 'nngg1234', 'tsim1256']
South America 7 ['movi1243', 'savo1255', 'yura1255', 'resi1247', 'trin1278', 'bora1263', 'cash1254']
North America 5 ['arap1274', 'cabe1245', 'hoch1243', 'texi1237', 'yuca1254']
Australia 1 ['ngal1292']
>>> convenience 47 ['nisv1234', 'nort2875', 'orko1234', 'port1286', 'sout2856', 'teop1238', 'vera1241', 'apah1238', 'komn1238', 'anal1239', 'sadu1234', 'sumi1235', 'yong1270', 'lowe1385', 'nort2641', 'sout3282', 'dolg1241', 'urum1249', 'jeha1242', 'pnar1238', 'sanz1248', 'taba1259', 'even1259', 'kama1351', 'svan1243', 'beja1238', 'goem1240', 'goro1270'

In [6]:
df_sub = df[df.doculect.isin(sampled_docs)]
print('n Sampled tokens: %d | n Original Tokens: %d' % (len(df_sub), len(df)))

n Sampled tokens: 146821 | n Original Tokens: 146821


In [10]:
field_anchors = []
fields = []
for doc in sampled_docs:
    print(doc)
    markers = df_sub[df_sub.doculect == doc].marker.unique()
    for m in markers:
        if require_ft_present: field = np.array((df_sub['imputed_%s' % doc] == m) & (df_sub['ft_present_%s' % doc]))
        else: field = np.array((df_sub['imputed_%s' % doc] == m))
        if sum(field) == 0: print(doc, m); continue
        fields.append(field)
        field_anchors.append((doc,m))
fields = np.array(fields)
print('n Fields: %d' % len(fields))

anal1239
port1286
sadu1234
sout3282
goem1240
dolg1241
beja1238
nort2641
arap1274
komn1238
movi1243
savo1255
kaka1265
yuca1254
bain1259
yura1255
resi1247
cabe1245
sumi1235
bora1263
texi1237
yong1270
ruul1235
kama1351
trin1278
nisv1234
nisv1234 nan
ngal1292
ngal1292 nan
even1259
urum1249
tsim1256
taba1259
hoch1243
goro1270
pnar1238
jeha1242
jeha1242 nan
orko1234
kark1256
sanz1248
lowe1385
nort2875
sout2856
vera1241
cash1254
teop1238
svan1243
apah1238
nngg1234
n Fields: 9535


In [11]:
def jaccard_similarity(si,sj):
    return (len(si & sj) / len(si | sj))

def get_merged_fields(fields, field_anchors, jaccard_threshold):
    sets = [set(np.where(fields[i])[0]) for i in range(fields.shape[0])]
    with Pool(12) as p:
        similarities = p.starmap(jaccard_similarity, map(lambda c : (sets[c[0]],sets[c[1]]), combinations(range(fields.shape[0]),2)))
    similar_field_pairs = {c for c,sim in zip(combinations(range(fields.shape[0]), 2), similarities) if sim >= jaccard_threshold}
    #
    G = nx.Graph()
    G.add_nodes_from(np.arange(len(fields)))
    G.add_edges_from(similar_field_pairs)
    #
    new_fields, new_anchors = [], []
    for c in nx.connected_components(G):
        similarities_c = [jaccard_similarity(sets[i],sets[j]) for i,j in combinations(c, 2)]
        new_field = np.zeros(fields.shape[1]).astype(bool)
        new_field[list(set.union(*[set(np.where(fields[ci])[0]) for ci in c]))] = True
        new_fields.append(new_field)
        new_anchors.append([field_anchors[ci] for ci in c])
        #
        if len(c) > 1: print('n Terms: %d | avg Jaccard sim: %.3f | min Jaccard sim: %.3f | n terms: %d' %
                             (len(c), np.mean(similarities_c), min(similarities_c), sum(new_field)))
    new_fields = np.array(new_fields)
    print('n New Fields: %d | n Old Fields: %d' % (len(new_fields), len(fields)))
    return new_fields, new_anchors

new_fields, new_anchors = get_merged_fields(fields, field_anchors, jaccard_threshold)

n Terms: 12 | avg Jaccard sim: 0.926 | min Jaccard sim: 0.839 | n terms: 820
n Terms: 17 | avg Jaccard sim: 0.883 | min Jaccard sim: 0.762 | n terms: 3083
n Terms: 11 | avg Jaccard sim: 0.904 | min Jaccard sim: 0.831 | n terms: 2253
n Terms: 17 | avg Jaccard sim: 0.922 | min Jaccard sim: 0.858 | n terms: 1021
n Terms: 13 | avg Jaccard sim: 0.947 | min Jaccard sim: 0.876 | n terms: 864
n Terms: 7 | avg Jaccard sim: 0.927 | min Jaccard sim: 0.864 | n terms: 201
n Terms: 5 | avg Jaccard sim: 0.903 | min Jaccard sim: 0.882 | n terms: 425
n Terms: 11 | avg Jaccard sim: 0.911 | min Jaccard sim: 0.829 | n terms: 541
n Terms: 4 | avg Jaccard sim: 0.925 | min Jaccard sim: 0.865 | n terms: 513
n Terms: 23 | avg Jaccard sim: 0.900 | min Jaccard sim: 0.793 | n terms: 1644
n Terms: 3 | avg Jaccard sim: 0.964 | min Jaccard sim: 0.947 | n terms: 430
n Terms: 14 | avg Jaccard sim: 0.940 | min Jaccard sim: 0.884 | n terms: 645
n Terms: 21 | avg Jaccard sim: 0.930 | min Jaccard sim: 0.842 | n terms: 419

In [12]:
def short_to_long(data_short):
    doculects = data_short.doculect.unique()
    index = 0
    colnames = []
    matrix = np.zeros((len(data_short),0)).astype(int)
    for doc in map(lambda k: 'imputed_' + k, doculects):
        ct = {ei : i for i,ei in enumerate(filter(lambda k : isinstance(k, str), data_short[doc].unique()))}
        matrix = np.hstack([matrix, np.zeros((len(data_short),len(ct)))])
        for ei,i in ct.items():
            colnames.append(doc + '|' + ei)
            matrix[:,index+i] = (np.array(data_short[doc] == ei).astype(int))#[int(e==ei) for e in data_short[doc]]
        index += len(ct)
    data_long = pd.DataFrame(matrix, columns=colnames)
    return data_long

In [15]:
def get_pc1(fn):
    identifier = os.getpid()
    subprocess.call(['Rscript', 'exppca.R', fn, str(identifier), 'logisticSVD', str(1)])
    model = rdata.conversion.convert(rdata.parser.parse_file('%s.rda' % identifier))['model']
    subprocess.call(['rm', '%s.rda' % identifier])
    return model

In [19]:
pickle.dump((new_anchors,new_fields), open('%s/fields.p' % pickle_dir, 'wb'))

In [20]:
def parallel_pca(x, boolean):
    doc, marker = x
    if os.path.isfile('%s/pcaspace_%s_%s.p' % (pcaspace_dir, doc, marker)): 
        return
    df_short = df_sub[boolean]
    df_long = short_to_long(df_short)
    good_columns = df_long.columns[(df_long.sum(0) > 0) & (df_long.sum(0) < len(df_long))]
    if len(good_columns) == 0: return
    fn = '%s/termspace_%s_%s.csv' % (pcaspace_dir, doc, marker)
    df_long[good_columns].to_csv(fn, index=False)
    #
    try: model = get_pc1(fn)
    except: model = None
    #
    subprocess.call(['rm', '%s/termspace_%s_%s.csv' % (pcaspace_dir, doc, marker)])
    if model != None: 
        pickle.dump(model, open('%s/pcaspace_%s_%s.p' % (pcaspace_dir, doc, marker), 'wb'))
        print(doc, marker, model['prop_deviance_expl'][0], datetime.now())

try: os.mkdir(pcaspace_dir)
except: pass
with Pool(8) as p:
    p.starmap(parallel_pca, zip(map(lambda k : k[0], new_anchors), new_fields))

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted
Error: TridiagEigen: eigen decomposition failed
Execution halted
Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted
Error: TridiagEigen: eigen decomposition failed
Execution halted
Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted
Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted
Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <

ngal1292 h-bobon 0.2613770493916292 2024-03-25 14:08:49.257895


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


resi1247 ^ekaaka$ 0.1258335093578783 2024-03-25 14:08:53.986495
resi1247 htona 0.2968045076571354 2024-03-25 14:09:02.710268
ngal1292 -yenj 0.26683367172990013 2024-03-25 14:09:04.750987
resi1247 ^jipoji$ 0.3097988443717079 2024-03-25 14:09:05.346598
ruul1235 ruk 0.2753686378436554 2024-03-25 14:09:21.700682
ngal1292 -bu 0.30330695082720494 2024-03-25 14:09:22.464776
ngal1292 dje- 0.2664172576174655 2024-03-25 14:09:23.690231
ngal1292 ^nje 0.2377483781023807 2024-03-25 14:09:27.470077
yong1270 ^@tso$ 0.3510842103931292 2024-03-25 14:09:31.050917
ruul1235 oma 0.5275649745012491 2024-03-25 14:09:38.687974
tsim1256 ^gway$ 0.2532580020596823 2024-03-25 14:09:53.557832
tsim1256 she 0.2660914629127785 2024-03-25 14:09:57.645876
yong1270 ^le-tu-po-hw-dzo$ 0.19211237199440434 2024-03-25 14:09:57.649844
ngal1292 nginj$ 0.1759777680966934 2024-03-25 14:09:58.203470
tsim1256 anj 0.28557491740110186 2024-03-25 14:09:59.856333
ruul1235 a:ka$ 0.28041686702591684 2024-03-25 14:10:01.936239
yong1270 ^

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


kark1256 undu 0.3820944756200616 2024-03-25 14:11:02.371143
kark1256 ^burjil 0.29659107401076146 2024-03-25 14:11:08.522471
cabe1245 eu 0.17759581993431206 2024-03-25 14:11:09.115078
kark1256 ^kat 0.17038213241379097 2024-03-25 14:11:16.357426
kark1256 ^kasi$ 0.35133196826737434 2024-03-25 14:11:20.000319
tsim1256 adiy 0.5327762672843978 2024-03-25 14:11:22.094795
kark1256 idu 0.29193793953685787 2024-03-25 14:11:23.713955
tsim1256 siin 0.25945406957849415 2024-03-25 14:11:24.393210
kark1256 tii 0.31726533788054256 2024-03-25 14:11:26.219188
kark1256 ^ja 0.268722790265182 2024-03-25 14:11:27.611445


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


ngal1292 ^mey 0.2895029285734806 2024-03-25 14:11:31.016642
kark1256 ^kanandura$ 0.4216339195899429 2024-03-25 14:11:31.555070
tsim1256 ey 0.14896821490194057 2024-03-25 14:11:36.814949
tsim1256 jiil 0.4220980608266005 2024-03-25 14:11:46.589942
tsim1256 gon 0.3232365486326242 2024-03-25 14:11:52.039278
tsim1256 ^gelaseew$ 0.42717525844765036 2024-03-25 14:11:53.252458
tsim1256 quwa 0.32101571369566595 2024-03-25 14:12:02.045522
tsim1256 are 0.5841840746679788 2024-03-25 14:12:03.310051
cabe1245 bata$ 0.2898302214834131 2024-03-25 14:12:05.871940
tsim1256 ^qada 0.302637259653351 2024-03-25 14:12:08.866878
tsim1256 dah 0.22229080922589461 2024-03-25 14:12:10.125658
tsim1256 emo 0.3192527126838397 2024-03-25 14:12:11.367978
kark1256 ^kula 0.43735543735526805 2024-03-25 14:12:12.551535
tsim1256 ^nadeeg$ 0.38896172609902224 2024-03-25 14:12:12.560176
tsim1256 ^law 0.26077823692782165 2024-03-25 14:12:14.547324
kark1256 ^kam 0.3981778630813322 2024-03-25 14:12:15.346794
tsim1256 ^mang' 0.33

In logisticSVD(my_data, k = as.integer(args[4])) :
  Algorithm ran 1000 iterations without converging.
              You may want to run it longer.


urum1249 ^aldi 0.3603538355699245 2024-03-25 14:15:55.154014
ruul1235 ^okugusja:tukiraku$ 0.23417315427064422 2024-03-25 14:16:00.895360
even1259 huru 0.24674266207466655 2024-03-25 14:16:02.968115
even1259 ^suru 0.3343461729220004 2024-03-25 14:16:04.150044
urum1249 ^galdi$ 0.23278151306598271 2024-03-25 14:16:19.689111
orko1234 ^enten$ 0.40677286577343696 2024-03-25 14:16:29.914569
orko1234 ^man$ 0.34497414554889183 2024-03-25 14:16:34.292682
even1259 ^n'ene 0.2528874724548287 2024-03-25 14:16:41.110683
even1259 ^girku 0.22206965912182297 2024-03-25 14:16:43.966212
orko1234 ^em$ 0.27882975789592324 2024-03-25 14:16:45.308724
orko1234 ^siem$ 0.5809872466361702 2024-03-25 14:16:47.061852
orko1234 tee 0.21625562300391787 2024-03-25 14:16:49.664119
hoch1243 ^wagagaxire$ 0.1487608666558291 2024-03-25 14:16:50.223750
even1259 ^iu 0.2957203999469904 2024-03-25 14:16:51.627766
even1259 sty 0.2718750175422697 2024-03-25 14:16:52.984784
hoch1243 iji 0.2553496105788875 2024-03-25 14:16:53.83738

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


urum1249 ^guzal 0.260443060452197 2024-03-25 14:19:21.178809
sanz1248 ^deq'a$ 0.19469317506845496 2024-03-25 14:19:22.673486
urum1249 ^gurban 0.20985771442265277 2024-03-25 14:19:23.064750
sanz1248 ^ssussu 0.2762756145765055 2024-03-25 14:19:23.939136
sanz1248 ^tsara 0.4033014801019399 2024-03-25 14:19:25.151841
kark1256 ^wot 0.3853322038380578 2024-03-25 14:19:26.307309
sanz1248 ^kh1urm 0.2754337793594567 2024-03-25 14:19:26.443324


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sanz1248 ^kurs 0.1554073597243797 2024-03-25 14:19:31.358134
sanz1248 ^ts1ialdarkkiiab$ 0.2752319943924475 2024-03-25 14:19:35.163291
sanz1248 ^qus 0.32912908758157344 2024-03-25 14:19:36.464214
orko1234 ^we$ 0.3833887457157885 2024-03-25 14:19:38.959464
orko1234 ^obwer$ 0.3026301929739238 2024-03-25 14:19:40.236109
urum1249 ^old 0.22590493962187097 2024-03-25 14:19:43.306806
sanz1248 ^mis 0.13328879235739133 2024-03-25 14:19:43.827668
orko1234 ^isan$ 0.2448588442850247 2024-03-25 14:19:44.447099


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


kark1256 ^wat 0.2555805697241278 2024-03-25 14:19:46.569778
orko1234 ke$ 0.23778232421663403 2024-03-25 14:19:46.833679
urum1249 ^isti 0.21630863024750935 2024-03-25 14:19:47.623606
sanz1248 ^turag'adashcheri$ 0.16189430150681872 2024-03-25 14:19:47.730513
kark1256 ^de 0.28047827048276064 2024-03-25 14:19:48.647694
orko1234 or$ 0.2922118411553577 2024-03-25 14:19:48.754991
sanz1248 ^dukh'ab$ 0.2699775092780341 2024-03-25 14:19:48.971349
kark1256 doni$ 0.4974430769017112 2024-03-25 14:19:50.018887
kark1256 ungur 0.34095850843867925 2024-03-25 14:19:52.190700
urum1249 ^ista 0.3937044480905272 2024-03-25 14:19:52.764200
kark1256 ^fili 0.557309439801426 2024-03-25 14:19:53.380957
urum1249 ^air 0.802749305259754 2024-03-25 14:19:53.889867
kark1256 ^dung 0.74964417367821 2024-03-25 14:19:54.466096
urum1249 lica 0.6842347576009143 2024-03-25 14:19:55.118028
sanz1248 ikuri 0.9855252954883859 2024-03-25 14:19:56.169862
urum1249 ^tur 0.3070863174052316 2024-03-25 14:19:57.311212
urum1249 ^gretsi

In logisticSVD(my_data, k = as.integer(args[4])) :
  Algorithm ran 1000 iterations without converging.
              You may want to run it longer.


kark1256 ^sima 0.30290395023036587 2024-03-25 14:20:24.182159
urum1249 ^balkalari$ 0.13774078483206564 2024-03-25 14:20:26.455341
hoch1243 ^hocici 0.38522155620765275 2024-03-25 14:20:26.478995
hoch1243 isik 0.6098678476701429 2024-03-25 14:20:27.695605
orko1234 ele 0.3108646158418965 2024-03-25 14:20:28.637155
hoch1243 ^haastik$ 0.4016526333621516 2024-03-25 14:20:28.895763
orko1234 ^fwerfaar$ 0.34899424256407674 2024-03-25 14:20:30.321592
hoch1243 ^zuura 0.36085222835123854 2024-03-25 14:20:31.709096
even1259 ^provalivat'sia$ 0.12835359146325642 2024-03-25 14:20:31.745901
orko1234 byosol$ 0.3592496122203982 2024-03-25 14:20:33.016867
orko1234 ^mwesengesenge$ 0.2897597250333984 2024-03-25 14:20:34.959191
orko1234 ^jioj$ 0.42239242547936473 2024-03-25 14:20:36.492300
orko1234 ^skul$ 0.602792050149759 2024-03-25 14:20:37.689032
orko1234 taala$ 0.8635756655061234 2024-03-25 14:20:38.843531
kark1256 ^buul 0.25490957799507297 2024-03-25 14:20:40.995945
kark1256 ^nabbah$ 0.4535601011005306 

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sanz1248 ^hext: 0.11726343104091486 2024-03-25 14:22:48.377959
tsim1256 ^qareed$ 0.3559169184951745 2024-03-25 14:22:55.647749
tsim1256 yam$ 0.19505102833725996 2024-03-25 14:23:00.191400
tsim1256 ^gumd$ 0.3461801185394382 2024-03-25 14:23:02.698048
tsim1256 ^gedig$ 0.25952189139376536 2024-03-25 14:23:06.117389
tsim1256 ^qweng 0.6412872062210122 2024-03-25 14:23:07.358090
tsim1256 wo 0.3402206207619496 2024-03-25 14:23:11.251179
sanz1248 ^ab 0.17511118808631643 2024-03-25 14:23:13.166216
tsim1256 ^aqway 0.3873289371069827 2024-03-25 14:23:14.256477
sanz1248 kari 0.11666974563814703 2024-03-25 14:23:15.926272
orko1234 ^ol$ 0.14195418291607265 2024-03-25 14:23:16.207825
tsim1256 mang 0.2606464787250701 2024-03-25 14:23:17.038194
sanz1248 ^t1 0.4670564969761035 2024-03-25 14:23:17.107616
orko1234 ^wowa$ 0.14301773128932072 2024-03-25 14:23:17.815134
orko1234 ^bul 0.31130170161636683 2024-03-25 14:23:19.035705
sanz1248 uq 0.1890404386473883 2024-03-25 14:23:19.132673
tsim1256 jood 0.30134

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


urum1249 ^vera 0.6626700258840874 2024-03-25 14:24:21.236185
sanz1248 ik' 0.16524104877433798 2024-03-25 14:24:24.143867
even1259 ^ise 0.3066466200876071 2024-03-25 14:24:26.045291
urum1249 ^egi 0.4495559973263554 2024-03-25 14:24:26.370572
urum1249 ^avda 0.6017371389873226 2024-03-25 14:24:27.620451
kama1351 ^ibi 0.1499697545957258 2024-03-25 14:24:28.143816
urum1249 ^ava$ 0.4088917862208735 2024-03-25 14:24:28.895802
sanz1248 ik1v 0.16676707995195594 2024-03-25 14:24:28.957223
sanz1248 ^gurabeg'iblev$ 0.14522011892218345 2024-03-25 14:24:29.194770
sanz1248 ^kh1iadurg'ittikhuble$ 0.4442900648046342 2024-03-25 14:24:30.272074
sanz1248 ?ib 0.19976860496900295 2024-03-25 14:24:30.309414
orko1234 ^melee$ 0.2816539023305199 2024-03-25 14:24:31.085807
sanz1248 k1ul$ 0.7138218720113738 2024-03-25 14:24:31.450240
urum1249 ^ak 0.215333799260916 2024-03-25 14:24:31.689173
sanz1248 ikka 0.37232791130683496 2024-03-25 14:24:32.709054
urum1249 ^peinir$ 0.5820240576896774 2024-03-25 14:24:33.018155

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sanz1248 burs 0.24756440015067416 2024-03-25 14:25:42.729702
sout2856 ^sprei$ 0.306829790453096 2024-03-25 14:25:43.501567
sanz1248 ^ila$ 0.3400030201176515 2024-03-25 14:25:44.072249
even1259 ^kik$ 0.42928208466751994 2024-03-25 14:25:44.238087
sout2856 ^tilmori$ 0.34397634134441935 2024-03-25 14:25:46.256837


Error: TridiagEigen: eigen decomposition failed
Execution halted


sout2856 ^kafman$ 0.6833804687600367 2024-03-25 14:25:48.710074
sanz1248 ^?a?h 0.31145221864159045 2024-03-25 14:25:48.988823
kama1351 ^de 0.28503772584645437 2024-03-25 14:25:49.175908
sout2856 sir 0.43436910736820533 2024-03-25 14:25:50.069790


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sanz1248 1iakh 0.40472620578388885 2024-03-25 14:25:51.452874
sout2856 msak 0.33460707940555645 2024-03-25 14:25:51.544923
kama1351 ^kumbi 0.3870844328414643 2024-03-25 14:25:51.613438
sout2856 ^namlas$ 0.8083490607430551 2024-03-25 14:25:52.646751
kama1351 ^embi 0.21613511818693643 2024-03-25 14:25:53.851279
sout2856 ^nawi$ 0.6752145920439944 2024-03-25 14:25:53.889573
sout2856 kal$ 0.24832484228614826 2024-03-25 14:25:55.197519
goro1270 ^ina$ 0.23944523800520523 2024-03-25 14:25:55.549637
sout2856 ^asle 0.3097473182303001 2024-03-25 14:25:57.405668
goro1270 ^ay$ 0.13594263423815278 2024-03-25 14:25:57.605715
sout2856 ^los$ 0.5049625114841112 2024-03-25 14:25:58.588995
kama1351 ^ser 0.2997327936399776 2024-03-25 14:25:58.598144
goro1270 ^iimi$ 0.5973690717122856 2024-03-25 14:25:58.877793
sout2856 ^afsak$ 0.36192279228729884 2024-03-25 14:26:00.428909
kama1351 ^en 0.19215850488011943 2024-03-25 14:26:00.607880
orko1234 ^graunem$ 0.2253348251596109 2024-03-25 14:26:01.604691
sout2856 p

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


even1259 ^navekh 0.36614826289218527 2024-03-25 14:26:08.491238
even1259 ^sita 0.3730444943161314 2024-03-25 14:26:10.179594
kama1351 ^nuk 0.32812863820570204 2024-03-25 14:26:10.522797
sout2856 lao$ 0.23640441761748543 2024-03-25 14:26:11.119075
sanz1248even1259 ^bar$  ^ilkend0.2772425715231829  0.362930960706655542024-03-25 14:26:12.716074 
2024-03-25 14:26:12.716866
sout2856 tpolu$ 0.3054401806779675 2024-03-25 14:26:12.898144
orko1234 fyang$ 0.2989115469823135 2024-03-25 14:26:13.440045
sanz1248 sout2856^bare  ^nlag0.4976242665624785  0.289906047868347842024-03-25 14:26:14.596652 
2024-03-25 14:26:14.597888
kama1351 ^ne 0.6273618225393995 2024-03-25 14:26:15.868674
sanz1248 ari$ 0.30762843864088585 2024-03-25 14:26:16.138080
orko1234 mela$ 0.23386998033883233 2024-03-25 14:26:16.307559
sout2856 ^apu$ 0.2866585555048585 2024-03-25 14:26:16.988444
orko1234 ron$ 0.3657574726741696 2024-03-25 14:26:17.544011
kama1351 ^ma? 0.20309107866613618 2024-03-25 14:26:18.006567
sout2856 palus$ 0

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


kama1351 ^u? 0.2352410173738544 2024-03-25 14:26:24.229829
goro1270 ^oo$ 0.18070961065975766 2024-03-25 14:26:25.462967
kama1351 ^ner 0.5391259943668933 2024-03-25 14:26:25.610635
kama1351 ^tara 0.34660919093754783 2024-03-25 14:26:27.055258
kama1351 ^kuro 0.312905278699416 2024-03-25 14:26:28.976011
sanz1248 ^bakh1ts1atse$ 0.3020930337411595 2024-03-25 14:26:29.955867
kama1351 ^talaru?pi$ 0.2697989104250259 2024-03-25 14:26:30.850021
taba1259 us 0.17040134081293745 2024-03-25 14:26:30.856291
even1259 ^chainikilve$ 0.18151217997943336 2024-03-25 14:26:31.054667
orko1234 fwier$ 0.29576652455937635 2024-03-25 14:26:31.292474
kama1351 ^uz@bi 0.8514783659968912 2024-03-25 14:26:32.074473
orko1234 ku$ 0.20172572901094366 2024-03-25 14:26:32.960739
orko1234 ^maler$ 0.7199157275394925 2024-03-25 14:26:34.072131
taba1259 udu 0.1285953028597665 2024-03-25 14:26:34.750906
sout2856 ^nskau$ 0.12233794252372487 2024-03-25 14:26:35.748678


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sout2856 metpak 0.3357308047393964 2024-03-25 14:26:38.078492
taba1259 rak 0.23012529547764482 2024-03-25 14:26:38.907672
orko1234 ^abare$taba1259  0.2537111781974928ru?ru$  2024-03-25 14:26:40.3830220.2189230173938319
 2024-03-25 14:26:40.384593
even1259 ^bald 0.2870397055609978 2024-03-25 14:26:40.659528
sout2856 matur$ 0.4475710582892388 2024-03-25 14:26:41.387933
even1259 ^abachi 0.40131782065883026 2024-03-25 14:26:42.100567
orko1234 ^bililen$ 0.35380397746398595 2024-03-25 14:26:42.394296


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


taba1259 ^rac$ 0.4654577523412504 2024-03-25 14:26:43.296301
even1259 ^man'i 0.6696581469527685 2024-03-25 14:26:43.304823
sout2856 smok$ 0.5814805385327706 2024-03-25 14:26:43.644610
sout2856 ^imo 0.49921625738779163 2024-03-25 14:26:44.835209
orko1234 ^imiel$ 0.43118412825494956 2024-03-25 14:26:45.323227
kama1351 ^il$ 0.2133406539899505 2024-03-25 14:26:45.789314
orko1234 isi 0.307952538846225 2024-03-25 14:26:46.714749
sout2856 ^naal$ 0.19687153607883234 2024-03-25 14:26:47.318451
even1259 ^goro 0.3230103470649146 2024-03-25 14:26:47.686695
orko1234 ^bilok$ 0.2601283661718118 2024-03-25 14:26:47.963578
sout2856 pnak 0.5618202864360073 2024-03-25 14:26:48.513296
even1259 ^vremia$ 0.21586666440700109 2024-03-25 14:26:49.631715
sout2856 ^nalag$ 0.32954617430522903 2024-03-25 14:26:50.359872
sanz1248 kun 0.28979876802502336 2024-03-25 14:26:50.639293
even1259 ^davno$ 0.42313758879558616 2024-03-25 14:26:50.873771
sout2856 atol$ 0.5929670427206106 2024-03-25 14:26:51.577019
sanz1248 k:u

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


vera1241 'aram$ 0.29358070289749305 2024-03-25 14:27:11.704043
vera1241 ^kaka$ 0.3108796830920578 2024-03-25 14:27:13.670769
vera1241 ^dar$ 0.4034210992159133 2024-03-25 14:27:14.971208
sanz1248 ^qili 0.37772552151113414 2024-03-25 14:27:20.032745
vera1241 ^'uwame 0.32249790754601715 2024-03-25 14:27:20.362375
sanz1248 ^kh'al 0.3968536983934171 2024-03-25 14:27:21.249423
kama1351 ^al 0.4174240726521786 2024-03-25 14:27:22.374143
vera1241 ^vag 0.2489925934352637 2024-03-25 14:27:23.241779
sout2856 egsaki$ 0.09066688616500496 2024-03-25 14:27:23.921862
taba1259 ^ru?r 0.15645308692468907 2024-03-25 14:27:24.704264
vera1241 ^masogi 0.3097002153521349 2024-03-25 14:27:24.835583
kama1351 ^azitt@$ 0.5042065315599769 2024-03-25 14:27:24.850491
sout2856 ^stokwip$ 0.41083564351603774 2024-03-25 14:27:25.259752
kama1351 g@?$ 0.6911290806000381 2024-03-25 14:27:26.028737
sout2856 ^nait$ 0.5045439141101467 2024-03-25 14:27:26.513274
kama1351 tob 0.30335279286440753 2024-03-25 14:27:27.329582
sout28

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


vera1241 ^raw$ 0.14442505217701218 2024-03-25 14:27:29.179402
sout2856 ^naflak$ 0.2632448501015603 2024-03-25 14:27:29.391616
kama1351 ^jama?i$ 0.4302640130676726 2024-03-25 14:27:29.589466
sout2856 ^naul$ 0.27959788794884544 2024-03-25 14:27:30.910329
vera1241 ^qo$ 0.2896594262650335 2024-03-25 14:27:31.690349
kama1351 ^i?b 0.25116102130402207 2024-03-25 14:27:32.176301
taba1259 ^duf 0.13795919358975184 2024-03-25 14:27:32.525067
vera1241 ^too$ 0.27962890398893925 2024-03-25 14:27:33.362841
kama1351 ^sa:m 0.6685951917453505 2024-03-25 14:27:33.633402
taba1259 ^raf 0.15086541084412963 2024-03-25 14:27:34.012392
vera1241 ^wak 0.6769222103574976 2024-03-25 14:27:34.634201
kama1351 ^sam 0.42215454738654745 2024-03-25 14:27:34.733449


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


vera1241 ^wor 0.215923240144402 2024-03-25 14:27:36.457506
even1259 rgani 0.22113471533601048 2024-03-25 14:27:36.474689
orko1234 ^ruen$ 0.08291699106928041 2024-03-25 14:27:37.096725
kama1351 ^kubi?i$ 0.29172392024611793 2024-03-25 14:27:37.317692
vera1241 ^buskat$ 0.3827028863464771 2024-03-25 14:27:37.838175
taba1259 ^jiz$ 0.24736985487880503 2024-03-25 14:27:38.273919


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


cash1254 ^nane$ 0.35437252692148646 2024-03-25 14:27:40.502528
cash1254 ^mutsa 0.29965883336448407 2024-03-25 14:27:41.746809
vera1241 ^bo 0.17014505206430652 2024-03-25 14:27:41.896049
even1259 ^tegemi 0.34089617174736475 2024-03-25 14:27:41.905947
orko1234 ^tuwu$ 0.41054455598258466 2024-03-25 14:27:42.668715
cash1254 ^dake$ 0.5735795815648574 2024-03-25 14:27:42.982006
even1259 girku 0.31542342151039016 2024-03-25 14:27:44.093957
cash1254 ^netsu 0.33583526950353904 2024-03-25 14:27:44.169113
orko1234 ^lebak$ 0.42629601022313446 2024-03-25 14:27:44.258465
lowe1385 ^gron 0.4144352361094952 2024-03-25 14:27:44.810571
vera1241 ^'aluwo$ 0.3991906314312189 2024-03-25 14:27:45.073415
cash1254 ^mae$ 0.18498433657105884 2024-03-25 14:27:48.351072
vera1241 itok$ 0.43367945634590865 2024-03-25 14:27:49.056925
cash1254 ^mex 0.3479900647128341 2024-03-25 14:27:51.066663
vera1241 ^we 0.30988701320222833 2024-03-25 14:27:51.225591
lowe1385 men 0.22625528039461418 2024-03-25 14:27:51.585117
kark125

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sanz1248 ^kkalkk 0.507827309959968 2024-03-25 14:28:04.195264
vera1241 ^raga$ 0.7701788639157265 2024-03-25 14:28:04.239557
vera1241 ^lie$ 0.23850998590738526 2024-03-25 14:28:05.834740


Error: TridiagEigen: eigen decomposition failed
Execution halted


cash1254 ^kaman$ 0.41468680910177724 2024-03-25 14:28:07.322646
vera1241 ^bur$ 0.3570576612760762 2024-03-25 14:28:08.166575
kama1351 ^kubi 0.42228346316732757 2024-03-25 14:28:08.592605
sanz1248 ^?a?c 0.3824471280059004 2024-03-25 14:28:09.015336
sanz1248 iach 0.5862403942202155 2024-03-25 14:28:10.429039
kama1351 ^kuli 0.2225699306144595 2024-03-25 14:28:11.188513
vera1241 ^nev$ 0.35360504925514 2024-03-25 14:28:11.434149
kama1351 ^kuio 0.31633245681003697 2024-03-25 14:28:12.507837
sanz1248 ^luk 0.18212792175408354 2024-03-25 14:28:13.315542
vera1241 ^vovo'$ 0.1619471900067494 2024-03-25 14:28:13.654170
kama1351 nd@r 0.41171897790433 2024-03-25 14:28:13.683234
kama1351 ^det 0.9932759740838293 2024-03-25 14:28:14.743078
sanz1248 illa 0.27616452708472794 2024-03-25 14:28:15.569702
kama1351 ^mei 0.24557486875256385 2024-03-25 14:28:16.839497
sanz1248 ^g'an 0.2810373155859248 2024-03-25 14:28:17.143117
sanz1248 ^ruc 0.473982849632826 2024-03-25 14:28:18.465361
vera1241 ^gunu 0.462292921

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


even1259 -do$ 0.22286405512851437 2024-03-25 14:31:28.745360
vera1241 ^som$ 0.2366936986372321 2024-03-25 14:31:34.552944
vera1241 ^womomo 0.8218043890362619 2024-03-25 14:31:35.680777
sanz1248 ^ikht 0.20857956615672402 2024-03-25 14:31:36.600370
vera1241 ^bin 0.3327372207418583 2024-03-25 14:31:37.272711
even1259 ^emev 0.19759812722692993 2024-03-25 14:31:37.724967
vera1241 ^lan 0.20894862000961267 2024-03-25 14:31:38.915649
even1259 ^emep 0.2873996892367833 2024-03-25 14:31:39.155068
kama1351 ^d'ala 0.2683816713172208 2024-03-25 14:31:39.528601
vera1241 ^kurkur$ 0.40619382973945317 2024-03-25 14:31:40.081010
kama1351 d'an$ 0.27781247150805644 2024-03-25 14:31:40.722647
teop1238 ^ka 0.1710242846528689 2024-03-25 14:31:42.090447
kama1351 ^bun@$ 0.42091462393290324 2024-03-25 14:31:42.282729
even1259 ^ulgu 0.2742012487832909 2024-03-25 14:31:43.228311
kama1351 ^d'aga 0.5049261369390305 2024-03-25 14:31:43.509228
vera1241 ^sisidin$ 0.3297052918919492 2024-03-25 14:31:45.457464
even1259 ^

Error: TridiagEigen: eigen decomposition failed
Execution halted


kama1351 @gam$ 0.34543623282820657 2024-03-25 14:32:10.464891
sanz1248vera1241  ^khkhul^'ev'e  0.26684958824562790.26825929678765337  2024-03-25 14:32:11.5787432024-03-25 14:32:11.578661

kama1351 ^mimbi$ 0.38335827070841666 2024-03-25 14:32:11.834104
vera1241 ^dudu$ 0.3681865606003395 2024-03-25 14:32:13.318298
even1259 ^buru 0.2775327576393962 2024-03-25 14:32:13.431355
sanz1248 ^duc 0.35655316399161774 2024-03-25 14:32:14.408098


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


svan1243 ^ceur 0.37442273146471705 2024-03-25 14:32:15.710074
kama1351 ^ming 0.29547434860526267 2024-03-25 14:32:15.742097
vera1241 ^gaga 0.17156490684041936 2024-03-25 14:32:15.837326
sanz1248 ^rurssi 0.5030026391619868 2024-03-25 14:32:15.921874
svan1243 ^uesap 0.4008461877322871 2024-03-25 14:32:16.871353
kama1351 n'ergo 0.3398651729367652 2024-03-25 14:32:17.170454
sanz1248 ^urkh:ab$ 0.33670633835488173 2024-03-25 14:32:17.676946
vera1241 ^se'$ 0.32932600858595507 2024-03-25 14:32:18.034609
kama1351 ^eju 0.7405873024704123 2024-03-25 14:32:18.287779
sanz1248 ^maladu 0.3527601837213905 2024-03-25 14:32:19.702892
vera1241 ubu 0.298867028382082 2024-03-25 14:32:20.257693
kama1351 ^ozer 0.35714083652629025 2024-03-25 14:32:22.416579
sanz1248 ^g'u 0.3045343486969785 2024-03-25 14:32:22.475235
kama1351 ^kamn 0.2911651928532768 2024-03-25 14:32:23.675509
sanz1248 is:u 0.3814745842410219 2024-03-25 14:32:24.332517
kama1351 ^kaml 0.3294262546959107 2024-03-25 14:32:25.484778
sanz1248 ^wa?w

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


kama1351 ^numo 0.3322975784291581 2024-03-25 14:32:47.194919
vera1241 ^woqa'a 0.6792214902593905 2024-03-25 14:32:47.235194
teop1238 ^ma$ 0.113027505361877 2024-03-25 14:32:48.441060
vera1241 ^wov 0.3620138745641869 2024-03-25 14:32:48.555928
sanz1248 ^deb 0.12653997412732143 2024-03-25 14:32:48.629309
kama1351 ^maja 0.23404878354808456 2024-03-25 14:32:49.323469
vera1241 ^milinsal$ 0.44841548294496447 2024-03-25 14:32:49.769530
sanz1248 ^k'ar 0.3608793069601778 2024-03-25 14:32:50.013587
teop1238 ^tavus 0.12302610633803601 2024-03-25 14:32:50.073292
vera1241 ^'uwamere$ 0.4653614036336491 2024-03-25 14:32:50.930252
sanz1248 mg' 0.5857528036420652 2024-03-25 14:32:51.143027
kama1351 ^gor 0.3601897175210277 2024-03-25 14:32:51.239952
teop1238 mau$ 0.23164131691236012 2024-03-25 14:32:51.263989
vera1241 momse$ 0.31584956924726293 2024-03-25 14:32:52.078575
kama1351 ^d'elamd@$ 0.4835128183258903 2024-03-25 14:32:52.447030
teop1238 ^vahus 0.2558117303855405 2024-03-25 14:32:52.487424
sanz12

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sanz1248 ^bartst 0.3510405205062792 2024-03-25 14:32:54.360213
nngg1234 ^!'haun 0.4755740040714601 2024-03-25 14:32:55.051016
vera1241 aq$ 0.5752276041977042 2024-03-25 14:32:55.126698
teop1238 ^voos 0.5816530936211637 2024-03-25 14:32:55.144822
teop1238 koto$ 0.26243836332878767 2024-03-25 14:32:56.458712
teop1238 ^egu$ 0.30667514985668143 2024-03-25 14:32:56.605794
sanz1248 nk1 0.3477953024153908 2024-03-25 14:32:56.831947
sanz1248 ^bec 0.5603350891000369 2024-03-25 14:32:57.987768
vera1241 ^rere$ 0.27272229295454453 2024-03-25 14:32:58.040247
lowe1385 star 0.24663959015173953 2024-03-25 14:32:59.238925
sanz1248 ^ssik 0.41168514070035456 2024-03-25 14:32:59.276358
vera1241 sin$ 0.27737504141950564 2024-03-25 14:32:59.481228
sanz1248 ^tukhum 0.27541051053195476 2024-03-25 14:33:00.684557
vera1241 ^'ibie$ 0.4059776433204788 2024-03-25 14:33:00.773557
vera1241 ^meter 0.8131865699489499 2024-03-25 14:33:01.959989
sanz1248 matsts 0.2552229164840155 2024-03-25 14:33:02.702303


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


nngg1234 ^ain$ 0.31787020867834415 2024-03-25 14:33:03.944673
vera1241 egen$ 0.4480801881019575 2024-03-25 14:33:04.287303
sanz1248 rm 0.18386245921576783 2024-03-25 14:33:05.012898
vera1241 ^le$ 0.40408719147541927 2024-03-25 14:33:05.561734
sanz1248 ^wac'ac 0.2914208331246517 2024-03-25 14:33:06.328072
vera1241 ^ruso$ 0.23265121696541158 2024-03-25 14:33:07.513268


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sanz1248 anda$ 0.28887297251482646 2024-03-25 14:33:08.658954
vera1241 ^'o$ 0.2992321922865304 2024-03-25 14:33:08.942773
sanz1248 ^q:ap 0.4796656825103338 2024-03-25 14:33:09.829152
vera1241 'aman$ 0.43705631204338624 2024-03-25 14:33:10.264152
sanz1248 ^k'v 0.36155785615231784 2024-03-25 14:33:13.358584
vera1241 ^malare$ 0.2743594161977596 2024-03-25 14:33:14.301408
vera1241 ^busu 0.316906846660467 2024-03-25 14:33:15.841617
vera1241 ^wobin$ 0.4678478261040008 2024-03-25 14:33:17.171047
vera1241 ^nanara$ 0.7490999292065383 2024-03-25 14:33:18.351789
vera1241 ^me'esala$ 0.27471072936222884 2024-03-25 14:33:19.900526
vera1241 ^'ul$ 0.17815201267418213 2024-03-25 14:33:21.676531
vera1241 ^lalne'ak$ 0.42748016440693315 2024-03-25 14:33:22.937465
vera1241 ^lavet$ 0.5548603078540297 2024-03-25 14:33:24.292822
vera1241 ^gis$ 0.18796527697818155 2024-03-25 14:33:24.422975
sanz1248 ^kkvibakh'nala$ 0.15819938049466797 2024-03-25 14:33:24.947972
vera1241 ^seqeg$ 0.5056615472081537 2024-03-25 14

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


nngg1234 ^|aa 0.33165869265576786 2024-03-25 14:38:15.463993
cash1254 ^tad 0.5368741357078076 2024-03-25 14:38:15.562624
nngg1234 ^|oa 0.3735294291215293 2024-03-25 14:38:16.631193
cash1254 ^disi 0.21076655732981908 2024-03-25 14:38:16.708120
nngg1234 g|=aru 0.39698096085399026 2024-03-25 14:38:17.702944
lowe1385 ^bom 0.3748472599082868 2024-03-25 14:38:17.838965
cash1254 ^ikidan 0.1866235294812124 2024-03-25 14:38:17.950162
nngg1234 ^ska 0.8147786020383453 2024-03-25 14:38:18.654454
cash1254 ^bekan 0.31154063501215057 2024-03-25 14:38:19.030183
lowe1385 ^pjenk 0.370901637271871 2024-03-25 14:38:19.056894
nngg1234 ^kiin 0.5911104178804978 2024-03-25 14:38:19.704492
nngg1234 ^g|uu 0.2571047720290689 2024-03-25 14:38:22.736025
nngg1234 ^tyaa$ 0.37507857400931166 2024-03-25 14:38:24.272124
cash1254 ^iyu 0.15137929194756417 2024-03-25 14:38:25.936421
lowe1385 ^der$ 0.1795584823114681 2024-03-25 14:38:26.497346
nngg1234 ^dyaba$ 0.23451150442201885 2024-03-25 14:38:27.291982
lowe1385 ^motor$

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


teop1238 ikira$ 0.29192159220335245 2024-03-25 14:39:29.835955
vera1241 lak 0.26736812671061083 2024-03-25 14:39:32.359310
vera1241 suwo$ 0.31243946185955895 2024-03-25 14:39:33.429548
vera1241 ^bul 0.3362109984149041 2024-03-25 14:39:34.723232
vera1241 vulu 0.4037395672958848 2024-03-25 14:39:35.917323
vera1241 ^muru 0.41986373401679034 2024-03-25 14:39:36.952595
nngg1234 ^tsaa 0.44108520256162265 2024-03-25 14:39:39.764391
nngg1234 x'ora$ 0.32282980508371406 2024-03-25 14:39:40.989325
nngg1234 ^|xoo 0.3362246422666072 2024-03-25 14:39:42.185373
teop1238 ^bon$ 0.3518412015900697 2024-03-25 14:39:48.620090
teop1238 ^peho$ 0.4033214280401586 2024-03-25 14:39:49.773238
lowe1385 ^dom 0.3684926045481387 2024-03-25 14:39:51.202022
teop1238 ^nabunuu$ 0.3228127296191654 2024-03-25 14:39:55.042949
teop1238 ^aba 0.3007361997803415 2024-03-25 14:40:02.972721
teop1238 ^tootoo$ 0.6239398463897889 2024-03-25 14:40:04.162699
apah1238 ^wa 0.15367892661090488 2024-03-25 14:40:06.894888
lowe1385 ^chrom

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


apah1238 ^iri 0.34591068736783437 2024-03-25 14:40:35.054032
nngg1234 ^@unn$ 0.3128240162401452 2024-03-25 14:40:36.136528
cash1254 ^dais 0.4070433120088809 2024-03-25 14:40:36.417910
teop1238 ^naono$ 0.37267063396814126 2024-03-25 14:40:36.609091
teop1238 ^bai$ 0.4291562830217762 2024-03-25 14:40:37.733794
nngg1234 ^|=oa$ 0.35113921254084757 2024-03-25 14:40:37.749721
teop1238 ^pauna$ 0.44893599114308813 2024-03-25 14:40:39.528686
nngg1234 ^|'huun 0.13126383526725283 2024-03-25 14:40:40.319139
teop1238 ^mosi$ 0.2543397758583571 2024-03-25 14:40:41.025283
lowe1385 ^nan 0.39350584310401915 2024-03-25 14:40:41.144850
nngg1234 ^!qx'abesi$ 0.7088003922293711 2024-03-25 14:40:41.252095
teop1238 ^pee 0.41921912093402636 2024-03-25 14:40:42.108660
nngg1234 ^room 0.8196321965510396 2024-03-25 14:40:42.256993
nngg1234 n!oon$ 0.42337183465077444 2024-03-25 14:40:43.361478
teop1238 ^kuruku 0.2493157058094404 2024-03-25 14:40:43.489238
nngg1234 ^tcunn$ 0.3211013359422631 2024-03-25 14:40:44.489675

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


teop1238 ^teitei$ 0.29462629068335 2024-03-25 14:41:06.696696
nngg1234 ^|'arusi$ 0.3012707615973341 2024-03-25 14:41:07.364368
nngg1234 ^tier$ 0.24110023944069447 2024-03-25 14:41:08.619687
vera1241 ^da 0.3454657057030148 2024-03-25 14:41:09.248336
teop1238 beera$ 0.35755039963468627 2024-03-25 14:41:12.413055
vera1241 ^vunu 0.39588236197218707 2024-03-25 14:41:13.658649
teop1238 hutate$ 0.3650787245653305 2024-03-25 14:41:15.172535
lowe1385 ^wojn 0.15356501883123486 2024-03-25 14:41:15.519934
lowe1385 ^wjas 0.37514894072873717 2024-03-25 14:41:16.636697
teop1238 ^tete$ 0.41151765259842343 2024-03-25 14:41:17.492528
lowe1385 ^farar 0.6098818446353769 2024-03-25 14:41:17.640683
cash1254 ^txai 0.27119970438246177 2024-03-25 14:41:18.307180
teop1238 ^overe$ 0.2886666784857824 2024-03-25 14:41:18.780416
lowe1385 ola 0.3631687107357423 2024-03-25 14:41:18.938764
cash1254 uma 0.36256843550927687 2024-03-25 14:41:19.355288
vera1241 ^serege$ 0.15603563878934357 2024-03-25 14:41:20.359073
lowe1

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


nngg1234 ^!abi$ 0.276828982555242 2024-03-25 14:43:00.068724
vera1241 ^vuva$ 0.3632076269423674 2024-03-25 14:43:00.722591
teop1238 ^kasuana$ 0.27400636757571306 2024-03-25 14:43:00.907901
nngg1234 ^n!ari$ 0.33494750415656904 2024-03-25 14:43:01.098685
lowe1385 ^famili 0.2685465156394331 2024-03-25 14:43:01.587934
nngg1234 ^|=anu$ 0.22782184340941958 2024-03-25 14:43:02.395434
lowe1385 ^suse 0.3969284229283554 2024-03-25 14:43:02.651676
vera1241 'ar$ 0.17124144834979627 2024-03-25 14:43:02.819783
cash1254 ^iwe 0.3579381958414095 2024-03-25 14:43:03.309538
vera1241 ^'es$ 0.24928433793267868 2024-03-25 14:43:03.810304
lowe1385 zek 0.2294045447518137 2024-03-25 14:43:04.208125
cash1254 ^bexu 0.3173317966686444 2024-03-25 14:43:04.424542
lowe1385 ^spokojom$ 0.45410527490980046 2024-03-25 14:43:05.208657
cash1254 ^bei 0.29104240332734355 2024-03-25 14:43:06.245925
lowe1385 gotowa 0.1966647718966833 2024-03-25 14:43:06.954882
apah1238 ^ehe 0.33624401028257656 2024-03-25 14:43:06.976131
apah1

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


apah1238 ^fano 0.4272477779415893 2024-03-25 14:50:46.452961
nort2875 ^araw$ 0.29861199130093363 2024-03-25 14:50:49.432214
apah1238 ^hubungan$ 0.3630138730505549 2024-03-25 14:50:49.597750
apah1238 ^yuhareg$ 0.16713146908630827 2024-03-25 14:50:56.940560
apah1238 ^sekolah$ 0.40592288738906024 2024-03-25 14:50:57.934246
apah1238 ^nare-nare$ 0.22979119251919855 2024-03-25 14:50:59.078069
nort2875 de?el$ 0.34721714634012113 2024-03-25 14:50:59.345165
nort2875 ^ginto 0.4168384838590232 2024-03-25 14:51:00.251398
apah1238 ^wenenggak$ 0.3396590497261951 2024-03-25 14:51:02.350104
apah1238 huk 0.33759661446127986 2024-03-25 14:51:03.258054
apah1238 ^kok$ 0.23522536382599046 2024-03-25 14:51:04.083239
apah1238 ^ket$ 0.2525067307463179 2024-03-25 14:51:20.605818
apah1238 ^sekeliling$ 0.27590754831732844 2024-03-25 14:51:21.973460
apah1238 ^mungkin$ 0.491614922647441 2024-03-25 14:51:22.793997
nort2875 dan$ 0.30554220646180985 2024-03-25 14:51:26.964095
apah1238 ^yabuk$ 0.3020467193129184 2024-

Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sout2856 ^atua$ 0.25482718858717945 2024-03-25 15:01:09.997490
sout2856 ^runomser$ 0.22599746783189212 2024-03-25 15:01:12.008139


Error in fun(A, k, nu, nv, opts, mattype = "matrix") : 
  nrow(A) and ncol(A) should be at least 3
Calls: logisticSVD -> <Anonymous> -> <Anonymous> -> svds.matrix -> fun
Execution halted


sout2856 ^kemalik$ 0.22054541852143672 2024-03-25 15:01:15.369484
sout2856 ^nanwei$ 0.3431931217153814 2024-03-25 15:01:16.183598
sout2856 arik$ 0.3425708731518451 2024-03-25 15:01:17.250083
sout2856 ^apa$ 0.19894229347553327 2024-03-25 15:01:21.179159
sout2856 ^rufatlasik$ 0.40817154369577724 2024-03-25 15:01:22.221690
sout2856 slat 0.18019646981056647 2024-03-25 15:01:40.418394
sout2856 pa$ 0.1814903867323291 2024-03-25 15:01:44.604132
sout2856 sat 0.2765659388136694 2024-03-25 15:01:45.347315
sout2856 sel$ 0.22943832855322088 2024-03-25 15:01:46.412023
sout2856 pl 0.24517975092438993 2024-03-25 15:01:47.204529
sout2856 saiki$ 0.18918674476760144 2024-03-25 15:01:48.611128
sout2856 san$ 0.24019363027235885 2024-03-25 15:01:49.593586
sout2856 naor$ 0.2557413170051501 2024-03-25 15:01:50.591244
sout2856 ^nel$ 0.1479931273613747 2024-03-25 15:01:54.848723
sout2856 auswen$ 0.38039710661965254 2024-03-25 15:01:56.899140
sout2856 ^nafs 0.5680763799237447 2024-03-25 15:01:57.708542
sout2856

OSError: Cannot save file into a non-existent directory: 'pickles/pcaspaces/termspace_goro1270_^na'

## analysis

In [21]:
def get_ig(sorted_terms):
    H = entropy(list(Counter(sorted_terms).values()))
    max_ig = None, 0
    for i in range(1,len(sorted_terms)-1):
        A,B = sorted_terms[:i], sorted_terms[i:]
        Ha, Hb = entropy(list(Counter(A).values())), entropy(list(Counter(B).values()))
        IG = H - ((Ha * (i/len(sorted_terms))) + (Hb * (1-i/len(sorted_terms))))
        if IG > max_ig[1]: max_ig = i, IG
    return max_ig[1]

In [22]:
concreteness_file = '../metaphor/bert_exploration/13428_2013_403_MOESM1_ESM.xlsx'
conc_data = pd.read_excel(concreteness_file)
conc_dict = {r.Word : r['Conc.M'] for i,r in conc_data.iterrows()}
#
models = { 'knn@3' : KNeighborsClassifier(n_neighbors=3),
           'knn@5' : KNeighborsClassifier(n_neighbors=5),
           'SVC' : SVC(C=1) }

quantile_boundary_values = [0.10, 0.25, 0.5]
predictor_variables = ['std','range','iqr','log_fpm', 'concreteness'] + ['%d_balance' % (100*q) for q in quantile_boundary_values]

In [84]:
doreco_metadata

Unnamed: 0,Language,Glottocode,iso-639-3,Family,fam_glottocode,Area,Creator,Latitude,Longitude,Archive,...,Audio license,DOI,Gloss,Extended speakers,Extended word tokens,Extended texts,Core speakers,core word tokens,Core texts,Years of recordings in core set
0,Anal,anal1239,anm,Sino-Tibetan,sino1245,Eurasia,"Ozerov, Pavel",24.05,94.28,ELAR,...,CC BY-NC,10.34847/nkl.0dbazp8m,none,12,13015,23,12,13015,23,2015-2016
1,Yali (Apahapsili),apah1238,na,Nuclear Trans New Guinea,nucl1709,Papunesia,"Riesberg, Sonja",-4.08,139.46,TLA,...,CC BY-NC-SA,10.34847/nkl.9d91nkq2,all,14,10191,11,10,7474,8,2013-2017
2,Arapaho,arap1274,arp,Algic,algi1248,North America,"Cowell, Andrew",43.39,-108.81,ELAR,...,CC BY,10.34847/nkl.36f5r1b6,all,8,10407,21,4,4746,11,2005
3,Baïnounk Gubëeher,bain1259,bab,Atlantic-Congo,atla1278,Africa,"Cobbinah, Alexander Yao",12.31,-16.06,ELAR,...,CC BY,10.34847/nkl.a332abw8,some,10,12425,18,9,11477,17,2009-2014
4,Beja,beja1238,bej,Afro-Asiatic,afro1255,Africa,"Vanhove, Martine",17.24,36.67,CorpoAfroas,...,CC BY-NC,10.34847/nkl.edd011t1,all,5,15439,58,5,15439,58,2003-2011
5,Bora,bora1263,boa,Boran,bora1262,South America,"Seifart, Frank",-2.0,-72.26,TLA,...,CC BY,10.34847/nkl.6eaf5laq,all,39,29578,38,6,8431,9,2004-2008
6,Cabécar,cabe1245,cjp,Chibchan,chib1249,North America,"Quesada, Juan Diego and Skopeteas, Stavros and...",9.67,-83.41,TLA,...,CC BY-NC-ND,10.34847/nkl.ebc4ra22,all,15,17961,115,10,10614,39,2011
7,Cashinahua,cash1254,cbs,Pano-Tacanan,pano1259,South America,"Reiter, Sabine",-9.72,-71.17,TLA,...,CC BY,10.34847/nkl.a8f9q2f1,some,3,10043,3,3,10043,3,2006
8,Dolgan,dolg1241,dlg,Turkic,turk1311,Eurasia,"Däbritz, Chris Lasse and Kudryakova, Nina and ...",71.11,94.29,HZSK,...,CC BY-NC,10.34847/nkl.f09eikq3,all,20,18966,25,6,8778,9,1972-2010
9,Evenki,even1259,evn,Tungusic,tung1282,Eurasia,"Kazakevich, Olga and Klyachko, Elena",61.97,94.69,Siberian Lang,...,CC BY,10.34847/nkl.5e0d27cu,all,23,8315,36,23,8315,36,2006-2016


In [90]:
builder = []
for ai,(A, f) in enumerate(zip(new_anchors, new_fields)):
    df_ai = df_sub[f]
    translations = ['%s (%.1f%%)' % (k,100*v/len(df_ai)) for k,v in Counter(df_ai.term).most_common(3)]    
    #
    print(ai, len(df_ai), A[0], translations)
    #
    if next((False for t in df_ai.term.unique() if t in conc_dict),True):
        print('  *no concreteness scores'); continue
    field_concreteness = np.mean([conc_dict[k] for k,v in Counter(df_ai.term).items() if k in conc_dict for i in range(v)])
    #
    try: model = pickle.load(open(pcaspace_dir + 'pcaspace_' + A[0][0] + '_' + A[0][1] + '.p', 'rb'))
    except: continue
    good_docs = [k for k,v in Counter(df_ai.doculect).most_common() if v >= min_n_tokens] # min 20 tokens
    #
    space = model['A'][:,0]
    space = 2 * ((space - space.min()) / (space.max() - space.min())) - 1 # this places the PC space in [-1,1]
    quantile_boundaries = {i : (np.quantile(space, i), np.quantile(space, 1-i)) for i in quantile_boundary_values}
    #
    per_field_builder = []
    for doc in good_docs:
        observed = (df_ai.doculect == doc) & ~pd.isna(df_ai.marker)
        markers = df_ai[observed].marker
        space_doc = space[observed]
        if len(space_doc) < 1: continue
        if min(space_doc) > quantile_boundaries[0.25][0] or max(space_doc) < quantile_boundaries[0.25][1]: continue
        sorted_terms = list(zip(*sorted(zip(markers, space_doc), key = lambda k : k[1])))[0]
        row = {'doculect' : doc, 'field' : ai, 'concreteness' : field_concreteness }
        # PREDICTED VARIABLES:
        row['ig'] = get_ig(sorted_terms)
        #print('  ', doc, space_doc.reshape(-1,1).shape, markers.shape)
        for model_name, model in models.items():
            if len(set(markers)) == 1: row['colexified_%s' % model_name] = 1
            else:
                model.fit(space_doc.reshape(-1,1), markers)
                row['colexified_%s' % model_name] = int(model.predict([[-1]])[0] == model.predict([[+1]])[0])
        # PREDICTOR VARIABLES:
        row['std'] = np.std(space_doc)
        row['range'] = max(space_doc) - min(space_doc)
        row['iqr'] = np.quantile(space_doc, 0.75) - np.quantile(space_doc,0.25)
        row['log_fpm'] = np.log(1e6*len(markers)/tot_wc[doc])
        for q,(lo,hi) in quantile_boundaries.items():
            neg = sorted_terms[:sum(space_doc < lo)]
            pos = sorted_terms[-sum(space_doc >= hi):]
            row['%d_balance' % (100*q)] = 2*min(len(neg),len(pos))/(len(neg)+len(pos))
        per_field_builder.append(row)
    #

    #
    n_colex = sum(r['colexified_SVC'] for r in per_field_builder)
    n_dislex = len(per_field_builder)-n_colex
    if n_colex >= 0 and n_dislex >= 0: # Makes sure there are at least languages that colexify and ones that dislexify
        print('  sufficient variation (%d colex / %d dislex)' % (n_colex, n_dislex))
        mu_std = { m : (np.mean([r[m] for r in per_field_builder]),np.std([r[m] for r in per_field_builder]))
                  for m in predictor_variables }
        for i in range(len(per_field_builder)):
            for m in predictor_variables:
                per_field_builder[i]['offset_' + m] = ((per_field_builder[i][m]-mu_std[m][0])/(mu_std[m][1])) if mu_std[m][1] > 1e-9 else 0
        builder.extend(per_field_builder)
    else: print('  *insufficient variation (%d colex / %d dislex)' % (n_colex, n_dislex))
df_stats = pd.DataFrame(builder)

0 6388 ('anal1239', 'do') ['say (99.1%)', 'think (0.6%)', 'mean (0.1%)']
  sufficient variation (25 colex / 11 dislex)
1 1650 ('anal1239', 'da') ['say (97.4%)', 'mean (0.8%)', 'talk (0.7%)']
  sufficient variation (10 colex / 3 dislex)
2 98 ('anal1239', 'the') ['say (96.9%)', 'talk (2.0%)', 'speak (1.0%)']
  sufficient variation (1 colex / 0 dislex)
3 1258 ('anal1239', '^ju') ['go (96.4%)', 'move (1.5%)', 'get (1.3%)']
  sufficient variation (15 colex / 3 dislex)
4 2756 ('anal1239', 'cho') ['go (96.6%)', 'set (1.2%)', 'stay (0.8%)']
  sufficient variation (24 colex / 14 dislex)
5 1500 ('anal1239', 'vang') ['go (98.3%)', 'move (0.6%)', 'walk (0.5%)']
  sufficient variation (21 colex / 1 dislex)
6 2200 ('anal1239', '^ha') ['go (81.5%)', 'get (8.0%)', 'move (4.9%)']
  sufficient variation (2 colex / 25 dislex)
7 269 ('anal1239', '^vaj') ['go (95.2%)', 'get (1.9%)', 'stay (1.5%)']
  sufficient variation (0 colex / 3 dislex)
8 306 ('anal1239', 'ahung$') ['go (96.4%)', 'get (2.9%)', 'pass (0

In [91]:
print(len(df_stats.field.unique()), len(df_stats))

4679 33843


In [92]:
df_stats_old = df_stats.copy() # backup bc now you're zscoring

In [93]:
for i in predictor_variables + list(map(lambda k : 'offset_' + k, predictor_variables)):
    df_stats[i] = zscore(df_stats[i]) if len(set(df_stats[i])) > 1 else df_stats[i]

In [95]:
from scipy.stats import pearsonr, zscore, ttest_ind
#df_stats = df_stats_old.copy()
cont_dep = ['ig']
for i in predictor_variables + list(map(lambda k : 'offset_' + k, predictor_variables)):
    if len(set(df_stats[i])) == 1: 
        print(i,'insufficient variation');continue
    for d in cont_dep:
        p,r = pearsonr(zscore(df_stats[i]), df_stats[d])
        print(i,d,'%.2f (%.3f)' % (p,r), '<<<' if r < 0.05 else '')
        #sns.regplot(x=df_stats[i], y=df_stats[d])
        #plt.show()

std ig 0.67 (0.000) <<<
range ig 0.50 (0.000) <<<
iqr ig 0.53 (0.000) <<<
log_fpm ig -0.07 (0.000) <<<
concreteness ig 0.09 (0.000) <<<
10_balance ig 0.10 (0.000) <<<
25_balance ig 0.11 (0.000) <<<
50_balance ig 0.07 (0.000) <<<
offset_std ig 0.29 (0.000) <<<
offset_range ig 0.14 (0.000) <<<
offset_iqr ig 0.23 (0.000) <<<
offset_log_fpm ig -0.09 (0.000) <<<
offset_concreteness insufficient variation
offset_10_balance ig 0.09 (0.000) <<<
offset_25_balance ig 0.09 (0.000) <<<
offset_50_balance ig 0.08 (0.000) <<<


In [97]:
dfi = df_stats
lm = ols('ig ~ iqr + log_fpm', dfi).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:                     ig   R-squared:                       0.291
Model:                            OLS   Adj. R-squared:                  0.291
Method:                 Least Squares   F-statistic:                     6932.
Date:                Mon, 25 Mar 2024   Prob (F-statistic):               0.00
Time:                        22:39:08   Log-Likelihood:                 6591.8
No. Observations:               33843   AIC:                        -1.318e+04
Df Residuals:                   33840   BIC:                        -1.315e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.3443      0.001    318.047      0.0

In [99]:
import statsmodels.formula.api as smf
log_reg = smf.logit("colexified_SVC ~ iqr + log_fpm", data=df_stats).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.621212
         Iterations 5


0,1,2,3
Dep. Variable:,colexified_SVC,No. Observations:,33843.0
Model:,Logit,Df Residuals:,33840.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 25 Mar 2024",Pseudo R-squ.:,0.07303
Time:,23:39:01,Log-Likelihood:,-21024.0
converged:,True,LL-Null:,-22680.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.5066,0.012,-42.092,0.000,-0.530,-0.483
iqr,-0.7202,0.014,-51.771,0.000,-0.747,-0.693
log_fpm,-0.0799,0.012,-6.899,0.000,-0.103,-0.057


In [67]:
bin_dep = ['colexified_knn@3', 'colexified_knn@5', 'colexified_SVC']
for i in predictor_variables + list(map(lambda k : 'offset_' + k, predictor_variables)):
    if len(set(df_stats[i])) == 1: 
        print(i,'insufficient variation\n');continue
    for d in bin_dep:
        zed = zscore(df_stats[i])
        a,b = zed[df_stats[d] == 0], zed[df_stats[d] == 1]
        t,r = ttest_ind(a,b)
        print(i,d,'%.2f (%.3f)' % (t,r), '<<<' if r < 0.05 else '')
        #sns.regplot(x=df_stats[i], y=df_stats[d])
        #plt.show()
    print()

std colexified_knn@3 47.09 (0.000) <<<
std colexified_knn@5 51.74 (0.000) <<<
std colexified_SVC 77.95 (0.000) <<<

range colexified_knn@3 54.80 (0.000) <<<
range colexified_knn@5 51.19 (0.000) <<<
range colexified_SVC 104.04 (0.000) <<<

iqr colexified_knn@3 22.84 (0.000) <<<
iqr colexified_knn@5 29.07 (0.000) <<<
iqr colexified_SVC 35.44 (0.000) <<<

log_fpm colexified_knn@3 7.48 (0.000) <<<
log_fpm colexified_knn@5 9.96 (0.000) <<<
log_fpm colexified_SVC 5.75 (0.000) <<<

concreteness colexified_knn@3 5.34 (0.000) <<<
concreteness colexified_knn@5 5.56 (0.000) <<<
concreteness colexified_SVC 11.87 (0.000) <<<

10_balance colexified_knn@3 17.80 (0.000) <<<
10_balance colexified_knn@5 16.58 (0.000) <<<
10_balance colexified_SVC 28.84 (0.000) <<<

25_balance colexified_knn@3 11.79 (0.000) <<<
25_balance colexified_knn@5 12.31 (0.000) <<<
25_balance colexified_SVC 12.49 (0.000) <<<

50_balance colexified_knn@3 6.44 (0.000) <<<
50_balance colexified_knn@5 7.00 (0.000) <<<
50_balance cole