# Setup

In [1]:
# Mount the user's Google Drive inside the Colab VM.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [2]:
!git clone https://github.com/antonisa/lang2vec
%cd lang2vec
!python3 setup.py install

fatal: destination path 'lang2vec' already exists and is not an empty directory.
/content/lang2vec
running install
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer, pypa/build or
        other standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  sel

In [3]:
from google.colab import auth
from google.auth import default
import gspread
import lang2vec.lang2vec as l2v
import numpy as np
import pandas as pd

In [4]:
# Authenticate the Colab user, then build the gspread client `gc` from the
# resulting default credentials.
auth.authenticate_user()
# default() is unpacked as a 2-tuple; only the credentials are needed. The
# second element is bound to a named throwaway instead of `_`, because
# assigning to `_` in IPython stops it from tracking the last cell output.
creds, _project_id = default()
gc = gspread.authorize(creds)

# Distances

In [None]:
# Languages compared against English, and the lang2vec distance types to fetch.
langs = ['kan', 'guj', 'hin', 'sin', 'tam']
dists = [
    'geographic',
    'genetic',
    'syntactic',
    'phonological',
    'inventory',
    'featural',
]

# English goes first so that it sits at index 0 of whatever l2v.distance
# returns for each distance type.
codes_with_eng = ['eng'] + langs
data = np.array(l2v.distance(dists, codes_with_eng))

In [None]:
# Table with one row per distance type and one column per "<lang>-eng" pair.
cols = ["distance"] + [lang + "-eng" for lang in langs]

# data[i, 0, 1:] is entry 0 of the i-th distance matrix against every later
# entry (assumes index 0 is 'eng', the first code passed to l2v.distance --
# confirm). Slicing all distance types at once builds the frame in a single
# step, keeps the value columns numeric, and raises on a shape mismatch
# instead of silently truncating.
df = pd.DataFrame(data[:len(dists), 0, 1:], columns=cols[1:])
df.insert(0, "distance", dists)

In [None]:
# Write the distance table (header row first) to the worksheet at index 2.
header = df.columns.values.tolist()
rows = df.values.tolist()
worksheet = gc.open('Experiment 1 Data').get_worksheet(2)
worksheet.update([header] + rows)

{'spreadsheetId': '1V0fqzJOUg62M1RGGmd2LtKv0ZA76yYkY8Xo0iYaHr5w',
 'updatedRange': 'lang2vec!A1:F7',
 'updatedRows': 7,
 'updatedColumns': 6,
 'updatedCells': 42}

# Features

In [19]:
# From here on `langs` includes English itself.
langs = ['eng', 'kan', 'guj', 'hin', 'sin', 'tam']

# Feature sets to request from lang2vec.
feats = [
    'syntax_average',
    'phonology_average',
    'inventory_average',
    'syntax_knn',
    'phonology_knn',
    'inventory_knn',
    'fam',
    'geo',
]

# The feature-set names are passed to lang2vec as one '+'-joined string.
feature_spec = '+'.join(feats)
data = l2v.get_features(langs, feature_spec, header=True)

In [20]:
# Tabulate the raw lang2vec result and rewrite every '--' cell as '-'.
df1 = pd.DataFrame(data)
is_double_dash = df1 == '--'
df1[is_double_dash] = '-'

# Upload the full table (header row first) to the worksheet at index 3.
header = df1.columns.values.tolist()
rows = df1.values.tolist()
worksheet = gc.open('Experiment 1 Data').get_worksheet(3)
worksheet.update([header] + rows)

{'spreadsheetId': '1V0fqzJOUg62M1RGGmd2LtKv0ZA76yYkY8Xo0iYaHr5w',
 'updatedRange': "'l2v vals'!A1:G4596",
 'updatedRows': 4596,
 'updatedColumns': 7,
 'updatedCells': 32172}

In [21]:
# Accumulate a logical OR over every language column, then keep the rows of
# df1 where the accumulated flag is truthy.
# NOTE(review): the columns can hold '-' strings as well as numbers; how
# those cells take part in the logical OR should be confirmed.
any_flag = np.zeros(len(df1))
for code in langs:
  any_flag = np.logical_or(any_flag, df1[code])
inds = np.where(any_flag)[0]
df2 = df1.iloc[inds]

# Upload the reduced table (header row first) to the worksheet at index 4.
header = df2.columns.values.tolist()
rows = df2.values.tolist()
worksheet = gc.open('Experiment 1 Data').get_worksheet(4)
worksheet.update([header] + rows)

{'spreadsheetId': '1V0fqzJOUg62M1RGGmd2LtKv0ZA76yYkY8Xo0iYaHr5w',
 'updatedRange': "'l2v relev'!A1:G698",
 'updatedRows': 698,
 'updatedColumns': 7,
 'updatedCells': 4886}

In [25]:
# Only the non-English columns take part in this comparison.
non_eng = [code for code in langs if code != 'eng']

# `ors` accumulates a logical OR and `ands` a logical AND across those columns.
# NOTE(review): the columns can hold '-' strings as well as numbers; how
# those cells take part in the logical operations should be confirmed.
ors = np.zeros(len(df2))
ands = np.full(len(df2), 1)
for code in non_eng:
  ors = np.logical_or(ors, df2[code])
  ands = np.logical_and(ands, df2[code])

# Keep rows whose OR accumulator is not 0 and whose AND accumulator is not 1.
keep = (ors != 0) & (ands != 1)
inds = np.where(keep)[0]
df3 = df2.iloc[inds]

# Upload the result (header row first) to the worksheet at index 5.
header = df3.columns.values.tolist()
rows = df3.values.tolist()
worksheet = gc.open('Experiment 1 Data').get_worksheet(5)
worksheet.update([header] + rows)

{'spreadsheetId': '1V0fqzJOUg62M1RGGmd2LtKv0ZA76yYkY8Xo0iYaHr5w',
 'updatedRange': "'l2v diff'!A1:G537",
 'updatedRows': 537,
 'updatedColumns': 7,
 'updatedCells': 3759}

In [28]:
# Keep only the rows of df3 in which no non-English language has a missing
# ('-') value. English is deliberately excluded from the check.
other_langs = [lang for lang in langs if lang != 'eng']

# Test for the '-' marker directly instead of overwriting it with a numeric
# sentinel and summing: the row filter then does not depend on the range of
# the feature values, and a '-' in the eng column is exported unchanged
# rather than as a sentinel number.
has_missing = (df3[other_langs] == '-').any(axis=1)
inds = np.where(~has_missing)[0]
df4 = df3.iloc[inds].copy()

# Upload the filtered table (header row first) to the worksheet at index 6.
worksheet = gc.open('Experiment 1 Data').get_worksheet(6)
worksheet.update([df4.columns.values.tolist()] + df4.values.tolist())

{'spreadsheetId': '1V0fqzJOUg62M1RGGmd2LtKv0ZA76yYkY8Xo0iYaHr5w',
 'updatedRange': "'l2v diff non-na'!A1:G440",
 'updatedRows': 440,
 'updatedColumns': 7,
 'updatedCells': 3080}