# Setup

In [1]:
# Mount Google Drive at /content/gdrive/ so later cells can reach the project files stored there.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
# Switch the working directory to the lang2vec folder on Drive and build the package there.
# NOTE(review): python3.10-venv is presumably installed because `python3 -m build` creates an
# isolated virtual environment — confirm. No visible cell installs the built artifact; the later
# `import lang2vec.lang2vec` presumably resolves from this working directory.
%cd "/content/gdrive/MyDrive/PerfPred/Experiment/lang2vec"
!apt install python3.10-venv
!python3 -m build

/content/gdrive/.shortcut-targets-by-id/1vr6Z8seuUA0zoWaHuosCSZMv_H2Go5KR/PerfPred/Experiment/lang2vec
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  python3-pip-whl python3-setuptools-whl
The following NEW packages will be installed:
  python3-pip-whl python3-setuptools-whl python3.10-venv
0 upgraded, 3 newly installed, 0 to remove and 16 not upgraded.
Need to get 2,473 kB of archives.
After this operation, 2,882 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3-pip-whl all 22.0.2+dfsg-1ubuntu0.3 [1,679 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3-setuptools-whl all 59.6.0-1.2ubuntu0.22.04.1 [788 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 python3.10-venv amd64 3.10.12-1~22.04.2 [5,724 B]
Fetched 2,473 kB in 1s (2,740 kB/s)
Selecting previously unselect

In [40]:
import sys
import time

import gspread as gs
import gspread_dataframe as gsdf
import lang2vec.lang2vec as l2v
import numpy as np
import pandas as pd
from google.auth import default
from google.colab import auth
from numpy.linalg import norm
from scipy.spatial import distance
from sklearn.preprocessing import normalize

In [4]:
# Authenticate the Colab user, then authorize a gspread client (`gc`) with the default Google credentials.
auth.authenticate_user()
creds, _ = default()
gc = gs.authorize(creds)

In [5]:
def write_to_sheet(df: pd.DataFrame, sh: gs.Spreadsheet, name: str=None, index: bool=True,
                   max_retries: int=5, retry_delay: float=60) -> None:
  """Write `df` into a worksheet of `sh`, retrying a bounded number of times on API errors.

  Args:
    df: Table to upload. Column headers are always written.
    sh: Target spreadsheet.
    name: Worksheet title. When None, the first worksheet is overwritten. When no
      worksheet has this title, the first worksheet is duplicated, appended after the
      existing ones and given this title.
    index: Whether the DataFrame index is written as well.
    max_retries: How many times a failed attempt is retried before the error is re-raised.
    retry_delay: Seconds to wait between attempts.

  Raises:
    gs.exceptions.APIError: If the write still fails after `max_retries` retries.
  """
  failures = 0
  while True:
    try:
      try:
        wsh = sh.get_worksheet(0) if name is None else sh.worksheet(name)
      except gs.exceptions.WorksheetNotFound:
        # No worksheet with this title yet: clone the first one and place it last.
        wsh = sh.get_worksheet(0).duplicate(insert_sheet_index=len(sh.worksheets()), new_sheet_name=name)
      # resize=True makes the worksheet dimensions follow the DataFrame.
      gsdf.set_with_dataframe(wsh, df, include_index=index, include_column_header=True, resize=True)
      if name is not None:
        wsh.update_title(name)
      return
    except gs.exceptions.APIError:
      # Presumably a request-quota error that clears after a pause. Anything that keeps
      # failing (e.g. missing permissions) is re-raised instead of retrying forever.
      failures += 1
      if failures > max_retries:
        raise
      print(f"Sleeping for {retry_delay:g} seconds...", file=sys.stderr)
      time.sleep(retry_delay)

# Features

## General Stuff

In [6]:
# Spreadsheet, opened by title, that receives the feature tables written by the cells below.
feat_sheet = gc.open('l2v features')

In [106]:
# Language groups used throughout the notebook, as three-letter codes understood by lang2vec.
hrls = ["eng", "fra"]
lrls = ['kan', 'guj', 'hin', 'sin', 'tam']
# `hope` has to be assigned: `all_langs` reads it and raises NameError on a fresh kernel otherwise.
# NOTE(review): the saved distance tables further down list only afr, xho and gle from this
# group — confirm which list is intended.
hope = ["afr", "xho", "gle", "yor", "asm"]
all_langs = hrls + lrls + hope
# `langs` is read by the membership and filled-value-count cells but was never assigned in this
# notebook; the saved membership output lists exactly these codes in this order.
langs = all_langs

In [8]:
# Language inventories exposed by lang2vec.
# NOTE(review): `all` shadows the Python builtin for the rest of the session; the name is kept
# because other cells may read it.
uriel = l2v.URIEL_LANGUAGES
learned = l2v.LEARNED_LANGUAGES
all = l2v.LANGUAGES
in_both = set(uriel) & set(learned)
print("# of URIEL languages:", len(uriel))
print("# of learned languages:", len(learned))
print("# of all languages:", len(all)) # same as union of uriel & learned
print("# of languages in both URIEL and learned:", len(in_both))

# of URIEL languages: 3718
# of learned languages: 1017
# of all languages: 4005
# of languages in both URIEL and learned: 730


In [9]:
# Report, for each language in `langs`, which of the two lang2vec inventories contain it.
for lang in langs:
  in_uriel = lang in uriel
  in_learned = lang in learned
  if not (in_uriel or in_learned):
    print(f"{lang} is in neither uriel or learned.")
  elif in_uriel and in_learned:
    print(f"{lang} is in uriel and learned.")
  elif in_uriel:
    print(f"{lang} is in uriel only.")
  else:
    print(f"{lang} is in learned only.")

eng is in uriel only.
fra is in uriel and learned.
kan is in uriel only.
guj is in uriel only.
hin is in uriel and learned.
sin is in uriel only.
tam is in uriel and learned.
afr is in uriel and learned.
xho is in uriel and learned.
gle is in uriel and learned.
yor is in uriel and learned.
asm is in uriel only.


In [10]:
# Feature-type names; the first three are matched against lang2vec feature-set names by prefix.
feat_types = ["syntax", "phonology", "inventory", "genetic", "geographic"]
# feature type -> names of the lang2vec feature sets that belong to it.
feats_by_type = {
    ftype: [feat for feat in l2v.FEATURE_SETS if feat.startswith(ftype)]
    for ftype in feat_types[:3]
}
# The last two types do not follow the prefix scheme and map to a single feature set each.
feats_by_type.update(genetic=["fam"], geographic=["geo"])
# Flat list of every selected feature set, grouped in `feat_types` order.
all_feats = [feat for ftype in feat_types for feat in feats_by_type[ftype]]

In [11]:
# For every feature set, count the entries that are not the '--' placeholder, per row.
# The "CODE" row comes from the header lang2vec returns and therefore counts the header entries.
count_rows = ["CODE"] + langs
count_df = pd.DataFrame(index=count_rows, columns=all_feats)
for feat in all_feats:
  feat_table = l2v.get_features(langs, feat, header=True)
  for row in count_rows:
    filled = [entry for entry in feat_table[row] if entry != '--']
    count_df.loc[row, feat] = len(filled)

In [None]:
# Upload the filled-value counts to the "count of filled values" worksheet of the feature spreadsheet.
write_to_sheet(count_df, feat_sheet, "count of filled values", index=True)

## Values for ftype and lang

In [None]:
def english_ftype_vectors(lang, ftype, write=True):
  """Tabulate every `ftype` feature vector of `lang` and cross-check the aggregate columns.

  Despite the name, any language code can be passed (the notebook calls it for each entry
  of `hrls`). The table has one row per entry of the 'CODE' header returned by lang2vec
  and one column per feature set of `ftype`; '--' placeholders become missing values.

  For the first three feature types, three further columns are added:
    * the "|"-joined union of all but the last two feature sets, as returned by lang2vec;
    * 'reproduced {ftype}_average': row mean (rounded to 4 decimals) of the columns from the
      first feature set through the third-from-last one;
    * 'reproduced {union}': 1 if any of those columns is non-zero, missing if all are missing.
  A message is printed when a reproduced column differs from the lang2vec one.
  NOTE(review): this assumes the last two feature sets are the knn and average sets —
  consistent with the column order in the saved outputs, confirm for all types.

  Args:
    lang: Language code to look up.
    ftype: Key of `feats_by_type`.
    write: When True, upload the table to the worksheet "{lang} {ftype}" of `feat_sheet`.

  Returns:
    The assembled DataFrame (nullable Float64 feature columns).
  """
  sources = feats_by_type[ftype]
  has_aggregates = ftype in feat_types[0:3]

  header = l2v.get_features(lang, sources[0], header=True)['CODE']
  table = pd.DataFrame(index=header, columns=sources)
  for source in sources:
    table[source] = l2v.get_features(lang, source)[lang]
  if has_aggregates:
    union = "|".join(sources[:-2])
    table[union] = l2v.get_features(lang, union)[lang]
  table = table.replace("--", pd.NA).astype("Float64")

  if has_aggregates:
    # Label slice: every column from the first source through the third-from-last one.
    basic = table.loc[:, sources[0]:sources[-3]]

    avg_col = f'{ftype}_average'
    avg_repro_col = f'reproduced {ftype}_average'
    table[avg_repro_col] = basic.mean(axis=1).round(decimals=4)
    # fill_value=-1 makes a value that is missing on only one side compare as unequal.
    avg_matches = table[avg_col].eq(table[avg_repro_col], fill_value=-1)
    if not avg_matches.all():
      print(f'Failed at reproducing {ftype}_average.')

    union_repro_col = f'reproduced {union}'
    table[union_repro_col] = basic.any(axis=1).astype("Int64")
    # Rows without a single known value must stay missing rather than become 0.
    all_missing = basic.isnull().all(axis=1)
    table.loc[basic.index[all_missing], union_repro_col] = pd.NA
    union_matches = table[union].eq(table[union_repro_col], fill_value=-1)
    if not union_matches.all():
      print(f'Failed at reproducing {union}.')

  if write:
    write_to_sheet(table, feat_sheet, f"{lang} {ftype}", index=True)
  return table

In [None]:
# Build and upload one feature table per (feature type, language in `hrls`) pair.
for ftype in feat_types:
  for lang in hrls:
    english_ftype_vectors(lang, ftype)

# Distances

## Pre-Computed

In [83]:
# Spreadsheet, opened by title, that receives the distance tables written by the cells below.
dist_sheet = gc.open('l2v distances')

In [85]:
# Names of the pre-computed distances requested from lang2vec.
dist_types = ['syntactic', 'phonological', 'inventory', 'featural', 'genetic', 'geographic'] # l2v.DISTANCES but in a different order
# Feature-type name (as in `feat_types`) -> name of the matching distance; 'featural' has no entry here.
dist_by_ftype = {'syntax': 'syntactic', 'phonology': 'phonological', 'inventory': 'inventory', 'genetic': 'genetic', 'geographic': 'geographic'}
# One result per entry of `dist_types`, in the same order; each is indexed as result[0] further
# down — presumably a language-by-language matrix following `all_langs` order (TODO confirm).
all_dists = l2v.distance(dist_types, all_langs)

In [86]:
# Pre-computed distances: one row per distance type, one column per language.
# Only row 0 of each result is kept, i.e. the distances from the first entry of `all_langs` ("eng").
dists_df = pd.DataFrame(columns=all_langs, index=dist_types)
for dtype, dist_matrix in zip(dist_types, all_dists):
  dists_df.loc[dtype] = dist_matrix[0]
write_to_sheet(dists_df, dist_sheet, "distances to english", index=True)

## Different Definitions of Distance

In [24]:
def compare_dists(ftype, funcs):
  """Apply several distance definitions to the eng/fra vectors of every `ftype` feature set.

  The feature sets are those of `feats_by_type[ftype]` plus the "|"-joined union of all but
  the last two. Dimensions where either language has the '--' placeholder are dropped, and
  a feature set with no remaining dimension is left out entirely.

  Args:
    ftype: Key of `feats_by_type` and `dist_by_ftype`.
    funcs: Mapping from a row label to a callable `f(x, y)` taking the two vectors.

  Returns:
    DataFrame with one row per entry of `funcs` and one column per usable feature set.
    The same table is written to the worksheet "eng-fra {distance name}" of `dist_sheet`.
  """
  sources = list(feats_by_type[ftype])
  sources.append("|".join(sources[:-2]))

  pairs = {}
  for source in sources:
    fetched = l2v.get_features(hrls, source)
    pair = np.array([fetched["eng"], fetched["fra"]])
    if pair.dtype != float:
      # Non-float dtype means placeholders are present: keep only dimensions known for both.
      keep = np.flatnonzero((pair[0] != "--") & (pair[1] != "--"))
      pair = pair[:, keep].astype(float)
    if len(pair[0]):
      pairs[source] = pair

  rows = [
      pd.Series([f(x, y) for x, y in pairs.values()], index=pairs.keys(), name=label)
      for label, f in funcs.items()
  ]

  repro_df = pd.DataFrame(rows)
  write_to_sheet(repro_df, dist_sheet, f"eng-fra {dist_by_ftype[ftype]}", index=True)
  return repro_df

In [37]:
# Candidate distance definitions; each value is a callable taking two equal-length vectors.
funcs = {
    # Euclidean distance between the two vectors after scaling each to unit length.
    "normalized euclidean": lambda x, y: distance.euclidean(x / norm(x), y / norm(y)),
    # SciPy's cosine distance.
    "cosine distance": distance.cosine,
    # Angle recovered from the cosine distance, rescaled so that pi/2 maps to 1.
    "angular distance": lambda x, y: np.arccos(1 - distance.cosine(x, y)) / np.pi * 2
}

In [38]:
# Compare the distance definitions on the eng/fra syntax vectors (also uploaded to the distance spreadsheet).
compare_dists("syntax", funcs)

Unnamed: 0,syntax_wals,syntax_sswl,syntax_ethnologue,syntax_knn,syntax_average,syntax_wals|syntax_sswl|syntax_ethnologue
normalized euclidean,0.665898,0.591679,0.0,0.613589,0.627832,0.648464
cosine distance,0.22171,0.175042,0.0,0.188246,0.197087,0.210253
angular distance,0.432175,0.382398,0.0,0.397026,0.406564,0.420423


In [None]:
# Same comparison for the phonology feature sets.
compare_dists("phonology", funcs)

In [None]:
# Same comparison for the inventory feature sets.
compare_dists("inventory", funcs)

## Distance Using Different Vector Types

In [138]:
def angular_distance(x, y):
  """Return the angle between `x` and `y`, rescaled so that a right angle equals 1.

  The cosine is rounded to 8 decimals before `arccos` — presumably so that floating-point
  noise cannot push it outside [-1, 1] and so that identical vectors give exactly 0.
  Both vectors must have a non-zero norm; callers are expected to check this.
  """
  cosine = np.dot(x, y) / (norm(x) * norm(y))
  return 2 / np.pi * np.arccos(round(cosine, 8))

In [155]:
def find_dists(ftype, langs):
  """Recompute the angular distance from "eng" to each language, per `ftype` feature set.

  The feature sets are those of `feats_by_type[ftype]` plus the "|"-joined union of all but
  the last two. For every language the row starts with the pre-computed value taken from
  `dists_df`, followed by `angular_distance` over the dimensions that are known (not '--')
  for both "eng" and that language. A distance is recorded as missing when no shared
  dimension remains or either vector has zero norm; missing cells end up as "N/A".

  Args:
    ftype: Key of `feats_by_type` and `dist_by_ftype`.
    langs: Language codes to compare against "eng"; must include "eng" itself because its
      vectors are read from the same lookup.

  Returns:
    DataFrame with one row "eng-{lang}" per language. The same table is written to the
    worksheet named after the distance type in `dist_sheet`.
  """
  sources = list(feats_by_type[ftype])
  sources.append("|".join(sources[:-2]))
  vectors = {source: l2v.get_features(langs, source) for source in sources}

  dist_name = dist_by_ftype[ftype]
  columns = ["pre-computed"] + sources

  rows = []
  for lang in langs:
    row = [dists_df.loc[dist_name, lang]]
    for source in sources:
      ref = np.array(vectors[source]["eng"])
      other = np.array(vectors[source][lang])
      if not (ref.dtype == float and other.dtype == float):
        # At least one side contains placeholders: compare as strings to locate them,
        # then keep only the dimensions that are known for both languages.
        ref, other = ref.astype(str), other.astype(str)
        keep = np.where((ref != "--") & (other != "--"))[0]
        ref, other = ref[keep].astype(float), other[keep].astype(float)
      undefined = len(ref) == 0 or norm(ref) == 0 or norm(other) == 0
      row.append(pd.NA if undefined else angular_distance(ref, other))
    rows.append(pd.Series(row, index=columns, name=f"eng-{lang}"))

  repro_df = pd.DataFrame(rows).fillna("N/A")
  write_to_sheet(repro_df, dist_sheet, dist_name, index=True)
  return repro_df

In [157]:
# Pre-computed vs. recomputed syntactic distances from "eng" (also uploaded to the distance spreadsheet).
find_dists("syntax", all_langs)

Unnamed: 0,pre-computed,syntax_wals,syntax_sswl,syntax_ethnologue,syntax_knn,syntax_average,syntax_wals|syntax_sswl|syntax_ethnologue
eng-eng,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eng-fra,0.46,0.432175,0.382398,0.0,0.397026,0.406564,0.420423
eng-kan,0.64,0.600141,0.598842,0.666667,0.641843,0.588709,0.595294
eng-guj,0.68,0.460107,,,0.603406,0.483457,0.545629
eng-hin,0.59,0.542846,0.583478,0.73802,0.577346,0.553821,0.550821
eng-sin,0.78,0.579167,,0.666667,0.623076,0.562551,0.595887
eng-tam,0.71,0.651553,,0.666667,0.656687,0.651049,0.659402
eng-afr,0.63,0.0,0.432694,,0.353934,0.385927,0.420707
eng-xho,0.76,0.694486,,,0.681242,0.682634,0.682459
eng-gle,0.62,0.523086,0.573405,0.666667,0.488225,0.4983,0.535441


In [158]:
# Same comparison for the phonological distances.
find_dists("phonology", all_langs)

Unnamed: 0,pre-computed,phonology_wals,phonology_ethnologue,phonology_knn,phonology_average,phonology_wals|phonology_ethnologue
eng-eng,0.0,0.0,,0.0,0.0,0.0
eng-fra,0.427,0.457582,,0.457582,0.457582,0.457582
eng-kan,0.3498,0.349802,,0.40537,0.349802,0.349802
eng-guj,0.5687,,,0.40537,,
eng-hin,0.3433,0.34327,,0.427019,0.34327,0.34327
eng-sin,0.4121,0.312506,,0.447678,0.312506,0.312506
eng-tam,0.5687,,,0.349802,,
eng-afr,0.5687,,,0.389964,,
eng-xho,0.5687,,,0.349802,,
eng-gle,0.5687,,,0.34327,,


In [160]:
# Same comparison for the inventory distances.
find_dists("inventory", all_langs)

Unnamed: 0,pre-computed,inventory_ethnologue,inventory_phoible_aa,inventory_phoible_gm,inventory_phoible_saphon,inventory_phoible_spa,inventory_phoible_ph,inventory_phoible_ra,inventory_phoible_upsid,inventory_knn,inventory_average,inventory_ethnologue|inventory_phoible_aa|inventory_phoible_gm|inventory_phoible_saphon|inventory_phoible_spa|inventory_phoible_ph|inventory_phoible_ra|inventory_phoible_upsid
eng-eng,0.0,,,,,0.0,,,,0.0,0.0,0.0
eng-fra,0.4753,,,,,0.486128,,,,0.468829,0.473073,0.475325
eng-kan,0.4738,,,,,,,,,0.473801,0.473801,0.473801
eng-guj,0.4753,,,,,,,,,0.475325,0.475325,0.475325
eng-hin,0.4684,,,,,0.52489,,,,0.498211,0.445222,0.468365
eng-sin,0.4991,,,,,0.51421,,,,0.499138,0.496054,0.499138
eng-tam,0.5014,,,,,,,,,0.542794,0.508792,0.501389
eng-afr,0.5144,,,,,,,,,0.514399,0.514399,0.514399
eng-xho,0.5032,,,,,,,,,0.503215,0.503215,0.503215
eng-gle,0.5193,,,,,0.51934,,,,0.512168,0.500358,0.51934
