# Setup

In [1]:
# Mount Google Drive at /content/gdrive/ for this Colab session.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
# Change into the lang2vec directory on Drive and run its setup.py install.
# NOTE(review): the saved output of this cell shows the %cd failing with
# "No such file or directory", so setup.py was looked up in /content instead.
# Confirm the Drive path before re-running.
%cd "/content/gdrive/MyDrive/PerfPred/Experiment 1/lang2vec"
!python3 setup.py install

[Errno 2] No such file or directory: '/content/gdrive/MyDrive/PerfPred/Experiment 1/lang2vec'
/content
python3: can't open file '/content/setup.py': [Errno 2] No such file or directory


In [3]:
from google.colab import auth
from google.auth import default
import gspread
import lang2vec.lang2vec as l2v
import numpy as np
import pandas as pd
import scipy
from pprint import pprint
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_distances

ModuleNotFoundError: ignored

In [None]:
# Authenticate the Colab user, then build a gspread client from the default
# Google credentials. `gc` is the client the sheet-writing helpers call.
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

# Features

In [None]:
# Single lang2vec feature sets, each evaluated on its own.
main_features = [
    'syntax_average', 'phonology_average', 'inventory_average',
    'syntax_knn', 'phonology_knn', 'inventory_knn',
    'fam', 'geo',
]
# Slices of main_features that are combined below.
_avg_sets, _knn_sets, _fam_geo = main_features[:3], main_features[3:6], main_features[6:]
# Combined feature sets, written as '+'-joined feature-set names.
extra_features = [
    "+".join(group)
    for group in (_avg_sets, _knn_sets, _avg_sets + _fam_geo,
                  _knn_sets + _fam_geo, main_features)
]
all_features = main_features + extra_features
# Number of feature sets; later cells also use it as a worksheet-index offset.
N = len(all_features)

In [None]:
def feature_vecs(feats, langs, sheet):
  """Fetch lang2vec features for `langs` and mirror them into a worksheet.

  Args:
    feats: feature-set names; they are joined with '+' into one request.
    langs: language codes passed to l2v.get_features.
    sheet: index of the worksheet in 'l2v self-calculated distances' to write.

  Returns:
    DataFrame built from the l2v.get_features result (requested with
    header=True).
  """
  feature_spec = '+'.join(feats)
  vectors = pd.DataFrame(l2v.get_features(langs, feature_spec, header=True))
  target = gc.open('l2v self-calculated distances').get_worksheet(sheet)
  # The worksheet title is the feature spec, cut to its first 100 characters.
  target.update_title(feature_spec[:100])
  header_row = vectors.columns.values.tolist()
  target.update([header_row] + vectors.values.tolist())
  return vectors

In [None]:
# One vector table per feature set: dfs[k] belongs to all_features[k] and is
# also written to worksheet k.
langs = ['kan', 'guj', 'hin', 'sin', 'tam']
dfs = [feature_vecs([feat], ["eng"] + langs, idx)
       for idx, feat in enumerate(all_features)]

In [None]:
def multiple_features(feats, lang, sheet):
  """Write one column per feature set for a single language to a worksheet.

  Each entry of `feats` is fetched separately with l2v.get_features. The
  first column of the table is the 'CODE' entry of the first result; every
  feature set then contributes one column holding the values for `lang`.

  Args:
    feats: feature-set names, fetched one at a time.
    lang: language code; also used in the worksheet title ("inv " + lang).
    sheet: index of the worksheet in 'l2v self-calculated distances' to write.
  """
  table = None
  for position, feat in enumerate(feats):
    fetched = l2v.get_features(lang, feat, header=True)
    if position == 0:
      # Only the 'CODE' column is taken from the first result.
      table = pd.DataFrame(fetched, columns=["CODE"])
    table[feat] = fetched[lang]
  target = gc.open('l2v self-calculated distances').get_worksheet(sheet)
  target.update_title("inv " + lang)
  header_row = table.columns.values.tolist()
  target.update([header_row] + table.values.tolist())

In [None]:
# Individual inventory feature sets to compare side by side.
inv_features = ['inventory_ethnologue', 'inventory_phoible_aa',
                'inventory_phoible_gm', 'inventory_phoible_saphon',
                'inventory_phoible_spa', 'inventory_phoible_ph',
                'inventory_phoible_ra', 'inventory_phoible_upsid']
# Write each inventory set, plus their '|'-joined combination, for "hi" to
# worksheet N + 7.
multiple_features(inv_features + ["|".join(inv_features)], "hi", N + 7)
# NOTE(review): the commented-out lines below look like an earlier inline
# version of the call above — delete them if they are no longer needed.
# data = l2v.get_features("hi", inv_features, header=True)
# df = pd.DataFrame(data)
# worksheet = gc.open('l2v self-calculated distances').get_worksheet(N + 7)
# worksheet.update_title("inventory")
# worksheet.update([df.columns.values.tolist()] + df.values.tolist())

In [None]:
# Compare eng and hin on the '|'-joined inventory feature sets in two ways.
ret = l2v.get_features(["eng", "hin"], "|".join(inv_features))
eng, hin = ret["eng"], ret["hin"]
# Angle between the two vectors in radians (arccos of the normalized dot product).
print(np.arccos(np.dot(eng / np.linalg.norm(eng), hin / np.linalg.norm(hin))))
# SciPy's cosine distance for the same pair of vectors.
print(scipy.spatial.distance.cosine(eng, hin))

0.7357060106224771
0.25864287309665


# Distances

In [None]:
def filter_df(df, langs):
  """Return a copy of `df` without rows marked '--' in any `langs` column.

  Args:
    df: DataFrame with one column per language code.
    langs: names of the columns that must not contain '--'.

  Returns:
    A new DataFrame holding only the rows where every listed column differs
    from '--'. The input frame is left untouched.
  """
  keep = pd.Series(True, index=df.index)
  for code in langs:
    # A row survives only if it is valid for every requested column.
    keep &= df[code] != '--'
  return df.loc[keep].copy()

In [None]:
def distances(feats, langs, sheet1, sheet2, val_per_lang):
  """Compute cosine distances to English per feature set and upload them.

  For every feature set in `feats`, the matching vector table is looked up
  in the notebook-level `dfs` (by its position in `all_features`), rows
  marked '--' are dropped with filter_df, and the cosine distance between
  the "eng" column and each language column is computed.

  Args:
    feats: feature-set names; each must appear in `all_features`.
    langs: language codes (columns of the vector tables) to compare to "eng".
    sheet1: index of the worksheet that receives the distance table.
    sheet2: index of the worksheet that receives the number of rows used.
    val_per_lang: if True, rows are filtered separately for each (eng, lang)
      pair; if False, only rows valid for "eng" and every language in
      `langs` are used for all pairs.

  Returns:
    DataFrame with a "feature" column and one column per language holding
    the distance rounded to 4 decimals, or '--' where no distance was
    computed (no rows left, or one of the two vectors has no non-zero entry).
  """
  cols = ["feature"] + langs
  dist_rows, num_rows = [], []
  for feat in feats:
    df = dfs[all_features.index(feat)]
    # The shared filter is only needed when all languages use the same rows.
    shared_df = None if val_per_lang else filter_df(df, ["eng"] + langs)
    row = {"feature": feat}
    num_row = {"feature": feat}
    for lang in langs:
      val_df = filter_df(df, ["eng", lang]) if val_per_lang else shared_df
      num_row[lang] = len(val_df)
      # Skip pairs with no usable rows or with a vector that is all zeros.
      if len(val_df) == 0 or not np.any(val_df["eng"]) or not np.any(val_df[lang]):
        row[lang] = '--'
        continue
      val = scipy.spatial.distance.cosine(val_df["eng"], val_df[lang])
      row[lang] = np.round(val, decimals=4)
    dist_rows.append(row)
    num_rows.append(num_row)
  # Build each table once instead of growing it row by row.
  dist_df = pd.DataFrame(dist_rows, columns=cols)
  num_df = pd.DataFrame(num_rows, columns=cols)
  # Open the spreadsheet once and reuse it for both worksheets.
  spreadsheet = gc.open('l2v self-calculated distances')
  for sheet, frame in ((sheet1, dist_df), (sheet2, num_df)):
    spreadsheet.get_worksheet(sheet).update(
        [frame.columns.values.tolist()] + frame.values.tolist())
  return dist_df

In [None]:
# Distances with per-pair filtering (val_per_lang=True): the distance table
# goes to worksheet N and the row counts to worksheet N + 1.
langs = ['kan', 'guj', 'hin', 'sin', 'tam']
dist_df = distances(all_features, langs, N, N + 1, True)

In [None]:
# Distances using only rows valid for eng and every language
# (val_per_lang=False): the distance table goes to worksheet N + 2 and the
# row counts to worksheet N + 3. This rebinds dist_df.
langs = ['kan', 'guj', 'hin', 'sin', 'tam']
dist_df = distances(all_features, langs, N + 2, N + 3, False)

In [None]:
# Spot check on dfs[5] (position 5 in all_features is 'inventory_knn'):
# angle in radians between the eng and kan vectors.
eng = dfs[5]["eng"]
kan = dfs[5]["kan"]
np.arccos(np.dot(eng / np.linalg.norm(eng), kan / np.linalg.norm(kan)))

0.744244995555758

In [None]:
# Fetch the '|'-joined combination of the inventory feature sets for
# eng + langs and write it to worksheet N + 7.
# NOTE(review): N + 7 is the same worksheet index that the multiple_features
# call in this notebook writes to — confirm the overwrite is intended.
dfff = feature_vecs(["|".join(['inventory_ethnologue', 'inventory_phoible_aa',
                              'inventory_phoible_gm', 'inventory_phoible_saphon',
                              'inventory_phoible_spa', 'inventory_phoible_ph',
                              'inventory_phoible_ra', 'inventory_phoible_upsid'])],
                    ["eng"] + langs, N + 7)

In [None]:
# Display the feature-set names that lang2vec exposes as FEATURE_SETS.
l2v.FEATURE_SETS

['syntax_wals',
 'phonology_wals',
 'syntax_sswl',
 'syntax_ethnologue',
 'phonology_ethnologue',
 'inventory_ethnologue',
 'inventory_phoible_aa',
 'inventory_phoible_gm',
 'inventory_phoible_saphon',
 'inventory_phoible_spa',
 'inventory_phoible_ph',
 'inventory_phoible_ra',
 'inventory_phoible_upsid',
 'syntax_knn',
 'phonology_knn',
 'inventory_knn',
 'syntax_average',
 'phonology_average',
 'inventory_average',
 'fam',
 'id',
 'geo',
 'learned']

# Pre-calculated Distances

In [None]:
def pre_calc_distances(feats, langs, sheet):
  """Upload lang2vec's own distances between "eng" and `langs`.

  Args:
    feats: not used by this function; kept so existing calls keep working.
    langs: language codes; 'eng' is prepended before calling l2v.distance.
    sheet: index of the worksheet in 'l2v self-calculated distances' to write.

  Returns:
    DataFrame with a "distance" column (the distance type) and one column
    per language in `langs`.
  """
  dists = ['geographic', 'genetic', 'syntactic', 'phonological', 'inventory', 'featural']
  cols = ["distance"] + langs
  data = np.array(l2v.distance(dists, ['eng'] + langs))
  dist_df = pd.DataFrame(columns=cols)
  for idx, dist_name in enumerate(dists):
    # Row 0 without its first entry — presumably eng vs. each language, since
    # 'eng' is passed first; confirm against the lang2vec documentation.
    eng_row = data[idx, 0, 1:]
    dist_df.loc[len(dist_df.index)] = dict(zip(cols, [dist_name, *eng_row]))
  target = gc.open('l2v self-calculated distances').get_worksheet(sheet)
  header_row = dist_df.columns.values.tolist()
  target.update([header_row] + dist_df.values.tolist())
  return dist_df

In [None]:
# Write lang2vec's pre-calculated distances to worksheet N + 5. The first
# argument is not used by pre_calc_distances. This rebinds dist_df.
langs = ['kan', 'guj', 'hin', 'sin', 'tam']
dist_df = pre_calc_distances(all_features, langs, N + 5)

# Big Data Files

In [None]:
# Base names of the distance CSV files (read from data/distances/ below) and
# the languages to keep from them.
files = ["FEATURAL", "GENETIC", "GEOGRAPHIC", "INVENTORY", "PHONOLOGICAL", "SYNTACTIC"]
langs = ['kan', 'guj', 'hin', 'sin', 'tam']

In [None]:
# Cut each distance CSV down to the eng + langs rows and columns, and save
# the result under data/truncated/ with the same file name.
keep_codes = ["eng"] + langs
for name in files:
  full = pd.read_csv(f"data/distances/{name}.csv", usecols=["G_CODE"] + keep_codes)
  subset = full.loc[full["G_CODE"].isin(keep_codes)]
  subset.to_csv(f"data/truncated/{name}.csv")

# Main Sheet

In [None]:
def distance_with_eng_main(feats, langs, sheet):
  """Upload eng-vs-language cosine distances to the 'Experiment 1 Data' sheet.

  Unlike `distances`, the vector tables from `dfs` are used as they are:
  no rows are filtered out before the cosine distance is computed.

  Args:
    feats: feature-set names; each must appear in `all_features`.
    langs: language codes; each gets a "<lang>-eng" column.
    sheet: index of the worksheet in 'Experiment 1 Data' to write.

  Returns:
    DataFrame with a "feature" column and one "<lang>-eng" column per
    language, holding distances rounded to 4 decimals.
  """
  cols = ["feature"] + [lang + "-eng" for lang in langs]
  dist_df = pd.DataFrame(columns=cols)
  for feat in feats:
    vectors = dfs[all_features.index(feat)]
    row = {"feature": feat}
    row.update({
        lang + "-eng": np.round(
            scipy.spatial.distance.cosine(vectors["eng"], vectors[lang]),
            decimals=4)
        for lang in langs
    })
    dist_df.loc[len(dist_df.index)] = row
  target = gc.open('Experiment 1 Data').get_worksheet(sheet)
  header_row = dist_df.columns.values.tolist()
  target.update([header_row] + dist_df.values.tolist())
  return dist_df

In [None]:
# Write eng-vs-language cosine distances for the selected feature sets to
# worksheet 3 of 'Experiment 1 Data'.
langs = ['kan', 'guj', 'hin', 'sin', 'tam']
select_features = ['syntax_knn', 'phonology_knn', 'inventory_knn', 'fam', 'geo',
                  "+".join(['syntax_knn', 'phonology_knn', 'inventory_knn'])]
dist_df = distance_with_eng_main(select_features, langs, 3)
# NOTE: this is no longer how it is done — the geo & gen (presumably genetic)
# values are now taken from the pre-calculated distances.