In [48]:
# imports
import pandas as pd
import numpy as np
from numpy import linalg as LA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
from glob import glob
import os

# Put together list of identities

Note: the paper focuses only on the identities from our survey (listed below). We also include identities drawn from several other papers, in case others want to explore results for them.

Also, we use "policeman" for "police_officer".  

In [49]:
# Identities from the survey. The survey's "police_officer" is represented
# by the single token "policeman" in the resulting set.
survey_identities = [
    'adult', 'american', 'arab', 'asian', 'banker', 'black', 'boss',
    'boy', 'brother', 'bully', 'child', 'coach', 'conservative',
    'consultant', 'cop', 'criminal', 'daughter', 'democrat', 'dentist',
    'doctor', 'executive', 'father', 'girl', 'goon', 'guy', 'hispanic',
    'idiot', 'intern', 'jerk', 'judge', 'kid', 'lady', 'lawyer',
    'liberal', 'man', 'minor', 'mother', 'nurse', 'patient',
    'physician', 'police_officer', 'politician', 'principal', 'punk',
    'republican', 'scientist', 'secretary', 'senator', 'sister', 'son',
    'surgeon', 'teenager', 'thug', 'toddler', 'white', 'witness',
    'woman',
]
identities = {
    'policeman' if term == 'police_officer' else term
    for term in survey_identities
}

In [50]:
# Read one identity per line (surrounding whitespace stripped) and merge them
# into the identity set. The context manager closes the file handle, which the
# bare open() call left open.
with open("./uga_identities.txt") as uga_file:
    uga_identities = [line.strip() for line in uga_file]
identities |= set(uga_identities)

In [51]:
# Read one identity per line (surrounding whitespace stripped) and merge them
# into the identity set. The list gets its own name instead of overwriting
# `uga_identities`, and the context manager closes the file handle.
with open("./personality_identities.txt") as personality_file:
    personality_identities = [line.strip() for line in personality_file]
identities |= set(personality_identities)

In [52]:
# Load the MTurk stereotype data; its `occupation` column is merged into `identities` in the next cell
garg_dat = pd.read_csv("mturk_stereotypes.csv")

In [53]:
# Add every value of the `occupation` column to the identity set
# NOTE(review): a missing (NaN) occupation would be added as-is — confirm the column has no gaps
identities |= set(garg_dat.occupation)

In [54]:
len(identities)

1371

# Generate dimension & word position measurement combinations



In [55]:
# Each measure describes one semantic dimension through opposing word lists:
#   "names"     - one label per entry of "sets", in the same order
#   "group"     - the dimension being measured
#   "sets"      - one word list per name; the pairwise metrics pair the lists
#                 position by position, so the i-th words must be opposites
#   "paper"     - label of the word-list variant / source
#   "is_paired" - flag marking the lists as position-wise pairs
measures = [{
"names" : ['bad','good'],
"group": "evaluation",
"sets": [
['bad', 'awful'],
['good', 'nice']
],
"paper": "this_short",
"is_paired" : True
},

{
"group": "evaluation",
"names" : ['bad','good'],
"sets": [
['bad', 'awful', 'negative', 'terrible', 'worse', 'horrible'],
['good', 'nice', 'positive', 'great', 'better', 'awesome']
],
"paper": "this_long",
"is_paired" : True
},

{
"group": "potency",
"names" : ['weak','strong'],
"sets": [
['powerless', 'little'],
['powerful', 'big']
],
"paper": "this_short",
"is_paired" : True
},

{
"group": "potency",
"names" : ['weak','strong'],
"sets": [
# 'feeble' belongs to the weak pole and 'dominant' to the strong pole; they
# were previously swapped, which made this pair point the opposite way.
['powerless', 'little','weak','impotent','feeble'],
['powerful', 'big','strong','potent','dominant']
],
"paper": "this_long",
"is_paired" : True
},

{
"group": "activity",
"names" : ['inactive','active'],
"sets": [
['slow', 'quiet', 'inactive'],
['fast', 'noisy', 'active']
],
"paper": "this_short",
"is_paired" : True
},

{
"group": "activity",
"names" : ['inactive','active'],
"sets": [
['slow', 'quiet', 'inactive', 'dead',  'die', 'stopped'],
['fast', 'noisy', 'active', 'alive', 'live', 'moving']
],
"paper": "this_long",
"is_paired" : True
},

{
"group": "age",
"names" : ['young','old'],
"sets": [
['young'],
['old']
],
"paper": "this_short",
"is_paired" : True
},

{
"group": "age",
"names" : ['young','old'],
"sets": [
['young', 'new', 'youthful', 'young'],
['old', 'old', 'elderly', 'aged']
],
"paper": "this_long",
"is_paired" : True
},

{
"group": "gender",
"names" : ['woman','man'],
"sets": [
['female'],
['male']
],
"paper": "this_short",
"is_paired" : True
},

{
"group": "gender",
"names" : ['woman','man'],
"sets": [
['she'],
['he']
],
"paper": "gonen",
"is_paired" : True
},

{
"group": "gender",
"names" : ['woman','man'],
"sets": [
['woman', 'girl', 'she', 'mother', 'daughter', 'gal', 'female', 'her', 'herself', 'Mary'],
['man', 'boy', 'he', 'father', 'son', 'guy', 'male', 'his', 'himself', 'John']
],
"paper": "bolukbasi_words",
"is_paired" : True
},

{
"group": "gender",
"names" : ['woman','man'],
"sets": [
['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna'],
['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
],
"paper": "bolukbasi_names",
"is_paired" : True
},


{
"group": "gender",
"names" : ['woman','man'],
"sets": [
['woman', 'women', 'she', 'her', 'her', 'hers', 'girl', 'girls', 'female', 'feminine'],
['man', 'men', 'he', 'him', 'his', 'his', 'boy', 'boys', 'male', 'masculine']
],
"paper": "kozlowski",
"is_paired" : True
}
]


In [56]:
institutions = ['family','politics','justice','medicine','business','education','religion']
race_ethnicities = ['white','latino','asian','middle eastern','black'] 

measures += [
{ "sets" : [[x] for x in institutions], 
  "names":institutions, 
  "group" :"instituiton",
  "paper" : "this_short",
"is_paired" : False},

{ "sets" : [
    ['family','kinsperson', 'household', 'kinfolk', 'home', 'kinsfolk', 'kin'],
    ['politics', 'government', 'political'],
    ['justice', 'law', 'legal', 'illegal'],
    ['medicine', 'medical', 'prescription', 'medication'],
    ['business', 'job', 'profession', 'career', 'employment', 'occupation', 'clientele'],
    ['education', 'school', 'university', 'instruction', 'teaching'],
    ['religion', 'faith', 'worship']], 
  "names":institutions, 
  "group" :"instituiton",
  "paper" : "this_long",
"is_paired" : False},

{ "sets" : [['white','White'],
             ['hispanic','Hispanic','latino','Latino'],
             ['asian','Asian'],
             ['arab','Arab'],
             ['black','Black']],
  "names":race_ethnicities, 
  "group" :"race_ethnicity",
  "paper" : "this_short",
"is_paired" : False},
    
{ "sets" : [ ['white',    'whites',    'White',   'Whites',   'European', 'Caucasian','Anglo'],
             ['hispanic', 'hispanics', 'Hispanic','Hispanics','Mexican',  'Latino',   'Hispanic'],
             ['asian',    'asians',    'Asian',   'Asians',   'Chinese',  'Japanese', 'Korean'],
             ['arab',     'arabs',     'Arab',    'Arabs',    'Muslim',   'Muslim',   'Muslim'],
             ['black',    'blacks',    'Black',   'Blacks',   'African',  'African',  'Afro']
            ],
  "names":race_ethnicities, 
 "group" :"race_ethnicity",
  "paper" : "koslowski",
"is_paired" : False},   
    
{ "sets" : [ ['white',    'whites',   'european', 'caucasian','anglo'],
             ['hispanic', 'hispanics','mexican',  'latino',   'hispanic'],
             ['asian',    'asians',   'chinese',  'japanese', 'korean'],
             ['arab',     'arabs',    'muslims',   'muslim',   'arab'],
             ['black',    'blacks',   'african',  'african',  'afro']
            ],
  "names":race_ethnicities, 
 "group" :"race_ethnicity",
  "paper" : "koslowski_lowercase",
"is_paired" : False},   

{ "sets" : [ ['white',    'whites',    'White',   'Whites',   'European', 'Caucasian','Anglo'],
             ['hispanic', 'hispanics', 'Hispanic','Hispanics','Mexican',  'latino','latinos','latina','latinas','Latino','Latinos','Latina','Latinas'],
             ['asian',    'asians',    'Asian',   'Asians',   'Chinese',  'Japanese','Korean'],
             ['arab',     'arabs',     'Arab',    'Arabs',    'Muslim',   'Muslims','Muslim','Muslims'],
             ['black',    'blacks',    'Black',   'Blacks',   'African',  'African',]
            ],
  "names":race_ethnicities, 
 "group" :"race_ethnicity",
  "paper" : "this_long",
"is_paired" : False}   
]



# From Personality Paper

In [57]:
# Personality trait measures (one per trait). The two word lists of each
# measure are paired position by position, so the i-th words must be opposites.
measures += [{
"group": "openness",
"names" : ['open','closed'],
"sets": [
["intelligent", "perceptive", "analytical", "reflective", "curious", "imaginative", "creative", "cultured", "refined", "sophisticated"],
["unintelligent", "imperceptive", "unanalytical", "unreflective", "uninquisitive", "unimaginative", "uncreative", "uncultured", "unrefined", "unsophisticated"]
],
"paper": "agarwal",
"is_paired" : True
},

{
"group": "conscientiousness",
"names" : ['conscientious','unconscientious'],
"sets": [
["organized", "responsible", "reliable", "conscientious", "practical", "thorough", "hardworking", "thrifty", "cautious", "serious"],
["disorganized", "irresponsible", "undependable", "negligent", "impractical", "careless", "lazy", "extravagant", "rash", "frivolous"],
],
"paper": "agarwal",
"is_paired" : True
},

{
"group": "extroversion",
"names" : ['extroverted','introverted'],
"sets": [
["extroverted", "energetic", "talkative", "enthusiastic", "bold", "active", "spontaneous", "assertive", "adventurous", "sociable"],
["introverted", "unenergetic", "silent", "unenthusiastic", "timid", "inactive", "inhibited", "unassertive", "unadventurous", "unsociable"],
],
"paper": "agarwal",
"is_paired" : True
},

{
"group": "agreeableness",
"names" : ['agreeable','unagreeable'],
"sets": [
["warm", "kind", "cooperative", "unselfish", "polite", "agreeable", "trustful", "generous", "flexible", "fair"],
["cold", "unkind", "uncooperative", "selfish", "rude", "disagreeable", "distrustful", "stingy", "inflexible", "unfair"],
],
"paper": "agarwal",
"is_paired" : True
},

{
"group": "neuroticism",
"names" : ['neurotic','calm'],
"sets": [
["angry", "tense", "nervous", "envious", "unstable", "discontented", "insecure", "emotional", "guilt-ridden", "moody"],
# "contented" (the opposite of "discontented") replaces the typo "contended".
# NOTE(review): "at ease" and "not envious" are multi-word entries; a pair is
# only used when both of its words are keys of the embedding vocabulary.
["calm", "relaxed", "at ease", "not envious", "stable", "contented", "secure", "unemotional", "guilt-free", "steady"],
],
"paper": "agarwal",
"is_paired" : True
}]

# From Kozlowski Paper

In [58]:

measures += [
{
"group": "cultivation",
"names" : ['uncultivated','cultivated'],
"sets": [
['uncultivated', 'uncultured', 'uncivilized', 'discourteous', 'improper', 'rude', 'uncordial', 'informal', 'uncourtly', 'boorish', 'unpolished', 'unrefined', 'incivility', 'uncivil', 'boorishness', 'rudeness', 'loutish', 'unmannerly', 'gruff', 'ungracious', 'unobliging', 'uncultured', 'ungenteel', 'unmannered', 'blunt'],
['cultivated', 'cultured', 'civilized', 'courteous', 'proper', 'polite', 'cordial', 'formal', 'courtly', 'urbane', 'polished', 'refined', 'civility', 'civil', 'urbanity', 'politesse', 'edified', 'mannerly', 'polished', 'gracious', 'obliging', 'cultured', 'genteel', 'mannered', 'polite']
],
"paper": "kozlowski",
"is_paired" : True
},

{
"group": "employment",
"names" : ['employee','employer'],
"sets": [
['employee', 'employees', 'worker', 'worker', 'laborer', 'laborers', 'employee', 'employees', 'proletarian', 'proletariat', 'staff', 'staff', 'employee', 'employees', 'worker', 'workers', 'laborer', 'laborers', 'staff', 'staff'],
['employer', 'employers', 'owner', 'owners', 'industrialist', 'industrialists', 'proprietor', 'proprietors', 'capitalist', 'capitalists', 'manager', 'managers', 'director', 'directors', 'boss', 'bosses', 'foreman', 'foremen', 'supervisor', 'superintendent']
],
"paper": "kozlowski",
"is_paired" : True
},

{
"group": "education",
"names" : ['uneducated','educated'],
"sets": [
['uneducated', 'unlearned', 'ignorant', 'untrained', 'untaught', 'illiterate', 'unschooled', 'untutored', 'unlettered'],
['educated', 'learned', 'knowledgeable', 'trained', 'taught', 'literate', 'schooled', 'tutored', 'lettered']
],
"paper": "kozlowski",
"is_paired" : True
},

{
"group": "status",
"names" : ['low_status','high_status'],
"sets": [
['unprestigious', 'dishonorable', 'lowly', 'uninfluential', 'disreputable', 'commonplace', 'mundane', 'humble', 'prosaic', 'modest', 'commoner', 'unpretentious', 'ordinary', 'lowly', 'common'],
['prestigious', 'honorable', 'esteemed', 'influential', 'reputable', 'distinguished', 'eminent', 'illustrious', 'renowned', 'acclaimed', 'dignitary', 'venerable', 'exalted', 'estimable', 'prominent']
],
"paper": "kozlowski",
"is_paired" : True
},

{
"group": "morality",
"names" : ['immoral','moral'],
"sets": [
['evil', 'immoral', 'bad', 'dishonest', 'sinful', 'vice', 'wicked', 'transgressive', 'unprincipled', 'questionable', 'nefarious', 'corrupt', 'unscrupulous', 'selfish', 'knavish', 'crooked', 'reprehensible', 'impure', 'undignified', 'unholy', 'fiendish', 'villainous', 'guilty', 'indecent', 'unsavory', 'odious', 'unethical'],
['good', 'moral', 'good', 'honest', 'virtuous', 'virtue', 'righteous', 'chaste', 'principled', 'unquestionable', 'noble', 'uncorrupt', 'scrupulous', 'altruistic', 'chivalrous', 'honest', 'commendable', 'pure', 'dignified', 'holy', 'valiant', 'upstanding', 'guiltless', 'decent', 'chaste', 'righteous', 'ethical']
],
"paper": "kozlowski",
"is_paired" : True
},
{
"group": "affluence",
"names" : ['poor','rich'],
"sets": [
['poor', 'poorer', 'poorest', 'poverty', 'destitute', 'needy', 'impoverished', 'economical', 'impecunious', 'inexpensive', 'ruined', 'necessitous', 'skint', 'cheap', 'economical', 'penurious', 'threadbare', 'cheap', 'unmonied', 'indigent', 'threadbare', 'penurious', 'cheap', 'worthless', 'underprivileged', 'bankrupt', 'unprosperous', 'underdeveloped', 'insolvency', 'unsuccessful', 'plain', 'basic', 'disadvantaged', 'squalid', 'valueless', 'beggarly', 'ramshackle', 'indigence', 'insolvent', 'moneyless', 'penniless', 'penury', 'plain', 'indigence'],
['rich', 'richer', 'richest', 'affluence', 'affluent', 'advantaged', 'wealthy', 'costly', 'exorbitant', 'expensive', 'exquisite', 'extravagant', 'flush', 'invaluable', 'lavish', 'luxuriant', 'luxurious', 'luxury', 'moneyed', 'opulent', 'plush', 'luxuriant', 'precious', 'priceless', 'privileged', 'propertied', 'prosperous', 'developed', 'solvency', 'successful', 'sumptuous', 'swanky', 'thriving', 'upscale', 'valuable', 'classy', 'ritzy', 'opulence', 'solvent', 'moneyed', 'rich', 'affluence', 'posh', 'opulence']
],
"paper": "kozlowski",
"is_paired" : True
}
]

# From Garg Paper

In [59]:
# Gender measure labelled "garg": 20 female words paired position-wise with 20 male words.
# NOTE(review): "femen" looks like a typo (the male list has "males" at the same
# position) — confirm against the source word list before changing it.
measures.append({
"group": "gender",
"names" : ['woman','man'],
"sets": [
["she", "daughter", "hers", "her", "mother", "woman", "girl", "herself", "female", "sister", "daughters", "mothers", "women",
"girls", "femen", "sisters", "aunt", "aunts", "niece", "nieces"],
["he", "son", "his", "him", "father", "man", "boy", "himself", "male", "brother", "sons", "fathers", "men", "boys", "males", "brothers", "uncle",
"uncles", "nephew", "nephews"]],
"paper": "garg",
"is_paired" : True
})




In [60]:
# Tag every measure with its position in `measures`; results are keyed by this index
for position in range(len(measures)):
    measures[position]['ind'] = position

In [61]:
# Every measure must supply exactly one name per word set
for measure_spec in measures:
    name_count = len(measure_spec['names'])
    set_count = len(measure_spec['sets'])
    assert name_count == set_count

In [62]:
def shared_vocabulary(vocab_paths):
    """Return the set of words that appear in every vocabulary file.

    Each file is read as one word per line, with surrounding whitespace
    stripped. Returns an empty set when ``vocab_paths`` is empty.
    """
    shared = None
    for path in vocab_paths:
        with open(path) as vocab_file:
            words = {line.strip() for line in vocab_file}
        # Compare against None rather than truthiness: once the intersection is
        # empty it must stay empty, not be reset to the next file's vocabulary.
        shared = words if shared is None else shared & words
    return set() if shared is None else shared


# NOTE(review): this reads "data/embeddings/*.vocab" while the run cell globs
# "embeddings/*.npy" — confirm which directory is intended.
all_emb_vocab = shared_vocabulary(glob("data/embeddings/*.vocab"))
print(len(all_emb_vocab))

57928


In [63]:
# Identities missing from the vocabulary shared by all embeddings
identities_not_in_vocab = []
for identity in identities:
    if identity not in all_emb_vocab:
        identities_not_in_vocab.append(identity)
print("{} identities not in vocab".format(len(identities_not_in_vocab)))
identities_not_in_vocab[:5]

488 identities not in vocab


['', 'Dutch', 'speech language pathologist', 'malingerer', 'North Korean']

In [64]:
# every word set of every measure needs at least one word in the shared vocab
for measure_spec in measures:
    for word_set in measure_spec['sets']:
        assert any(word in all_emb_vocab for word in word_set)

# Run

In [65]:
def normalize(wv):
    """Scale every row of ``wv`` to unit Euclidean length.

    ``wv`` is indexed as a 2-D array with one vector per row. The norms are
    computed in a single vectorised call rather than one Python-level call per
    row. An all-zero row is divided by zero and comes back as NaN.
    """
    row_norms = LA.norm(wv, axis=1, keepdims=True)
    return wv / row_norms

def load_embeddings_from_np(filename):
    """Load an embedding stored as ``<filename>.vocab`` and ``<filename>.wv.npy``.

    Returns ``(vocab, wv, w2i)``: the list of words (one per line of the vocab
    file, whitespace-stripped), the array loaded from the .npy file, and a
    word -> line-index mapping.
    """
    vocab_path = filename + '.vocab'
    vectors_path = filename + '.wv.npy'

    with open(vocab_path, 'r') as vocab_file:
        vocab = [entry.strip() for entry in vocab_file]

    # a repeated word keeps the index of its last occurrence
    w2i = dict(zip(vocab, range(len(vocab))))
    wv = np.load(vectors_path)
    return vocab, wv, w2i


def load_wo_normalize(space, filename, vocab, wv, w2i):
    """Load the embedding at ``filename`` and store it under key ``space``.

    The three dicts ``vocab``, ``wv`` and ``w2i`` are updated in place with the
    word list, the vector array and the word -> index map; nothing is returned.
    """
    vocab[space], wv[space], w2i[space] = load_embeddings_from_np(filename)
    
def my_cosine(x, y):
    """Cosine similarity: dot product of ``x`` and ``y`` over the product of their norms."""
    scale = LA.norm(x) * LA.norm(y)
    return np.dot(x, y.T) / scale

def ripa(w, b):
    """Scalar projection of ``w`` onto ``b``: ``w . b`` divided by the norm of ``b``."""
    direction_length = LA.norm(b)
    return w.dot(b) / direction_length

def garg(w, b):
    """Negated Euclidean distance between ``w`` and ``b`` (closer vectors score higher)."""
    distance = LA.norm(w - b)
    return -distance

def kozlowski_b(setv,  wv, w2i):
    """Direction vector: the mean over ``(x, y)`` pairs of ``vector(y) - vector(x)``.

    ``setv`` is a sequence of word pairs; ``wv`` holds one vector per row and
    ``w2i`` maps a word to its row.
    """
    differences = []
    for first, second in setv:
        differences.append(wv[w2i[second]] - wv[w2i[first]])
    return np.mean(differences, axis=0)
    
def bolukbasi_b(setv,  wv, w2i):
    """Direction vector from word pairs via PCA on pair-centred vectors.

    With exactly one pair ``(x, y)`` the result is ``vector(x) - vector(y)``
    (note: the opposite orientation to ``kozlowski_b``, which uses ``y - x``).
    Otherwise each pair's two vectors are centred on their midpoint and the
    first principal component of all centred vectors is returned.
    """
    if len(setv) == 1:
        # a single pair already defines the direction; PCA is not needed
        first, second = setv[0][0], setv[0][1]
        return wv[w2i[first]] - wv[w2i[second]]

    centred_rows = []
    for first, second in setv:
        first_vec, second_vec = wv[w2i[first]], wv[w2i[second]]
        midpoint = (first_vec + second_vec)/2
        centred_rows.extend((first_vec - midpoint, second_vec - midpoint))
    centred = np.array(centred_rows)

    # at most 10 components, and never more than there are rows
    component_count = min(centred.shape[0], 10)
    fitted = PCA(n_components = component_count).fit(centred)
    return fitted.components_[0]

def ripa_kozlowski_fun(setv, identities, norm_wv, wv, w2i):
    """Yield ``(identity, score)``: projection of each unnormalized identity
    vector onto the ``kozlowski_b`` direction built from unnormalized vectors.
    ``norm_wv`` is accepted for a uniform signature but not used.
    """
    direction = kozlowski_b(setv, wv, w2i)
    for identity in identities:
        identity_vec = wv[w2i[identity]]
        yield  identity, ripa(identity_vec, direction)

def ripa_bolukbasi_fun(setv, identities, norm_wv, wv, w2i):
    """Yield ``(identity, score)``: projection of each unnormalized identity
    vector onto the ``bolukbasi_b`` direction built from unnormalized vectors.
    ``norm_wv`` is accepted for a uniform signature but not used.
    """
    direction = bolukbasi_b(setv, wv, w2i)
    for identity in identities:
        identity_vec = wv[w2i[identity]]
        yield identity, ripa(identity_vec, direction)

def kozlowski_fun(setv, identities, norm_wv, wv, w2i):
    """Yield ``(identity, score)``: cosine between each normalized identity
    vector and the ``kozlowski_b`` direction built from normalized vectors.
    ``wv`` is accepted for a uniform signature but not used.
    """
    direction = kozlowski_b(setv, norm_wv, w2i)
    for identity in identities:
        identity_vec = norm_wv[w2i[identity]]
        yield identity, my_cosine(identity_vec, direction)

def bolukbasi_fun(setv, identities, norm_wv, wv, w2i):
    """Yield ``(identity, score)``: cosine between each normalized identity
    vector and the ``bolukbasi_b`` direction built from normalized vectors.
    ``wv`` is accepted for a uniform signature but not used.
    """
    direction = bolukbasi_b(setv, norm_wv, w2i)
    for identity in identities:
        identity_vec = norm_wv[w2i[identity]]
        yield identity, my_cosine(identity_vec, direction)
        
def caliksan_fun(setv, identities, norm_wv, wv, w2i):
    """Yield ``(identity, effect)`` for every identity.

    ``effect`` is the mean cosine to the first words of the pairs minus the
    mean cosine to the second words, divided by the standard deviation
    (``np.std`` default) of all those cosines. ``wv`` is not used.
    """
    for identity in identities:
        identity_vec = norm_wv[w2i[identity]]
        first_sims = [my_cosine(identity_vec, norm_wv[w2i[first]]) for first, _ in setv]
        second_sims = [my_cosine(identity_vec, norm_wv[w2i[second]]) for _, second in setv]
        # list concatenation: the spread is taken over both groups of cosines
        spread = np.std(first_sims + second_sims)
        yield identity, (np.mean(first_sims) - np.mean(second_sims))/spread
                            

def group_fun(names, sets, identities, wv, w2i, measure):
    """Score every identity against every group relative to the other groups.

    Parameters:
        names: one label per entry of ``sets``.
        sets: word lists; words missing from ``w2i`` are ignored.
        identities: words to score; each must be a key of ``w2i``.
        wv: array with one vector per row.
        w2i: word -> row index of ``wv``.
        measure: callable ``measure(group_mean_vector, identity_vector)``.

    Yields ``(group_name, measure.__name__, identity, value)`` where ``value``
    is the measure for that group minus the average measure over all other
    groups.

    Raises ZeroDivisionError when only one group name is given (there are no
    other groups to average over).
    """
    # mean vector of the in-vocabulary words of each group
    group_mean_vecs = {}
    for i, setv in enumerate(sets):
        members = [wv[w2i[x]] for x in setv if x in w2i]
        group_mean_vecs[names[i]] = np.mean(members, axis=0)

    other_count = float(len(names) - 1)
    for identity in identities:
        identity_vec = wv[w2i[identity]]
        # Evaluate the measure once per group and reuse it; recomputing it for
        # every "other group" sum made this quadratic in the number of groups.
        scores = {name: measure(group_mean_vecs[name], identity_vec) for name in names}
        for group_name in names:
            other_group_dist = sum(scores[other] for other in names if other != group_name)
            other_group_dist /= other_count
            yield (group_name, measure.__name__, identity, scores[group_name] - other_group_dist)


def garg_fun(names, sets, identities, norm_wv, wv, w2i):
    """Run ``group_fun`` with the ``garg`` measure on the normalized vectors (``wv`` is unused)."""
    return group_fun(names, sets, identities, wv=norm_wv, w2i=w2i, measure=garg)

def ripa_fun(names, sets, identities, norm_wv, wv, w2i):
    """Run ``group_fun`` with the ``ripa`` measure on the unnormalized vectors (``norm_wv`` is unused)."""
    return group_fun(names, sets, identities, wv=wv, w2i=w2i, measure=ripa)


def swinger_fun(names, sets, identities, norm_wv, wv, w2i):
    """Yield ``(group_name, 'swinger', identity, value)`` for every group/identity.

    ``value`` is the dot product of the group's mean vector, centred on the
    mean of all group means, with the identity's vector, centred on the mean
    of all identity vectors. Normalized vectors are used; ``wv`` is not.
    """
    identity_centre = np.mean([norm_wv[w2i[word]] for word in identities], axis=0)

    # mean vector of the in-vocabulary words of each group
    group_mean_vecs = {
        names[idx]: np.mean([norm_wv[w2i[word]] for word in members if word in w2i], axis=0)
        for idx, members in enumerate(sets)
    }
    group_centre = np.mean(list(group_mean_vecs.values()), axis=0)

    for group_name, group_vec in group_mean_vecs.items():
        group_offset = group_vec - group_centre
        for identity in identities:
            identity_offset = norm_wv[w2i[identity]] - identity_centre
            yield group_name, 'swinger', identity, np.dot(group_offset, identity_offset)
   

In [71]:
def _in_vocab_pairs(first, second, w2i):
    """Pair two word lists position by position, keeping only pairs whose two words are both in ``w2i``."""
    return [(a, b) for a, b in zip(first, second) if a in w2i and b in w2i]


def run_measures(f, measures, identities):
    """Compute every measure for one embedding and write the results to CSV.

    Parameters:
        f: path of the embedding matrix; its last 7 characters (".wv.npy") are
           stripped to get the stem from which the vocab and vectors are loaded.
        measures: list of measure dicts with keys 'ind', 'names' and 'sets'
           ('sets' needs at least two word lists). 'is_paired' is not consulted:
           every measure goes through both the pairwise and multiclass metrics.
        identities: iterable of identity words; those missing from this
           embedding's vocabulary are dropped.

    Writes "measurement_results/<embedding>.csv" with columns
    embedding, ind, name, fun, identity, value. Returns None.
    """
    suffix_len = len('.wv.npy')
    embedding = os.path.basename(f)[:-suffix_len]
    print(embedding)
    stem = f[:-suffix_len]

    # unnormalized vectors are kept for the ripa-based methods
    _, emb_wv, emb_w2i = load_embeddings_from_np(stem)
    if 'gn_glove' in stem:
        # the last column of gn_glove embeddings is dropped
        emb_wv = emb_wv[:, :-1]

    # normalized vectors for the other methods
    norm_emb_wv = normalize(emb_wv)

    # sorted so the output row order does not depend on set iteration order
    identities = sorted(i for i in identities if i in emb_w2i)

    pairwise_funs = [ripa_kozlowski_fun,
                     ripa_bolukbasi_fun,
                     kozlowski_fun,
                     bolukbasi_fun,
                     caliksan_fun]
    multiclass_funs = [garg_fun, ripa_fun, swinger_fun]

    res = []
    for measure in measures:
        measure_ind = measure['ind']
        print(measure_ind)

        tmp_sets = measure['sets']
        names = measure['names']

        # Paired data for the pairwise metrics. The first set is the default:
        # it is compared to the second set (its opposite, by the ordering of
        # the measure), and every later set is compared to the default.
        pair_sets = [_in_vocab_pairs(tmp_sets[1], tmp_sets[0], emb_w2i)]
        pair_sets += [_in_vocab_pairs(tmp_sets[0], s, emb_w2i) for s in tmp_sets[1:]]

        # in-vocabulary words per set, for the multiclass metrics
        sets = [[w for w in s if w in emb_w2i] for s in tmp_sets]

        ######### Pairwise metrics
        for i, pairset in enumerate(pair_sets):
            if not pairset:
                # No complete pair in this vocabulary: the direction would be
                # undefined and the PCA step would raise, so skip this name.
                print("skipping {} for measure {}: no in-vocabulary pairs".format(names[i], measure_ind))
                continue
            for fun in pairwise_funs:
                res += [(embedding, measure_ind, names[i], fun.__name__) + x for x in
                            fun(pairset, identities, norm_emb_wv, emb_wv, emb_w2i)]

        ##### Multiclass metrics
        for fun in multiclass_funs:
            res += [(embedding, measure_ind) + x for x in
                        fun(names, sets, identities, norm_emb_wv, emb_wv, emb_w2i)]

    # make sure the output directory exists before writing
    os.makedirs("measurement_results", exist_ok=True)
    pd.DataFrame(res, columns=['embedding','ind','name',  'fun','identity','value']).to_csv("measurement_results/"+embedding+".csv",index=False)

# Note ...

If you haven't yet pulled down and untarred the embeddings, the next two lines will do that for you!

Otherwise, you can skip them!



In [None]:
! wget 
! tar -xzvf embeddings.tgz

In [75]:
# Embedding matrices to process; run_measures strips a 7-character suffix and the
# loader appends ".wv.npy", so files are presumably named <name>.wv.npy
# NOTE(review): the vocab-intersection cell reads "data/embeddings/*.vocab" while
# this reads "embeddings/*.npy" — confirm which directory is intended
embedding_fils = glob('embeddings/*.npy')

# Note...

The next cell runs the measurements in parallel, one job per embedding file. Lower `n_jobs` (or set it to 1 to run serially) if that many parallel jobs doesn't suit your machine!

In [73]:
# One job per embedding file. Each job writes its own CSV inside run_measures,
# which has no return statement, so `res` is just a list of None.
res = Parallel(n_jobs=7)(delayed(run_measures)(f, measures,identities) for f in embedding_fils)

In [74]:
# One summary row per measure, so result rows (keyed by 'ind') can be joined
# back to the measure's group, source label and word-set sizes.
d = [
    {'ind' : spec['ind'],
     "name" : spec['group'],
     "len_sets" : len(spec['sets']),
     "paper" : spec['paper'],
     "len_first_set" : len(spec['sets'][0])}
    for spec in measures
]

pd.DataFrame(d).to_csv("measure_info.csv",index=False)