In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

from src.pipeline import run_ambiguity_pipeline
import pandas as pd

# Demo: compute ambiguity norms for a few hand-picked words and tabulate them.
words = ["bank", "bark", "peak"]
amb_results = run_ambiguity_pipeline(words, postprocess=True)


def _sense_summary(norms, coms_assigned):
    """Derive the summary columns for one word from its pipeline result.

    Parameters
    ----------
    norms : sequence of numbers, one per community, indexable by position.
    coms_assigned : mapping whose values are the communities. The values are
        paired with `norms` by position (assumes both come out of the
        pipeline in the same order -- TODO confirm against the pipeline).

    Returns
    -------
    dict with keys
        'N'            number of entries in `norms`,
        'Dominance'    largest norm divided by the sum of the norms,
        'Communities'  communities ordered by decreasing norm,
        'Norms'        norms as rounded percentages of their sum, same order.
    """
    total = sum(norms)  # computed once instead of once per community
    members = list(coms_assigned.values())
    # Rank on the raw norms, not on the rounded percentages. sorted() is
    # stable, so communities with equal norms keep their original order.
    order = sorted(range(len(norms)), key=lambda i: norms[i], reverse=True)
    return {
        'N': len(norms),
        'Dominance': max(norms) / total,
        'Communities': [members[i] for i in order],
        'Norms': [round(norms[i] / total * 100) for i in order],
    }


tmp = pd.DataFrame(amb_results).T  # transpose so that each word is a row

summary = pd.DataFrame(
    [_sense_summary(n, c) for n, c in zip(tmp['norms'], tmp['coms_assigned'])],
    index=tmp.index,
    columns=['N', 'Dominance', 'Communities', 'Norms'],
)
tmp = tmp.assign(
    N=summary['N'],
    Dominance=summary['Dominance'],
    Communities=summary['Communities'],
    Norms=summary['Norms'],
)

# The words are the index at this point; expose them as a regular column.
tmp = tmp.reset_index(names='word')
tmp = tmp[['word', 'ambiguity', 'N', 'entropy_coef', 'disimilarity_coef', 'Dominance', 'Communities', 'Norms']]
df = tmp.rename(columns={'ambiguity': 'Ambiguity', 'entropy_coef': 'Entropy', 'disimilarity_coef': 'Dissimilarity'})
df

# df.to_csv(YOUR_PATH) # uncomment to save ambiguity norms for a set of words of your choice

Note: to be able to use all crisp methods, you need to install some additional packages:  {'bayanpy', 'graph_tool'}
Note: to be able to use all crisp methods, you need to install some additional packages:  {'pyclustering', 'ASLPAw'}


loader - Data loaded successfully from /Users/iiglesias/Documents/LAMBq/data/SWOW-EN.R100.20180827.csv
generator - Loading graph from cache: cache/graph_65df70897d72076194e86023ec39051b5f97fd2a3a13e7822901d17b78f0cd79.pkl
node_embedding - Loading embeddings from cache: cache/embeddings_a5c93c14bb26fd5fc164030a5c3956c09ebbc520be5696e069ef307c43f7ea41_400.pkl
comm_detection - Processing word 1/3: bank
generator - Preprocessing graph for word: bank, k=2.
generator - Pruning edges with weight less than 0.02.
comm_detection - Processing word 2/3: bark
generator - Preprocessing graph for word: bark, k=2.
generator - Pruning edges with weight less than 0.02.
comm_detection - Processing word 3/3: peak
generator - Preprocessing graph for word: peak, k=2.
generator - Pruning edges with weight less than 0.02.


tree  is an outlier in:  ['tree', 'dog', 'bite', 'woof']


Unnamed: 0,word,Ambiguity,N,Entropy,Dissimilarity,Dominance,Communities,Norms
0,bank,0.495364,2,0.577004,0.85851,0.862745,"[[account, card, count, deposit, institution, ...","[86, 14]"
1,bark,0.369568,4,0.430055,0.859352,0.846774,"[[bite, dog, woof], [rough], [loud], [brown]]","[85, 6, 5, 5]"
2,peak,0.0,1,0.0,0.0,1.0,"[[apex, best, climb, height, high, mountain, p...",[100]


## Top ambiguous words

In [17]:
# English
# Join the saved ambiguity norms with the `LgSUBTLWF` column of the
# regression data; only words present in both files survive the merge.
norms_en = pd.read_csv('../outputs/ambiguity_norms_EN.csv', index_col=0)
freq_en = pd.read_csv('./data/regression_data_EN.csv')[['word', 'LgSUBTLWF']]
tmp = pd.merge(norms_en, freq_en, on='word')

# 11 quantile bins over `LgSUBTLWF`; labels=False numbers them 0..10,
# so bin 10 is the highest one.
tmp['frq_bin'] = pd.qcut(tmp['LgSUBTLWF'], q=11, labels=False)
in_top_bin = tmp['frq_bin'] == 10

# Ten most ambiguous words within that highest bin.
top10_en = tmp.loc[in_top_bin].sort_values(by='Ambiguity', ascending=False).head(10)
top10_en

Unnamed: 0,word,Ambiguity,N,Entropy,Dissimilarity,Dominance,Communities,Norms,LgSUBTLWF,frq_bin
2376,witness,0.870438,2,0.934068,0.931879,0.65,"[['accident', 'court', 'crime', 'judge', 'jury...","[65, 35]",3.419,10
1024,seem,0.797057,6,0.86639,0.919975,0.429688,"[['appear', 'appearance', 'look', 'looks'], ['...","[43, 19, 13, 11, 9, 5]",3.853,10
1168,minute,0.777039,2,0.950338,0.817645,0.630435,"[['clock', 'day', 'hour', 'second', 'time'], [...","[63, 37]",4.285,10
337,case,0.762602,7,0.871763,0.874781,0.241379,"[['book', 'study'], ['briefcase', 'luggage', '...","[24, 23, 21, 20, 5, 3, 3]",4.159,10
2232,sake,0.757986,5,0.787324,0.962736,0.416667,"[['alcohol', 'drink', 'wine'], ['Japan', 'Japa...","[42, 38, 11, 5, 4]",3.515,10
2696,saw,0.752606,5,0.846085,0.889516,0.368098,"[['axe', 'hammer', 'tool', 'tree', 'wood'], ['...","[37, 27, 26, 7, 3]",4.312,10
1364,count,0.737871,3,0.822993,0.896571,0.611511,"[['add', 'math', 'number', 'numbers', 'one'], ...","[61, 27, 12]",3.662,10
771,final,0.732986,2,0.881958,0.83109,0.699454,"[['countdown', 'done', 'end', 'finish', 'finis...","[70, 30]",3.404,10
2220,address,0.731978,5,0.807759,0.906184,0.502825,"[['approach', 'home', 'house', 'location', 'pl...","[50, 21, 15, 11, 3]",3.426,10
2240,doubt,0.731152,6,0.889018,0.822427,0.377778,"[['certainty', 'dubious', 'not sure', 'uncerta...","[38, 24, 14, 10, 9, 6]",3.506,10


In [None]:
# Spanish
# Join the saved ambiguity norms with the `log_frq` column of the regression
# data; only words present in both files survive the merge.
tmp = pd.merge(
    pd.read_csv('../outputs/ambiguity_norms_RP.csv', index_col=0),
    pd.read_csv('./data/regression_data_RP.csv')[['word', 'log_frq']],
    on='word',
)

# 11 quantile bins over `log_frq`; labels=False numbers them 0..10,
# so bin 10 is the highest one.
tmp['frq_bin'] = pd.qcut(tmp['log_frq'], q=11, labels=False)

# Bind the table to a name (mirroring `top10_en`) so it stays available
# after the cell output is cleared, then display it.
top10_es = tmp[tmp['frq_bin'] == 10].sort_values(by='Ambiguity', ascending=False).head(10)
top10_es

Unnamed: 0,word,Ambiguity,N,Entropy,Dissimilarity,Dominance,Communities,Norms,log_frq,frq_bin
2022,título,0.841361,3,0.978067,0.860229,0.428571,"[['carrera', 'diploma', 'grado', 'universidad'...","[43, 32, 25]",2.178192,10
4154,congreso,0.822486,2,0.995253,0.826409,0.540541,"[['diputados', 'gobierno', 'leyes', 'nación', ...","[54, 46]",2.148155,10
640,contacto,0.806138,5,0.933965,0.863135,0.376471,"[['celular', 'comunicación', 'número', 'teléfo...","[38, 21, 16, 15, 9]",1.846943,10
1972,marcha,0.79667,5,0.86699,0.918891,0.340909,"[['anda', 'andar', 'camina', 'caminar', 'camin...","[34, 32, 22, 9, 3]",2.044827,10
1169,intención,0.765781,5,0.842662,0.908765,0.492063,"[['deseo', 'ganas', 'querer', 'voluntad'], ['i...","[49, 16, 16, 14, 5]",1.892466,10
2721,liga,0.761948,3,0.826688,0.921687,0.471698,"[['campeonato', 'deporte', 'fútbol', 'suerte']...","[47, 45, 8]",2.074524,10
2614,escala,0.756754,5,0.811865,0.932118,0.473684,"[['alto', 'escalera', 'montaña', 'sube'], ['me...","[47, 26, 16, 5, 5]",1.865682,10
4877,frecuencia,0.75655,6,0.875351,0.864282,0.352941,"[['onda', 'radio', 'ritmo'], ['tiempo'], ['seg...","[35, 25, 20, 8, 6, 6]",1.844184,10
3704,selección,0.750799,5,0.845406,0.888093,0.441558,"[['elección', 'elegir', 'libertad', 'opción'],...","[44, 25, 18, 9, 4]",1.983879,10
3590,federal,0.742155,2,0.894869,0.829345,0.688525,"[['Argentina', 'Artigas', 'Brasil', 'capital',...","[69, 31]",1.909306,10


## Write outputs

In [None]:
# Write outputs (English): ambiguity norms for every word in the SWOW-EN graph.
from src.data.loader import load_data
from src.graph.generator import swow2graph

english_data = load_data('./data/SWOW-EN.R100.20180827.csv')
# Named throwaway instead of `_`, which IPython uses for the previous output.
graph, _unused = swow2graph(english_data)
english_words = list(graph.nodes())

amb_results = run_ambiguity_pipeline(english_words, postprocess=True)


def _sense_summary(norms, coms_assigned):
    """Derive the summary columns for one word from its pipeline result.

    `norms` is a sequence of numbers, one per community; `coms_assigned` is a
    mapping whose values are the communities, paired with `norms` by position
    (assumes both come out of the pipeline in the same order -- TODO confirm).

    Returns a dict with 'N' (number of norms), 'Dominance' (largest norm over
    the sum), and 'Communities' / 'Norms' (rounded percentages of the sum),
    both ordered by decreasing norm.
    """
    total = sum(norms)  # computed once instead of once per community
    members = list(coms_assigned.values())
    # Rank on the raw norms, not on the rounded percentages. sorted() is
    # stable, so communities with equal norms keep their original order.
    order = sorted(range(len(norms)), key=lambda i: norms[i], reverse=True)
    return {
        'N': len(norms),
        'Dominance': max(norms) / total,
        'Communities': [members[i] for i in order],
        'Norms': [round(norms[i] / total * 100) for i in order],
    }


tmp = pd.DataFrame(amb_results).T  # transpose so that each word is a row

summary = pd.DataFrame(
    [_sense_summary(n, c) for n, c in zip(tmp['norms'], tmp['coms_assigned'])],
    index=tmp.index,
    columns=['N', 'Dominance', 'Communities', 'Norms'],
)
tmp = tmp.assign(
    N=summary['N'],
    Dominance=summary['Dominance'],
    Communities=summary['Communities'],
    Norms=summary['Norms'],
)

# The words are the index at this point; expose them as a regular column.
tmp = tmp.reset_index(names='word')
tmp = tmp[['word', 'ambiguity', 'N', 'entropy_coef', 'disimilarity_coef', 'Dominance', 'Communities', 'Norms']]
df = tmp.rename(columns={'ambiguity': 'Ambiguity', 'entropy_coef': 'Entropy', 'disimilarity_coef': 'Dissimilarity'})
df.head()

# save
# df.to_csv('../outputs/ambiguity_norms_EN.csv')

In [None]:
# Write outputs (Spanish): ambiguity norms for every word in the SWOW-RP graph.
from src.data.loader import load_data
from src.graph.generator import swow2graph

spanish_data = load_data('./data/SWOWRP.R70.20220426.csv')
# Named throwaway instead of `_`, which IPython uses for the previous output.
graph, _unused = swow2graph(spanish_data)
spanish_words = list(graph.nodes())

amb_results = run_ambiguity_pipeline(spanish_words, postprocess=True)


def _sense_summary(norms, coms_assigned):
    """Derive the summary columns for one word from its pipeline result.

    `norms` is a sequence of numbers, one per community; `coms_assigned` is a
    mapping whose values are the communities, paired with `norms` by position
    (assumes both come out of the pipeline in the same order -- TODO confirm).

    Returns a dict with 'N' (number of norms), 'Dominance' (largest norm over
    the sum), and 'Communities' / 'Norms' (rounded percentages of the sum),
    both ordered by decreasing norm.
    """
    total = sum(norms)  # computed once instead of once per community
    members = list(coms_assigned.values())
    # Rank on the raw norms, not on the rounded percentages. sorted() is
    # stable, so communities with equal norms keep their original order.
    order = sorted(range(len(norms)), key=lambda i: norms[i], reverse=True)
    return {
        'N': len(norms),
        'Dominance': max(norms) / total,
        'Communities': [members[i] for i in order],
        'Norms': [round(norms[i] / total * 100) for i in order],
    }


tmp = pd.DataFrame(amb_results).T  # transpose so that each word is a row

summary = pd.DataFrame(
    [_sense_summary(n, c) for n, c in zip(tmp['norms'], tmp['coms_assigned'])],
    index=tmp.index,
    columns=['N', 'Dominance', 'Communities', 'Norms'],
)
tmp = tmp.assign(
    N=summary['N'],
    Dominance=summary['Dominance'],
    Communities=summary['Communities'],
    Norms=summary['Norms'],
)

# The words are the index at this point; expose them as a regular column.
tmp = tmp.reset_index(names='word')
tmp = tmp[['word', 'ambiguity', 'N', 'entropy_coef', 'disimilarity_coef', 'Dominance', 'Communities', 'Norms']]
df = tmp.rename(columns={'ambiguity': 'Ambiguity', 'entropy_coef': 'Entropy', 'disimilarity_coef': 'Dissimilarity'})
df.head()

# save
# df.to_csv('../outputs/ambiguity_norms_RP.csv')