This notebook computes the Topographic Similarity and Positional Disentanglement metrics on the [CGames dataset](https://github.com/laurestine/nlp-emergent-languages/blob/main/corporawithmeaning/cgames-combined.csv).

# 1.0 Boot

In [None]:
! pip --quiet install wget

  Building wheel for wget (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import os.path
import wget

In [None]:
# Helper modules fetched from the nlp-emergent-languages repository, keyed by
# the local file name each one is saved under.
#
# The `?token=...` query strings that used to be appended to these URLs were
# removed: access tokens should not be committed to a notebook.
# NOTE(review): this assumes the repository is publicly readable (the intro of
# this notebook links to it on github.com) - confirm before relying on it.
REPO_RAW = 'https://raw.githubusercontent.com/laurestine/nlp-emergent-languages'

module_urls = {
    # utils
    'utils.py': REPO_RAW + '/main/metrics/utils.py',
    # Metrics
    'topographic_similarity.py': REPO_RAW + '/main/metrics/topographic_similarity.py',
    'positional_disentanglement.py': REPO_RAW + '/main/metrics/positional_disentanglement.py',
    # Pinned to a specific commit rather than the main branch.
    'conflict_count.py': REPO_RAW + '/4c8fd1c8afbe673103d8c277ba94acedcc01214d/metrics/conflict_count.py',
}

for filename, url in module_urls.items():
    # Skip files that are already present so re-running the cell is cheap.
    if not os.path.isfile(filename):
        # `out=` makes the saved name match the name checked above.
        wget.download(url, out=filename)

In [None]:
import editdistance  # Levenshtein distance
from scipy.spatial.distance import hamming

from topographic_similarity import TopographicSimilarity
from positional_disentanglement import PositionalDisentanglement
from utils import transform_corpus, get_meaning, add_item_in_list

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# 2.0 Data

In [None]:
# Fetch the combined CGames corpus once; later runs reuse the local copy.
# The `?token=...` query string was dropped from the URL: access tokens should
# not be committed to a notebook.
# NOTE(review): assumes the repository is publicly readable - confirm.
if not os.path.isfile('cgames-combined.csv'):
    url = 'https://raw.githubusercontent.com/laurestine/nlp-emergent-languages/main/corporawithmeaning/cgames-combined.csv'
    # `out=` makes the saved name match the name checked above.
    wget.download(url, out='cgames-combined.csv')

In [None]:
# Load the combined CGames corpus from the working directory and preview it.
# The recorded output shows the columns 'Unnamed: 0', 'mr', 'ref' and 'group'.
# NOTE(review): 'Unnamed: 0' looks like an index column written out when the
# CSV was saved; reading with index_col=0 would drop it - confirm nothing
# downstream relies on the current layout before changing.
data = pd.read_csv('cgames-combined.csv')
data.head()

Unnamed: 0,mr,ref,group
0,bluelion_big;lemon_small;eye_small,it's a blue lion on top and a lemon on the lef...,1
1,iron_big;lime_small;nail_small,okay it's a uh it- an iron on top with a green...,1
2,urn_big;bluemoon_small;yellowmoon_small,it's like a Greek pot or an urn on top with a ...,1
3,iron_big;lime_small;nail_small,it's the one before the iron on top the green ...,1
4,alien_big;yellowlion_small;money_small,it's a gray space alien on top with a yellow l...,1


In [None]:
"""
Creating a global vocabulary for meanings matrix
"""

mr = data['mr']
mrs_separeted = mr.apply(lambda x: x.split(';'))
all_mrs = [item for sublist in mrs_separeted for item in sublist]
vocabulary = list(set(all_mrs))

print("Uniques/vocabulary of meanings: {} \n\n{}".format(len(vocabulary), vocabulary[:5]))

Uniques/vocabulary of meanings: 86 

['menorah_big', 'sunflower_small', 'mirror_big', 'oreo_small', 'bluemoon_small']


## 2.1 Get meaning

In [None]:
# Encode every 'mr' entry with the get_meaning helper imported from utils and
# preview the first five results.
# The recorded output shows 4-element float arrays - presumably item indices
# plus padding; verify against utils.py.
meanings = get_meaning(data,'mr')
meanings[:5]

[array([71., 13., 27.,  0.]),
 array([50., 57.,  7.,  0.]),
 array([10.,  5., 60.,  0.]),
 array([50., 57.,  7.,  0.]),
 array([62., 76., 69.,  0.])]

# 3.0 Metrics

## 3.1 Topographic similarity

In [None]:
"""
With all groups
"""
meanings = get_meaning(data,'mr')

topsim_class = TopographicSimilarity(message_metric=hamming,
                                     meaning_metric=editdistance.eval)

topsim = topsim_class.measure(meanings, list(data['ref']))

print("Topsim: {} \n".format(topsim))

Topsim: 0.26282936314901656 



In [None]:
# Topographic similarity computed separately for each group, using the same
# metric configuration for every group.
topsim_class = TopographicSimilarity(
    message_metric=hamming,
    meaning_metric=editdistance.eval,
)

groups = data['group'].unique()

for group in groups:
    # Keep only the rows that belong to the current group.
    analyzing_dataset = data.loc[data['group'] == group]

    # Author's note: the representation used on the test dataset assigned one
    # symbol/character to each word. That is not the case here, so each
    # meaning is transformed through get_meaning with the shared vocabulary.
    meanings = get_meaning(analyzing_dataset, 'mr', vocabulary)
    refs = list(analyzing_dataset['ref'])

    topsim = topsim_class.measure(meanings, refs)

    print("Topsim of group {}: {} \n".format(group, topsim))

Topsim of group 1: 0.48949517712849216 

Topsim of group 2: 0.26775308491865313 

Topsim of group 3: 0.2527720435082594 

Topsim of group 4: 0.5672868357392686 

Topsim of group 5: 0.3208468378148561 

Topsim of group 6: 0.3243020743244586 

Topsim of group 7: 0.25489704641246536 

Topsim of group 8: 0.3022445791533304 

Topsim of group 9: 0.1411018303302465 

Topsim of group 10: 0.2910171902163823 

Topsim of group 11: 0.1499559954195863 

Topsim of group 12: 0.22845175838223178 



## 3.2 Positional disentanglement

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the tokenizer data needed by word_tokenize is available locally.
# 'punkt' is the resource this notebook originally checked for; newer NLTK
# releases look up 'punkt_tab' instead, so both are ensured. On an NLTK
# version that does not know a resource, nltk.download reports the failure
# and returns False instead of raising, so the extra name is harmless there.
for resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find('tokenizers/{}'.format(resource))
    except LookupError:
        nltk.download(resource)

In [None]:
"""
--With all sentences--
Here is necessary transform the meanings in str, 
because of the calc of mutual information.
Besides, the numbers of all meanings must be the same, and 
length of sentences must be the same too.
"""

meanings = get_meaning(data,'mr')
meanings_ = [list(map(int, mean)) for mean in meanings]
meanings_ = [list(map(str, mean)) for mean in meanings_]

print("Meanings: \n{}".format(meanings_[:5]))

Meanings: 
[['71', '13', '27', '0'], ['50', '57', '7', '0'], ['10', '5', '60', '0'], ['50', '57', '7', '0'], ['62', '76', '69', '0']]


In [None]:
"""
tokenizing sentences and getting lists with equal lengths
"""
n_max = 0
data_token = []
for sentence in data['ref']:
    sentence_token = word_tokenize(sentence)
    n_max = n_max if len(sentence_token)<n_max else len(sentence_token)
    data_token.append(sentence_token)


concepts_ = []
for token in data_token:
    concepts_.append(add_item_in_list(token, '_', n_max))

print("Concepts: \n{}".format(concepts_))

Concepts: 
[['it', "'s", 'a', 'blue', 'lion', 'on', 'top', 'and', 'a', 'lemon', 'on', 'the', 'left', 'and', 'a', 'eye', 'on', 'the', 'right', 'blue', 'lion', "'s", 'on', 'the', 'top', 'and', 'the', 'lemon', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['okay', 'it', "'s", 'a', 'uh', 'it-', 'an', 'iron', 'on', 'top', 'with', 'a', 'green', 'lime', 'on', 'the', 'bottom', 'left', 'and', 'a', 'nail', 'on', 'the', 'bottom', 'right', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'

In [None]:
"""
With all groups
"""

pos_class = PositionalDisentanglement(max_message_length=len(meanings[0]), 
                                      num_concept_slots=n_max)
pos = pos_class.measure(meanings = meanings_, token_messages = concepts_)
print("Pos: {} \n".format(pos))

Pos: 0.024908683973295286 



In [None]:
"""
Separate groups
"""

groups = data['group'].unique()

for group in groups:
    analyzing_dataset = data[data['group']==group]

    """
    In the begin, we need to have the meanings and
    concepts in the correct patterns
    """

    meanings = get_meaning(analyzing_dataset,'mr',vocabulary)
    meanings_ = [list(map(int, mean)) for mean in meanings]
    meanings_ = [list(map(str, mean)) for mean in meanings_]

    n_max = 0
    data_token = []
    for sentence in analyzing_dataset['ref']:
        sentence_token = word_tokenize(sentence)
        n_max = n_max if len(sentence_token)<n_max else len(sentence_token)
        data_token.append(sentence_token)


    concepts_ = []
    for token in data_token:
        concepts_.append(add_item_in_list(token, '_', n_max))

    pos_class_group = PositionalDisentanglement(max_message_length=len(meanings[0]),
                                                num_concept_slots=n_max)
    pos = pos_class_group.measure(meanings = meanings_, token_messages = concepts_)

    print("PosDis of group {}: {} \n".format(group, pos))

PosDis of group 1: 0.06199151732463907 

PosDis of group 2: 0.030146140093257542 

PosDis of group 3: 0.02490609570147575 

PosDis of group 4: 0.03872325573106898 

PosDis of group 5: 0.020184265608056903 

PosDis of group 6: 0.03587441869544511 

PosDis of group 7: 0.0408142743240391 

PosDis of group 8: 0.05296060236158716 

PosDis of group 9: 0.06377648976638772 

PosDis of group 10: 0.07729962797275673 

PosDis of group 11: 0.02061233336838097 

PosDis of group 12: 0.09102514279404422 

