In [1]:
# import required libs
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import hypernetx as hnx
import networkx as nx
import pickle
import pandas as pd
from time import time
from csv import reader, writer
from copy import deepcopy

## Load in data

In [2]:
# load in the word data (small world of words dataset)
with open('data/4plets_filtered.csv', 'r', encoding='utf-8') as f:
    csv_reader = reader(f)
    word_data = list(csv_reader)

In [3]:
# load in the age of acquisition data
with open('data/aoa.csv', 'r', encoding='utf-8') as f:
    csv_reader = reader(f)
    age_data = list(csv_reader)

In [4]:
# load in ordered list of words, ranked by proportion of kids who could reproduce that word
with open('data/ordering.txt','r') as f:
    csv_reader = reader(f)
    ordering_raw = list(csv_reader)

## Filter word associations to only words that appear in the ranked words list

In [6]:
#unpack ordering.txt list
ordering = []
for word in ordering_raw:
    ordering.append(word[0])

In [7]:
word_data_filtered = []
a=time()
for row in word_data:
    
    #in_list = any([word in ordering for word in row])
    if len(list(set(row) & set(ordering))) == 4:
        word_data_filtered.append(row)
print(time()-a)

6.256263732910156


In [96]:
len(word_data_filtered)

2649

In [10]:
# save word associations
#with open('./pickle/word_data_ordering.p','wb') as f:
#    pickle.dump(word_data_filtered,f)

## Build pairwise graph

In [97]:
def edge_convert(e):
    # return list of edge tuples such that the cue word is the centre of connection
    # e.g. edge_convert(['color', 'yellow', 'red', 'blue']) -> [('color', 'yellow'), ('color', 'red'), ('color', 'blue')]
    ret = [(e[0],i) for i in e[1:]]
    return ret

In [98]:
#compile a list of pairwise edges
edges = []
for r in word_data_filtered:
    edges += edge_convert(r)

In [101]:
#build graph with the pairwise edge connections
GT = nx.Graph(edges)

In [102]:
graph_degrees = dict(GT.degree(GT.nodes))

In [104]:
graph_closeness = nx.closeness_centrality(GT)

## Build hypergraph
Note: This may take some time to process

In [11]:
word_data_split = []

In [12]:
for idx, word_rows in enumerate(word_data_filtered):
    word_data_split += [[idx,w] for w in word_rows]

In [32]:
word_data_frame = pd.DataFrame(word_data_split)

In [42]:
# construct hypergraph
HT = hnx.Hypergraph(word_data_frame)

## Calculate degrees and s-closeness
Note: calculating the centralities takes about 30 seconds on my computer. This may take longer depending on computer resources.

In [43]:
hypnodes = list(HT.nodes)

In [45]:
hgraph_degrees = dict()
for node in hypnodes:
    hgraph_degrees[node] = HT.degree(node, s=1)

In [51]:
cent1 = hnx.s_closeness_centrality(hyptest,edges=False,s=1)
cent2 = hnx.s_closeness_centrality(hyptest,edges=False,s=2)
cent3 = hnx.s_closeness_centrality(hyptest,edges=False,s=3)

20.267813205718994


# Construct dataframes

In [89]:
# add age of acquisition data to a pandas dataframe
age_data_df = pd.DataFrame(data=age_data, columns=['word','logfreq','age'])

# convert logfreq and age from strings to floats
age_data_df[["logfreq", "age"]] = age_data_df[["logfreq", "age"]].apply(pd.to_numeric)

In [92]:
# convert to dictionaries for easy mapping
logfreq_dict = pd.Series(age_data_df.logfreq.values,index=age_data_df.word).to_dict()
age_dict = pd.Series(age_data_df.age.values,index=age_data_df.word).to_dict()

In [105]:
# extract list of words that appear in the hypergraph, retaining the original ordering given in the ranked words set
words = []
for word in ordering:
    if word in hypnodes:
        words.append(word)

## (pairwise graph)

In [106]:
gdata = pd.DataFrame()

In [107]:
# add all words that are in the graph to a dataframe
gdata['word'] = words

In [108]:
gdata['degree'] = gdata['word'].map(graph_degrees)

In [109]:
gdata['centrality'] = hdata['word'].map(graph_closeness)

In [7]:
gdata['rank'] = list(range(498))[1:]

In [110]:
# map age and logfreq into table
gdata['logfreq'] = gdata['word'].map(logfreq_dict)
gdata['age'] = gdata['word'].map(age_dict)

In [11]:
word_lens = {}
for word in gdata['word']:
    word_lens[word] = len(word)

In [13]:
gdata['len'] = gdata['word'].map(word_lens) 

In [15]:
#gdata.to_pickle('./pickle/graph_data_ranking.p')

## (hypergraph)

In [113]:
hdata = pd.DataFrame()

In [114]:
# add all words that are in the hypergraph to a dataframe
hdata['word'] = words

In [115]:
hdata['degree'] = hdata['word'].map(hgraph_degrees)

In [116]:
hdata['centrality1'] = hdata['word'].map(cent1)
hdata['centrality2'] = hdata['word'].map(cent2)
hdata['centrality3'] = hdata['word'].map(cent3)

In [117]:
hdata['rank'] = list(range(498))[1:]

In [118]:
# map age and logfreq into table
hdata['logfreq'] = hdata['word'].map(logfreq_dict)
hdata['age'] = hdata['word'].map(age_dict)

In [19]:
word_lens = {}
for word in hdata['word']:
    word_lens[word] = len(word)

In [23]:
hdata['len'] = hdata['word'].map(word_lens) 

In [22]:
#hdata.to_pickle('./pickle/hyp_data_ranking.p')