# To use:
Execute the blocks below to import required modules and load the word association data, and age of acquisition data.

In [29]:
# import required libs
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import networkx as nx
import pickle
import pandas as pd
from time import time
from copy import copy
from csv import reader

In [30]:
# load in the word data (small world of words dataset)
with open('data/4plets_filtered.csv', 'r', encoding='utf-8') as f:
    csv_reader = reader(f)
    word_data = list(csv_reader)

In [31]:
# load in the age of acquisition data
with open('data/aoa.csv', 'r', encoding='utf-8') as f:
    csv_reader = reader(f)
    age_data = list(csv_reader)

## Extract list of unique words from word_data (SWoW)

In [32]:
# add every word from every row to a list
words_swow = []
for row in word_data:
    words_swow += row
# take the set of the list to find every unique word in the SWOW dataset
words_swow = set(words_swow)

## Filter word data to age<=9

In [33]:
# add age of acquisition data to a pandas dataframe
gdata = pd.DataFrame(data=age_data, columns=['word','logfreq','age'])

# convert logfreq and age from strings to floats
gdata[["logfreq", "age"]] = gdata[["logfreq", "age"]].apply(pd.to_numeric)

In [34]:
#filter out all words learnt after age 9
gdata_9 = gdata[gdata['age']<=9]

In [35]:
# extract a list of words included in the 'age of acquisition' (AOA) dataset
words_aoa = set(gdata_9['word'])

In [36]:
#take cross section of words from swow dataset and words from the AOA dataset
words = words_swow & words_aoa

In [42]:
# remove words from the dataframe which dont appear in the word association data
for w in gdata_9['word']:
    if w not in words:
        gdata_9.drop(gdata_9.loc[gdata_9['word']==w].index, inplace=True)

The block below removes all rows from the word association data that contain words that are NOT included in the AOA dataset.  
i.e. it removes words which will not have age data associated.  
**!!This block can take some considerable time to run (>45 minutes)!!**  
To mitigate this, execute the block under the heading "**Load word association Data**".

In [10]:
a=time()
# remove rows from word association data
word_data_copy = copy(word_data)
for r in word_data_copy:
    if len(set(r) & words) != 4:
        word_data.remove(r)
print(time()-a)

KeyboardInterrupt: 

In [38]:
# print number of 4-word association rows
print(len(word_data))

543792


## Save word association data
(in markup block so no important data is overwritten)

`with open('./pickle/word_data_aoa_9.p','wb') as f:
    pickle.dump(word_data,f)`

## Load word association data

In [44]:
word_data_9 = pickle.load(open('./pickle/word_data_aoa_9.p','rb'))

## Construct pairwise graph
And calculate degrees + centralities

In [45]:
def edge_convert(e):
    # return list of edge tuples such that the cue word is the centre of connection
    # e.g. edge_convert(['color', 'yellow', 'red', 'blue']) -> [('color', 'yellow'), ('color', 'red'), ('color', 'blue')]
    ret = [(e[0],i) for i in e[1:]]
    return ret

In [46]:
#compile a list of pairwise edges
edges = []
for r in word_data:
    edges += edge_convert(r)

In [47]:
#build graph with the pairwise edge connections
GT = nx.Graph(edges)

In [48]:
graph_degrees = dict(GT.degree(GT.nodes))

In [49]:
# remove words from the dataframe which dont appear in the graph
word_list = gdata_9['word']
for w in word_list:
    if w not in GT.nodes:
        gdata_9.drop(gdata_9.loc[gdata_9['word']==w].index, inplace=True)

## Arrange Data

In [51]:
gdata_9['degree'] = gdata_9['word'].map(graph_degrees) # add degree to df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdata_9['degree'] = gdata_9['word'].map(graph_degrees) # add degree to df


In [52]:
word_lens = {}
for word in words:
    word_lens[word] = len(word)

In [53]:
gdata_9['len'] = gdata_9['word'].map(word_lens) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdata_9['len'] = gdata_9['word'].map(word_lens)


In [54]:
gdata_9 = gdata_9.dropna()
gdata_9.reset_index(inplace=True)

In [136]:
# save gdata to pickle file for use in ml models
# gdata_9.to_pickle('./pickle/gdata_aoa.p')