# Clean Concept Net
**Last Updated By:** Kyle Williams <br>
**Last Updated On:** 5/26/2023

**Motivation:** ConceptNet makes available a [pre-built list](https://github.com/commonsense/conceptnet5/wiki/Downloads) of all the edges in ConceptNet 5.7 in a tab-separated text file. This file is much too large (10.7 GB!) to store on disk. This code in this Notebook attempts to filter the ConceptNet csv file to contain only the edges we need, and reduce any unnecessary columns. 

In [1]:
'''
Necessary Imports
'''
import dask.dataframe as dd
import numpy as np
import pandas as pd
import json

In [2]:
'''
Load the DataFrame

The five fields of each line are:
The URI of the whole edge
The relation expressed by the edge
The node at the start of the edge
The node at the end of the edge
A JSON structure of additional information about the edge, such as its weight
'''
edges_df = dd.read_csv('conceptnet-assertions-5.7.0.csv', delimiter='\t')
edges_df.columns = ['uri', 'rltn', 'src', 'dst', 'json']
edges_df.head()

Unnamed: 0,uri,rltn,src,dst,json
0,"/a/[/r/Antonym/,/c/adx/thəχ_kwo/a/,/c/adx/ʂap_...",/r/Antonym,/c/adx/thəχ_kwo/a,/c/adx/ʂap_wə,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
1,"/a/[/r/Antonym/,/c/adx/tok_po/a/,/c/adx/ʂa_wə/]",/r/Antonym,/c/adx/tok_po/a,/c/adx/ʂa_wə,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
2,"/a/[/r/Antonym/,/c/adx/ʂa_wə/a/,/c/adx/tok_po/]",/r/Antonym,/c/adx/ʂa_wə/a,/c/adx/tok_po,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
3,"/a/[/r/Antonym/,/c/adx/ʂap_wə/a/,/c/adx/thəχ_k...",/r/Antonym,/c/adx/ʂap_wə/a,/c/adx/thəχ_kwo,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
4,"/a/[/r/Antonym/,/c/ae/𐬨𐬀𐬰𐬛𐬀𐬌𐬌𐬀𐬯𐬥𐬀/n/,/c/ae/𐬛𐬀𐬉...",/r/Antonym,/c/ae/𐬨𐬀𐬰𐬛𐬀𐬌𐬌𐬀𐬯𐬥𐬀/n,/c/ae/𐬛𐬀𐬉𐬎𐬎𐬀𐬌𐬌𐬀𐬯𐬥𐬀,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."


In [3]:
'''
Filter out edges containing non-english source nodes
'''
edges_df = edges_df.loc[edges_df.src.str.startswith("/c/en")]

In [4]:
'''
Filter out edges containing non-english destination nodes.
'''
edges_df = edges_df.loc[edges_df.dst.str.startswith("/c/en")].compute()

In [5]:
'''
Save our progress so I can remove the original, super large file and continue working
'''
edges_df.to_csv('conceptnet-english-edges-5.7.csv', index=False, sep='\t')

In [29]:
'''
Read the file and remove the uri column
'''
edges_en_df = dd.read_csv('conceptnet-english-edges-5.7.csv', sep='\t')
# edges_en_df = edges_en_df.drop(columns=['uri'])
edges_en_df.head()

Unnamed: 0,uri,rltn,src,dst,json
0,"/a/[/r/Antonym/,/c/en/0/n/,/c/en/1/]",/r/Antonym,/c/en/0/n,/c/en/1,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."
1,"/a/[/r/Antonym/,/c/en/12_hour_clock/n/,/c/en/2...",/r/Antonym,/c/en/12_hour_clock/n,/c/en/24_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
2,"/a/[/r/Antonym/,/c/en/24_hour_clock/n/,/c/en/1...",/r/Antonym,/c/en/24_hour_clock/n,/c/en/12_hour_clock,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
3,"/a/[/r/Antonym/,/c/en/5/n/,/c/en/3/]",/r/Antonym,/c/en/5/n,/c/en/3,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc..."
4,"/a/[/r/Antonym/,/c/en/a.c/n/,/c/en/d.c/]",/r/Antonym,/c/en/a.c/n,/c/en/d.c,"{""dataset"": ""/d/wiktionary/fr"", ""license"": ""cc..."


In [32]:
edge_depths = edges_en_df['uri'].apply(lambda x: x.count("/[/"), meta=('uri', 'int64')).compute()
edge_depths.value_counts()

1    3474746
Name: uri, dtype: int64

In [10]:
'''
Remove the text from the paths to the rltn, src, and dst columns to save space
'''
edges_en_df.rltn = edges_en_df.rltn.apply(lambda x: x.split('/')[-1], meta=('rltn', 'object'))
edges_en_df.src = edges_en_df.src.apply(lambda x: x.split('/')[-2], meta=('src', 'object'))
edges_en_df.dst = edges_en_df.dst.apply(lambda x: x.split('/')[-1], meta=('dst', 'object'))

In [88]:
edges_en_df

Unnamed: 0_level_0,rltn,src,dst,json
npartitions=13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,object,object
,...,...,...,...
...,...,...,...,...
,...,...,...,...
,...,...,...,...


In [93]:
'''
Save the cleaned file to disk. I was having a hard time deserializing the json column, so I'm
going to delete it from this and store it elsewhere. The only semi-useful column it has is weight
anyways.

I am sorting the dataframe and making 'src' it's index as well, since I'm guessing that's how we're most likely
going to query this. If we continue to use dask, it can set up index files and other database tricks to keep
accesses to this DataFrame fast. Hopefully this means we won't need to clean it too much further. 
'''
edges_en_df = edges_en_df.sort_values(['src', 'dst'], acsending=[True, True])
edges_en_df.compute().to_csv('conceptnet-assertions-5.7.0-en.csv', index=False, sep='\t')

In [121]:
edges_en_df = pd.read_csv('conceptnet-assertions-5.7.0-en.csv', sep='\t')

# Filter rows based on alphanumeric characters and underscores
pattern = r'^[a-zA-Z0-9_]+$'
edges_en_df = edges_en_df[edges_en_df['src'].str.match(pattern)]
edges_en_df = edges_en_df[edges_en_df['dst'].str.match(pattern)]

# Give a new id column so I can set the index to their tuple for fast indexing
edges_en_df['id'] = np.arange(edges_en_df.shape[0])

In [124]:
edges_en_df.set_index(['src', 'id']).to_csv('conceptnet-assertions-5.7.0-en.csv', index=True, sep='\t')

In [72]:
edges_out = dd.read_csv('conceptnet-assertions-5.7.0-en-out.csv/0.part', sep=';').set_index('src')
edges_in = dd.read_csv('conceptnet-assertions-5.7.0-en-in.csv/0.part', sep=';', dtype={'dst': 'object'}).set_index('dst')

edges_in = edges_in.drop(columns=['Unnamed: 0', 'id'])
edges_in = edges_in.drop_duplicates()
edges_out = edges_out.drop(columns=['Unnamed: 0', 'id'])
edges_out = edges_out.drop_duplicates()

In [74]:
edges_in = edges_in.loc[edges_in.src != 'en']
edges_in = edges_in.loc[edges_in.src != 'wn']
edges_in = edges_in.loc[edges_in.src != 'wikt']
edges_in = edges_in.loc[edges_in.src != 'a']
edges_in = edges_in.loc[edges_in.src != 'wp']
edges_in = edges_in.loc[edges_in.src != 'opencyc']
edges_in = edges_in.loc[edges_in.src != 'enm']
edges_in = edges_in.loc[edges_in.src != 'r']

In [75]:
edges_in.compute().to_csv('conceptnet-in-assertions-5.7.0-en.csv', index=True, sep=',')

In [76]:
edges_out = edges_out.loc[edges_out.dst != 'en']
edges_out = edges_out.loc[edges_out.dst != 'wn']
edges_out = edges_out.loc[edges_out.dst != 'wikt']
edges_out = edges_out.loc[edges_out.dst != 'a']
edges_out = edges_out.loc[edges_out.dst != 'wp']
edges_out = edges_out.loc[edges_out.dst != 'opencyc']
edges_out = edges_out.loc[edges_out.dst != 'enm']
edges_out = edges_out.loc[edges_out.dst != 'r']

In [77]:
edges_out.compute().to_csv('conceptnet-out-assertions-5.7.0-en.csv', index=True, sep=',')

In [62]:
dd.read_csv('conceptnet-out-assertions-5.7.0-en.csv', sep=',').head()

Unnamed: 0,id,rltn,dst
0,0,Antonym,1
1,1,HasContext,electrical_engineering
2,2,RelatedTo,low
3,3,RelatedTo,emergency_service
4,4,Synonym,james_bond


In [81]:
out = dd.read_csv('conceptnet-out-assertions-5.7.0-en.csv', sep=',').set_index('src')

In [82]:
out.head()

Unnamed: 0_level_0,rltn,dst
src,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Antonym,1
0,HasContext,electrical_engineering
0,RelatedTo,low
0,RelatedTo,emergency_service
7,Synonym,james_bond
