In [None]:
# Install necessary libraries
!pip install networkx pandas tqdm



In [None]:
import gzip
import os
import shutil
import urllib.request

# Define URLs and file paths
CONCEPTNET_URL = "https://s3.amazonaws.com/conceptnet/downloads/2019/edges/conceptnet-assertions-5.7.0.csv.gz"
LOCAL_GZ_FILE = "conceptnet-assertions-5.7.0.csv.gz"
LOCAL_CSV_FILE = "conceptnet-assertions-5.7.0.csv"

# Both steps below write to a temporary ".part" file and rename it into place
# only after the step has finished. The os.path.exists() guards therefore never
# mistake a truncated file (failed download, interrupted extraction) for a
# finished one when the cell is re-run.

# Download ConceptNet CSV data if not already downloaded
if not os.path.exists(LOCAL_GZ_FILE):
    print("Downloading ConceptNet CSV data...")
    partial_gz = LOCAL_GZ_FILE + ".part"
    # urlretrieve raises on HTTP errors and on transfers shorter than the
    # advertised Content-Length, so a bad download stops the cell here.
    urllib.request.urlretrieve(CONCEPTNET_URL, partial_gz)
    os.replace(partial_gz, LOCAL_GZ_FILE)
    print("Download complete.")
else:
    print("ConceptNet CSV data already downloaded.")

# Extract the CSV file if not already extracted
if not os.path.exists(LOCAL_CSV_FILE):
    print("Extracting CSV data...")
    partial_csv = LOCAL_CSV_FILE + ".part"
    with gzip.open(LOCAL_GZ_FILE, 'rb') as f_in, open(partial_csv, 'wb') as f_out:
        # Stream the decompressed bytes so the archive never has to fit in memory.
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_csv, LOCAL_CSV_FILE)
    print("Extraction complete.")
else:
    print("CSV data already extracted.")

Downloading ConceptNet CSV data...
--2024-12-08 04:31:34--  https://s3.amazonaws.com/conceptnet/downloads/2019/edges/conceptnet-assertions-5.7.0.csv.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 16.15.176.190, 54.231.163.144, 3.5.9.104, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|16.15.176.190|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 497963447 (475M) [application/x-gzip]
Saving to: ‘conceptnet-assertions-5.7.0.csv.gz’


2024-12-08 04:31:48 (34.3 MB/s) - ‘conceptnet-assertions-5.7.0.csv.gz’ saved [497963447/497963447]

Extracting CSV data...
Extraction complete.


In [None]:
import pandas as pd

# Read the first few lines to inspect column names
# Only nrows=5 rows are parsed, so this preview stays cheap regardless of the
# size of the extracted file.
print("\nReading the first few lines of the CSV to inspect column names...")
try:
    # Read with tab separator and no header
    # (header=None makes pandas label the columns positionally: 0, 1, 2, ...)
    sample_df = pd.read_csv(LOCAL_CSV_FILE, nrows=5, sep='\t', header=None, encoding='utf-8')
    print("Columns found in the CSV:")
    print(sample_df.columns.tolist())
    print("\nSample data:")
    print(sample_df)
except Exception as e:
    # Best-effort preview: the error is reported and the notebook keeps running.
    print(f"Error reading CSV: {e}")


Reading the first few lines of the CSV to inspect column names...
Columns found in the CSV:
[0, 1, 2, 3, 4]

Sample data:
                                                   0           1  \
0      /a/[/r/Antonym/,/c/ab/агыруа/n/,/c/ab/аҧсуа/]  /r/Antonym   
1  /a/[/r/Antonym/,/c/adx/thəχ_kwo/a/,/c/adx/ʂap_...  /r/Antonym   
2    /a/[/r/Antonym/,/c/adx/tok_po/a/,/c/adx/ʂa_wə/]  /r/Antonym   
3    /a/[/r/Antonym/,/c/adx/ʂa_wə/a/,/c/adx/tok_po/]  /r/Antonym   
4  /a/[/r/Antonym/,/c/adx/ʂap_wə/a/,/c/adx/thəχ_k...  /r/Antonym   

                   2                3  \
0     /c/ab/агыруа/n      /c/ab/аҧсуа   
1  /c/adx/thəχ_kwo/a    /c/adx/ʂap_wə   
2    /c/adx/tok_po/a     /c/adx/ʂa_wə   
3     /c/adx/ʂa_wə/a    /c/adx/tok_po   
4    /c/adx/ʂap_wə/a  /c/adx/thəχ_kwo   

                                                   4  
0  {"dataset": "/d/wiktionary/en", "license": "cc...  
1  {"dataset": "/d/wiktionary/fr", "license": "cc...  
2  {"dataset": "/d/wiktionary/fr", "license": "cc...  
3

In [None]:
import math
from tqdm import tqdm
import pandas as pd

In [None]:
from tqdm import tqdm

# Define filter criteria
# Alternative, broader relation set (currently disabled):
# SELECTED_RELATIONS = {"/r/FormOf", "/r/HasA", "/r/IsA", "/r/Causes", "/r/DerivedFrom", "/r/Synonym", "/r/MadeOf", "/r/SimilarTo", "/r/CreatedBy", "/r/CapableOf", "/r/UsedFor"}
# An edge is kept only when BOTH its start and end URIs begin with this prefix.
LANGUAGE_PREFIX = "/c/en/"
# Relations (values of the 'rel' column) that are kept.
SELECTED_RELATIONS = {"/r/RelatedTo", "/r/FormOf", "/r/Synonym"}
# Define maximum number of filtered edges
# float('inf') disables the cap, so every matching edge is kept.
MAX_FILTERED_EDGES = float('inf')  # Adjust based on your needs

def filter_conceptnet_csv(input_csv, output_csv, selected_relations, language_prefix, max_edges):
    """
    Filters the ConceptNet CSV data based on selected relations and language.
    Saves the filtered edges to a new CSV file.

    The input is read as a headerless, tab-separated file with five columns
    ('@id', 'rel', 'start', 'end', 'metadata'). A row is kept when its 'rel' is
    in `selected_relations` and both 'start' and 'end' begin with
    `language_prefix`. The output is a comma-separated file with a header row.

    Parameters:
    - input_csv (str): Path to the input CSV file.
    - output_csv (str): Path to save the filtered CSV file.
    - selected_relations (set): Set of relations to include.
    - language_prefix (str): Prefix indicating the language (e.g., '/c/en/').
    - max_edges (int or float): Maximum number of edges to include. Use float('inf') for no limit.
      A finite float is truncated to an integer.

    Returns:
    - None. The result is written to `output_csv` and a summary line is printed.

    Notes:
    - Errors raised while reading are printed, not re-raised; whatever was
      collected before the error is still written to `output_csv`.
    """
    columns = ['@id', 'rel', 'start', 'end', 'metadata']

    # DataFrame.head() only accepts integers, so a finite float limit (e.g. 1e6)
    # is converted once up front. None means "no limit".
    edge_limit = None if math.isinf(max_edges) else int(max_edges)

    # Initialize counters and storage
    count = 0
    filtered_rows = []

    # Read CSV in chunks to handle large files efficiently
    chunksize = 1000000  # Adjust chunk size as needed
    print("Filtering ConceptNet CSV data...")
    try:
        # Iterate through each chunk with a progress bar
        for chunk in tqdm(pd.read_csv(
            input_csv,
            sep='\t',
            header=None,
            names=columns,
            # Read every column as text so the .str accessor below is always
            # available, independent of per-chunk dtype inference.
            dtype=str,
            chunksize=chunksize,
            on_bad_lines='skip',
            low_memory=False,
            encoding='utf-8'
        ), desc="Processing Chunks"):

            # Apply filtering criteria; na=False makes rows with a missing
            # start/end explicitly non-matching.
            mask = (
                chunk['rel'].isin(selected_relations) &
                chunk['start'].str.startswith(language_prefix, na=False) &
                chunk['end'].str.startswith(language_prefix, na=False)
            )
            filtered = chunk[mask]
            filtered_rows.append(filtered)
            count += len(filtered)

            # Check if the maximum number of edges has been reached
            if edge_limit is not None and count >= edge_limit:
                break
    except Exception as e:
        # Best-effort: report the problem and fall through to save what we have.
        print(f"Error during filtering: {e}")

    # Concatenate all filtered chunks
    if filtered_rows:
        filtered_data = pd.concat(filtered_rows)

        # The last chunk may overshoot the limit, so trim to exactly edge_limit rows.
        if edge_limit is not None:
            filtered_data = filtered_data.head(edge_limit)
    else:
        filtered_data = pd.DataFrame(columns=columns)

    # Save the filtered data to the output CSV
    filtered_data.to_csv(output_csv, index=False)
    print(f"Filtered {len(filtered_data)} edges into {output_csv}")

# Define file paths
# Destination of the filtered edge list, relative to the working directory.
FILTERED_CSV_FILE = "conceptnet-lite.csv"

# Run the filtering
# Reads LOCAL_CSV_FILE in chunks and writes the matching rows to
# FILTERED_CSV_FILE, using the configuration constants defined above.
filter_conceptnet_csv(
    input_csv=LOCAL_CSV_FILE,
    output_csv=FILTERED_CSV_FILE,
    selected_relations=SELECTED_RELATIONS,
    language_prefix=LANGUAGE_PREFIX,
    max_edges=MAX_FILTERED_EDGES
)

Filtering ConceptNet CSV data...


Processing Chunks: 35it [05:04,  8.70s/it]


Filtered 2304597 edges into conceptnet-lite.csv


In [None]:
# Install necessary libraries
!pip install networkx pandas tqdm



In [None]:
import networkx as nx
import pandas as pd
from tqdm import tqdm

In [None]:
import pandas as pd
import networkx as nx
from tqdm import tqdm

def canonicalize_node(uri):
    """
    Reduce a ConceptNet URI to its first three path segments.

    Everything after the third segment (for example a part-of-speech tag) is
    dropped; the word itself is left untouched:
        "/c/en/condiment/n"  -> "/c/en/condiment"
        "/c/en/condiments/n" -> "/c/en/condiments"
        "/c/en/apple"        -> "/c/en/apple"

    A URI with fewer than three segments is returned unchanged.
    """
    # Splitting at most three times is enough: anything beyond the third
    # segment ends up in a trailing element that is discarded below.
    segments = uri.strip('/').split('/', 3)
    if len(segments) < 3:
        # Not in the expected "/c/<lang>/<term>" shape; hand it back as given.
        return uri
    kind, language, term = segments[:3]
    return f"/{kind}/{language}/{term}"

def load_conceptnet_lite_csv(file_path):
    """
    Loads the filtered ConceptNet Lite CSV into a NetworkX graph.

    Each row contributes one undirected edge between the canonicalized 'start'
    and 'end' URIs, with the row's 'rel' stored in the edge attribute
    'relation'. When several rows map to the same pair of nodes, the graph
    keeps a single edge and the last row's relation wins.

    Parameters:
    - file_path (str): Path to the filtered CSV file.

    Returns:
    - G (networkx.Graph): The constructed undirected graph with canonicalized nodes.
      If the file cannot be read, the error is printed and an empty graph is returned.
    """
    print("Loading ConceptNet Lite into NetworkX...")
    G = nx.Graph()

    try:
        # Read the CSV using pandas
        df = pd.read_csv(file_path, sep=',', encoding='utf-8')
    except Exception as e:
        print(f"Error reading filtered CSV: {e}")
        return G

    # Walk the three needed columns in lockstep instead of DataFrame.iterrows():
    # iterrows() builds a Series object per row, which dominates the runtime on
    # millions of rows. Row order, and therefore the resulting graph, is unchanged.
    print("Adding edges to NetworkX...")
    edges = zip(df['start'], df['end'], df['rel'])
    for start_uri, end_uri, rel in tqdm(edges, total=len(df), desc="Adding edges"):
        G.add_edge(canonicalize_node(start_uri), canonicalize_node(end_uri), relation=rel)

    print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
    return G


In [None]:
# Define the path to the filtered CSV
# (same value the filtering cell assigns; repeated so the graph section can be
# run without re-running the filter)
FILTERED_CSV_FILE = "conceptnet-lite.csv"

# Load the graph
# G is the undirected graph that the BFS cells below traverse.
G = load_conceptnet_lite_csv(FILTERED_CSV_FILE)

Loading ConceptNet Lite into NetworkX...
Adding edges to NetworkX...


Adding edges: 100%|██████████| 2304597/2304597 [04:32<00:00, 8453.51it/s]


Graph has 948526 nodes and 2052758 edges.


In [None]:
# Make sure the graph used below is undirected.
if not isinstance(G, nx.DiGraph):
    # Nothing to do for a graph that is not a DiGraph.
    print("The graph is already undirected.")
else:
    print("Converting directed graph to undirected...")
    G_undirected = G.to_undirected()
    print("Conversion complete.")

    # From here on, G refers to the undirected version.
    G = G_undirected

The graph is already undirected.


In [None]:
from collections import deque

def multi_source_bfs(G, start_nodes, max_distance=3):
    """
    Breadth-first search outward from several start nodes at once.

    The search proceeds one level at a time and stops after `max_distance`
    levels, so nodes farther away than that never appear in the result.

    Parameters:
    - G (networkx.Graph): The graph to traverse.
    - start_nodes (list): List of node identifiers to start BFS from. Nodes that
      are not in G are reported with a printed warning and skipped.
    - max_distance (int): Maximum distance to traverse.

    Returns:
    - distance (dict): Dictionary mapping nodes to their shortest distance from any start node.
    """
    distance = {}
    frontier = []

    # Seed level 0 with every start node that exists in the graph.
    for node in start_nodes:
        if node not in G:
            print(f"Warning: Start node {node} not in graph.")
            continue
        distance[node] = 0
        frontier.append(node)

    depth = 0
    while frontier and depth < max_distance:
        reached = depth + 1
        next_frontier = []
        for current_node in frontier:
            for neighbor in G.neighbors(current_node):
                if neighbor in distance:
                    # Already reached at this or a shallower level.
                    continue
                distance[neighbor] = reached
                next_frontier.append(neighbor)
        frontier = next_frontier
        depth = reached

    return distance

In [None]:
# Define start concepts
start_concepts = ["ice_cream"]  # Modify this list as needed

# Convert start concepts to ConceptNet URIs
start_uris = [f"/c/en/{concept.lower()}" for concept in start_concepts]

In [None]:
# Define the maximum distance for BFS
max_distance = 5  # Adjust as needed

# Perform BFS
dist_map = multi_source_bfs(G, start_uris, max_distance)

In [None]:
# prompt: use grep to find out if "/c/en/dietetic/" is in /content/conceptnet-lite.csv and print the lines if it is

!grep "/c/en/dietetic/" /content/conceptnet-lite.csv

"/a/[/r/RelatedTo/,/c/en/dietetic/a/,/c/en/diet/]",/r/RelatedTo,/c/en/dietetic/a,/c/en/diet,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc:by-sa/4.0"", ""sources"": [{""contributor"": ""/s/resource/wiktionary/en"", ""process"": ""/s/process/wikiparsec/2""}], ""weight"": 1.0}"
"/a/[/r/RelatedTo/,/c/en/dietic/a/,/c/en/dietetic/]",/r/RelatedTo,/c/en/dietic/a,/c/en/dietetic,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc:by-sa/4.0"", ""sources"": [{""contributor"": ""/s/resource/wiktionary/en"", ""process"": ""/s/process/wikiparsec/2""}], ""weight"": 1.0}"
"/a/[/r/RelatedTo/,/c/en/dietical/a/,/c/en/dietetic/]",/r/RelatedTo,/c/en/dietical/a,/c/en/dietetic,"{""dataset"": ""/d/wiktionary/en"", ""license"": ""cc:by-sa/4.0"", ""sources"": [{""contributor"": ""/s/resource/wiktionary/en"", ""process"": ""/s/process/wikiparsec/2""}], ""weight"": 1.0}"
"/a/[/r/RelatedTo/,/c/en/grahamism/n/,/c/en/dietetic/]",/r/RelatedTo,/c/en/grahamism/n,/c/en/dietetic,"{""dataset"": ""/d/wiktionary/

In [None]:
from collections import defaultdict

# Create a dictionary to count nodes at each distance
# defaultdict(int) starts every unseen distance at 0, so no key check is needed.
distance_counts = defaultdict(int)
for node, dist in dist_map.items():
    distance_counts[dist] += 1

# Print the histogram in increasing order of distance.
print("\n=== Distance Summary ===")
for d in sorted(distance_counts):
    print(f"Distance {d}: {distance_counts[d]} nodes")


=== Distance Summary ===
Distance 0: 1 nodes
Distance 1: 95 nodes
Distance 2: 3794 nodes
Distance 3: 129606 nodes
Distance 4: 402645 nodes
Distance 5: 256817 nodes


In [None]:
def get_distance(node, dist_map):
    """
    Look up how far a node is from the BFS start set.

    Parameters:
    - node (str): The ConceptNet URI of the node.
    - dist_map (dict): The distance mapping obtained from BFS.

    Returns:
    - distance (int or float): The stored distance, or float('inf') when the
      node is absent from dist_map (i.e. it was not reached).
    """
    if node in dist_map:
        return dist_map[node]
    # Absent from the map: treat as unreachable.
    return float('inf')

In [None]:
# Example: Check distance to a single node ('orchard'), currently disabled
# orchard_uri = "/c/en/orchard"
# orchard_distance = get_distance(orchard_uri, dist_map)
# print(f"\nDistance from start set to 'orchard': {orchard_distance if orchard_distance != float('inf') else 'inf'}")

# Example: Check distances for multiple nodes
# URIs must be in the canonical "/c/en/<term>" form used as graph node names.
nodes_to_check = ["/c/en/apple", "/c/en/egg", "/c/en/meat", "/c/en/dairy", "/c/en/milk", "/c/en/sausages" ]
print("\n=== Specific Node Distances ===")
for node in nodes_to_check:
    dist = get_distance(node, dist_map)
    # Nodes missing from dist_map come back as float('inf') and are shown as 'inf'.
    print(f"Distance to '{node}': {dist if dist != float('inf') else 'inf'}")


=== Specific Node Distances ===
Distance to '/c/en/apple': 3
Distance to '/c/en/egg': 2
Distance to '/c/en/meat': 3
Distance to '/c/en/dairy': 1
Distance to '/c/en/milk': 2
Distance to '/c/en/sausages': 4


In [None]:
!cp /content/conceptnet-lite[RelatedTo,FormOf,Synonym].csv /content/drive/MyDrive/NeSym