# Summary Of the Different Algorithms Developed

We have created different algorithms for a "machine" to emulate the behavior of humans, 3 in total.

In this notebook we provide the code for each of them

## Import Libraries and Useful Functions used Among All Algorithms

In [1]:
import scipy
import numpy as np

import sys
import os
import pandas as pd
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout
from urllib.parse import unquote
from sentence_transformers import SentenceTransformer
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import time
import warnings
from tqdm import TqdmWarning
warnings.filterwarnings('ignore', category=TqdmWarning)

from data_readers import *

import machine_searchers
import time

In [2]:
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Function to get embeddings using sentence transformer
def get_embedding(text):
    return model.encode(text, convert_to_tensor=True)

# Function to perform L2 normalization on the embeddings
def l2_normalize(tensor):
    return tensor / tensor.norm(p=2, dim=0, keepdim=True)

# Function to calculate semantic similarity between two pieces of text
def semantic_similarity(word1, word2):
    embedding1 = get_embedding(word1)
    embedding2 = get_embedding(word2)

    # L2 normalization of the embeddings (to make sure, although embedding should already be normalized)
    embedding1_normalized = l2_normalize(embedding1)
    embedding2_normalized = l2_normalize(embedding2)

    # Compute and return the similarity of normalized tensors
    return torch.dot(embedding1_normalized, embedding2_normalized).item()

### Read Links Graph

In [3]:
G = read_wikispeedia_graph()
pagerank = nx.pagerank(G)

### Read Finished Paths File

In [4]:
finished_paths = read_finished_paths()
unique_paths = finished_paths[['first_article', 'last_article']].drop_duplicates()
sources = unique_paths['first_article']
targets = unique_paths['last_article']
unique_paths.sort_values(by=['first_article', 'last_article'], inplace=True)
unique_paths.reset_index(inplace=True, drop=True)

df = finished_paths[['first_article', 'last_article']].copy()
df['path'] = df['first_article'] + '_' + df['last_article']
df['path_count'] = df.groupby('path')['path'].transform('count')
df.drop_duplicates(subset='path', inplace=True)
df = df.sort_values('path_count', ascending = False)
df = df[df['path_count']>2][['first_article', 'last_article', 'path_count']]
df.reset_index(drop=True, inplace=True)
df.to_csv('paths_sample.csv')
df.head()

Unnamed: 0,first_article,last_article,path_count
0,Asteroid,Viking,1043
1,Brain,Telephone,1040
2,Theatre,Zebra,905
3,Pyramid,Bean,642
4,Batman,Wood,148


# 1. Algorithm Carlos (DELETE THIS AND THINK OF A COOL NAME TO REFER TO IT) 

1. **Starting Point**
   - Begin at the initial Wikipedia article.
<br><br><br>

2. **Evaluation of Links**
   - Analyze links in the current article.
<br><br><br>

3. **PageRank and Topic Similarity Assessment**
   - Prioritize links with the highest PageRank if topics of linked pages are not similar to the target article.
   - Make a balance between PageRank and topical similarity if topics are somewhat similar.
   - Increase focus on topical similarity as it becomes more similar to the target.
<br><br><br>

4. **Decision to Move Forward or Backward**
   - Return to the previous page if all new links are less promising than the previous page (worse in PageRank and similarity).
   - If not returning, avoid revisiting the same page to prevent loops.
<br><br><br>

5. **Limit on Page Visits**
   - Stop if 20 different pages are visited without reaching the target article.
<br><br><br>

6. **End Goal**
   - Continue the process until the target Wikipedia article is reached or the page visit limit is hit.


In [6]:
def get_value(G, node_value, target_value):
    """
    Calculate a value for a node based on its semantic similarity to the target and its PageRank.

    Parameters:
    G (networkx.Graph): The graph the node is part of.
    node_value (str): The value of the current node.
    target_value (str): The value of the target node.

    Returns:
    float: A calculated value for the node.
    """
    # Calculate semantic similarity between the node and the target
    similarity = semantic_similarity(node_value, target_value)

    # Get PageRank of the node in graph G
    node_pagerank = pagerank.get(node_value, None)

    # Calculate the final value based on similarity and PageRank
    if similarity < 0.1:
        f = node_pagerank
    elif 0.1 <= similarity <= 0.5:
        f = similarity * node_pagerank
    else:
        f = similarity
    return f

def ai_1(graph, start_node, target_node):
    """
    Algorithm to find a path in a graph from start_node to target_node.

    Parameters:
    graph (networkx.Graph): The graph to traverse.
    start_node (str): The starting node in the graph.
    target_node (str): The target node to reach in the graph.

    Returns:
    tuple: A tuple containing the number of moves, the visited nodes list, and a flag indicating if the target was reached.
    """
    # Initialize the starting node and visited nodes list
    current_node = start_node
    visited = []  # List to keep track of visited nodes
    previous_node = start_node
    reached_target = False
    print(f"Starting at node: {current_node}")

    # Iterate up to a maximum of 20 moves
    for length in range(20):
        # Check if the current node is the target
        if current_node == target_node:
            print(f"Target node reached in {length} moves.")
            visited.append(previous_node)
            visited.append(current_node)
            reached_target = True
            return length+1, visited, reached_target

        # Mark the previous node as visited (except for the first move)
        if length != 0:
            visited.append(previous_node)
        
        # Update the previous node
        previous_node = current_node

        # Get unvisited neighbors of the current node
        neighbors = list(graph.neighbors(current_node))
        unvisited_neighbors = [n for n in neighbors if n not in visited and n != current_node]

        # Choose the next node based on calculated value
        if unvisited_neighbors:
            next_node = max(unvisited_neighbors, key=lambda n: get_value(G, n, target_node))
            current_node = next_node
            print(f"Moving to node: {current_node}")
        else:
            # Exit if there are no unvisited neighbors
            print("No more unvisited neighbors to move to.")
            return length+1, visited, reached_target

    # If the loop exits due to reaching the move limit
    print("Limit of 20 nodes reached.")
    return length+1, visited, reached_target

# Note: The function `semantic_similarity` and the variable `pagerank` need to be defined.


In [7]:
ai_1(G, 'Zebra', 'Bean');

Starting at node: Zebra
Moving to node: Animal
Moving to node: Latin
Moving to node: United_States
Moving to node: Time_zone
Moving to node: France
Moving to node: List_of_countries_by_system_of_government
Moving to node: People%27s_Republic_of_China
Moving to node: English_language
Moving to node: German_language
Moving to node: United_Kingdom
Moving to node: India
Moving to node: Japan
Moving to node: Vegetable
Moving to node: Bean
Target node reached in 14 moves.


# 2. Algorithm Carol (DELETE THIS AND THINK OF A COOL NAME TO REFER TO IT) 

1. **Starting Point**
   - Begin at the initial node in the network.
   <br><br><br>

2. **Exploring Connections**
   - Examine the connections (successors) of the current node.
   <br><br><br>

3. **Semantic Similarity and PageRank Assessment**
   - For each connection, calculate its semantic similarity to the target node.
   - Obtain the PageRank for each connection.
   - Store and compare these values for decision-making.
   <br><br><br>

4. **Choosing the Next Node**
   - If a connection's semantic similarity is above a reference threshold, prioritize moving to the node with the highest similarity.
   - Otherwise, move to the node with the highest PageRank.
   - Skip nodes that have already been visited to prevent loops.
   <br><br><br>

5. **Random Selection as a Fallback**
   - If there are no suitable nodes based on similarity or PageRank, choose a random successor to move to.
   <br><br><br>

6. **Limit on Node Visits**
   - Terminate the process if 25 nodes are visited without reaching the target.
   <br><br><br>

7. **End Goal**
   - Continue the process until the target node is reached or the node visit limit is hit.
   <br><br><br>

In [8]:
def ai_2(G: nx.Graph, source: str, target: str, ref_similarity=0.3):
   # Initialize visited nodes set, children lists, and path
   visited = set([])
   current_children = []
   sem_sim_childr = {}
   max_page_childr = {}
   path = []

   # Set the current node to the source
   current_node = source 

   # Flag to check if target is found
   found = False

   print(f"Starting at node: {current_node}")

   # Loop until the target is found or limit is reached
   while not found:
      # Mark the current node as visited and add to the path
      visited.add(current_node)
      path.append(current_node)
   
      # Check if the path length limit is reached
      if len(path) >= 25:
         print("Limit of 25 nodes reached.")
         return source, target, found, len(path), path

      # Check if the target is reached
      if current_node == target:
         found = True
         print(f"Moving to node: {current_node}")
         print(f"Target node reached in {len(path)} moves.")
         return source, target, found, len(path), path,

      # Get the children (successors) of the current node
      current_children = list(G.successors(current_node))
      
      # Reset the dictionaries for storing similarities and pageranks
      sem_sim_childr = {}
      max_page_childr = {}

      # Iterate over children to calculate similarities and pageranks
      for c in current_children:
         # Check if the child is the target
         if c == target:
               found = True
               visited.add(c)
               path.append(c)
               print(f"Moving to node: {c}")
               print(f"Target node reached in {len(path)} moves.")
               return source, target, found, len(path), path,

         # Skip visited nodes
         elif c in visited:
               current_children.remove(c)
         else:
               # Compute semantic similarity
               semsim = semantic_similarity(c, target)
               sem_sim_childr[c] = semsim

               # Compute pagerank
               #pagerank = G.nodes[c]['pagerank']
               max_page_childr[c] = pagerank.get(c, None)

      # Choose the next node based on similarity or pagerank
      if sem_sim_childr:
         # Get the node with the maximum similarity
         max_node = max(sem_sim_childr, key=sem_sim_childr.get)
         max_sim = sem_sim_childr[max_node]
         if max_sim >= ref_similarity:
               # Move to the node with the highest similarity
               current_node = max_node
         else:
               # Move to the node with the highest pagerank
               max_node = max(max_page_childr, key=max_page_childr.get)
               current_node = max_node
      else:
         # Choose a random successor if no suitable node is found
         current_children = list(G.successors(current_node))
         current_node = random.choice(current_children)
   
      print(f"Moving to node: {current_node}")

# Note: The function `semantic_similarity` needs to be defined.


In [9]:
ai_2(G, 'Zebra', 'Bean');

Starting at node: Zebra
Moving to node: Lion
Moving to node: Tiger
Moving to node: Bear
Moving to node: Forest
Moving to node: Tree
Moving to node: Pea
Moving to node: Bean
Target node reached in 8 moves.


## Main Differences Between Algorithms 1 and 2

Both algorithms navigate through networks (like Wikipedia articles) using PageRank and semantic similarity, but they have distinct approaches:

### AI #1: The Wide Searcher
- **Primary Focus**: Prioritizes PageRank initially, shifting towards semantic similarity as it becomes more relevant.
- **Backtracking Mechanism**: Can backtrack to the previous node if new connections are less promising.
- **Node Visit Limit**: Stops after exploring 20 nodes to prevent extensive wandering.
- **Usage Scenario**: Gives more importance to the semantic similarity.

### AI #2: The Dynamic Navigator
- **Balanced Approach**: Dynamically balances between PageRank and semantic similarity from the start.
- **Loop Prevention**: Avoids revisiting nodes to prevent loops, without backtracking.
- **Extended Exploration**: Allows for exploration up to 25 nodes, providing a more extended search.
- **Usage Scenario**: Explores nodes with high PageRank until the similarity is high.

The key difference lies in their threshold for when to start focusing on similarity. AI #1 starts already looking at similarity for similarities > 0.1, while AI #2 starts looking at similarity when it is higher, >0.3. 


# 3. Algorithm Nico (DELETE THIS AND THINK OF A COOL NAME TO REFER TO IT) 

In [10]:
def modded_get_embedding(text: str):
    temp_str = text.replace('_', ' ')
    temp_str = unquote(temp_str)
    inputs = tokenizer(temp_str, return_tensors="pt")
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

def distance_two_words(w1: str, w2: str):
    """Receives a string that was in the wikispeedia dataset, and transforms it as needed to work
    with the berd embeddings."""

    embedding1 = modded_get_embedding(w1)
    embedding2 = modded_get_embedding(w2)
    similarity = cosine_similarity(embedding1.detach().numpy(), embedding2.detach().numpy())[0][0]
    # Adding absolute, just in case it is needed
    # Similarity is actually 1 - abs(similarity) + 1,
    # As we want closer words to have a smaller distance
    # The last plus one is to indicate that there would be an extra cost to exploring, as if not the system often
    # thinks that there are nodes that have a distance of 0.5 or something like that
    similarity = 1 - abs(similarity) + 1
    # print("First word:", w1, ". Second word:", w2, ". GoodDistance:", similarity)
    return similarity

start_time = time.time()
lib_path_1, lib_explore_1 = machine_searchers.modded_astar_path(wikispeedia, 'Actor', 'Japan', heuristic=distance_two_words)
end_time = time.time()

# It's len - 1 because the target node is also included, and that node wasn't explored
print("Using the modded a star that returns explored nodes:")
print(" Found solution for Actor to Japan exploring the following number of nodes:", len(lib_explore_1)-1)
print(" Found it in:", end_time-start_time)

start_time = time.time()
lib_path_2, lib_explore_2 = machine_searchers.only_depth_first_astar_path(wikispeedia, 'Actor', 'Japan', heuristic=distance_two_words)
end_time = time.time()

# It's len - 1 because the target node is also included, and that node wasn't explored
print("Using depth first only A star that returns explored nodes:")
print(" Found solution for Actor to Japan exploring the following number of nodes:", len(lib_explore_1)-1)
print(" Found it in:", end_time-start_time)


NameError: name 'wikispeedia' is not defined

In [15]:
len(finished_paths['path'])

51318

In [19]:
terms = finished_paths['path'].str.split(';').explode()
term_counts = terms.value_counts().reset_index()
term_counts.columns = ['Term', 'Count']
term_counts = term_counts.drop(index=0)
term_counts

Unnamed: 0,Term,Count
1,United_States,8896
2,Europe,4362
3,United_Kingdom,3904
4,England,3332
5,Earth,3223
...,...,...
4165,Introduction_to_special_relativity,1
4166,Six-party_talks_concerning_North_Korea%27s_nuc...,1
4167,Anton%C3%ADn_Dvo%C5%99%C3%A1k,1
4168,Colditz_Castle,1


In [8]:
categories = read_categories()
categories

Unnamed: 0,article,categories
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
2,%C3%85land,subject.Countries
3,%C3%85land,subject.Geography.European_Geography.European_...
4,%C3%89douard_Manet,subject.People.Artists
...,...,...
5199,Zirconium,subject.Science.Chemistry.Chemical_elements
5200,Zoroaster,subject.People.Religious_figures_and_leaders
5201,Zuid-Gelders,subject.Geography.European_Geography
5202,Zuid-Gelders,subject.Language_and_literature.Languages


In [10]:
categories[categories['article'] == term]['categories'].values[0]

'subject.Geography.Peoples'

In [12]:
# Initialize a network graph
G = nx.Graph()

# Add nodes with counts and edges with weights
for path in finished_paths['path']:
    terms = path.split(';')

    # Add nodes and update count
    for term in terms:
        if G.has_node(term):
            G.nodes[term]['count'] += 1
        else:
            G.add_node(term, count=1)

    # Add edges and update weights
    for i in range(len(terms)):
        for j in range(i + 1, len(terms)):
            if G.has_edge(terms[i], terms[j]):
                G[terms[i]][terms[j]]['weight'] += 1
            else:
                G.add_edge(terms[i], terms[j], weight=1)

for node in G.nodes:
    # Find the category for the node
    category = categories[categories['article'] == node]['categories'].values[0] if not categories[categories['article'] == node].empty else 'Unknown'
    
    # Assign the category to the node
    G.nodes[node]['category'] = category

# Extracting node data with counts
nodes_with_counts = [(node, G.nodes[node]['count']) for node in G.nodes]

# Extracting edge data with weights
edges_with_weights = [(u, v, G[u][v]['weight']) for u, v in G.edges]

nodes_with_counts[:5], edges_with_weights[:5]  # Display the first few rows of nodes and edges data


([('14th_century', 154),
  ('15th_century', 187),
  ('16th_century', 225),
  ('Pacific_Ocean', 844),
  ('Atlantic_Ocean', 1324)],
 [('14th_century', '15th_century', 67),
  ('14th_century', '16th_century', 48),
  ('14th_century', 'Pacific_Ocean', 1),
  ('14th_century', 'Atlantic_Ocean', 4),
  ('14th_century', 'Accra', 1)])

In [6]:
edges_with_weights

[('14th_century', '15th_century', 67),
 ('14th_century', '16th_century', 48),
 ('14th_century', 'Pacific_Ocean', 1),
 ('14th_century', 'Atlantic_Ocean', 4),
 ('14th_century', 'Accra', 1),
 ('14th_century', 'Africa', 5),
 ('14th_century', 'Atlantic_slave_trade', 6),
 ('14th_century', 'African_slave_trade', 4),
 ('14th_century', 'Europe', 27),
 ('14th_century', 'Niger', 1),
 ('14th_century', 'Nigeria', 1),
 ('14th_century', 'British_Empire', 5),
 ('14th_century', 'Slavery', 2),
 ('14th_century', 'Renaissance', 17),
 ('14th_century', 'Ancient_Greece', 8),
 ('14th_century', 'Greece', 1),
 ('14th_century', 'Italy', 4),
 ('14th_century', 'Roman_Catholic_Church', 3),
 ('14th_century', 'HIV', 1),
 ('14th_century', 'Ronald_Reagan', 1),
 ('14th_century', 'President_of_the_United_States', 3),
 ('14th_century', 'John_F._Kennedy', 2),
 ('14th_century', 'North_America', 3),
 ('14th_century', 'United_States', 7),
 ('14th_century', 'China', 4),
 ('14th_century', 'Gunpowder', 2),
 ('14th_century', 'Fir