# Testing GraphSearch Distance Concepts

## Load Libraries

In [1]:
import numpy as np
import os
import sys
import json
import copy
# Put the parent directory of the notebook's working directory on sys.path
# (once) so the project-local imports that follow can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from models.srex.ranking import Ranking
from models.srex.vicinity_graph import VicinityGraph, VicinityNode
from utils.data_utils import DataUtils

## Initialize some variables

In [2]:
# Load the stop-word list through the project's DataUtils helper.
# It is handed to the Ranking constructor in a later cell.
stop_words_list = DataUtils.load_stopwords()

In [3]:
# Experiment configuration. These values are passed to Ranking(...),
# ranking.generate_all_graphs(...) and VicinityGraph(...) in the cells below.
query                    = 'iot OR sensor'
nr_search_results        = 3
ranking_weight_type      = 'linear' # it can be: 'none', 'linear' or 'inverse'
lema                     = True     # presumably toggles lemmatization -- confirm against Ranking
stem                     = False    # presumably toggles stemming -- confirm against Ranking
summarize                = 'mean'   # it can be: 'mean' or 'median'
nr_of_graph_terms        = 5
limit_distance           = 4
include_query_terms      = False

## Calculate Ranking data structure

In [4]:
# Three toy documents; each one becomes an article whose 'title' is the text
# and whose 'article_number' is its 1-based position rendered as a string.
document_1 = 'In scientific research, sensors are considered as a prospective field for innovation.'
document_2 = 'IoT sensors are efficiently used in various IoT applications for creating a smart environment by collecting real time data.'
document_3 = 'Internet of Things (IoT) is revolutionizing our world with trillions of sensors and actuators by creating a smart environment around us.'

articles_list = [
    {'title': text, 'article_number': str(number)}
    for number, text in enumerate((document_1, document_2, document_3), start=1)
]

# Build the ranking from the configuration values, then feed it the articles.
ranking = Ranking(
    query,
    nr_search_results,
    ranking_weight_type,
    stop_words_list,
    lema,
    stem,
)
ranking.calculate_article_dictionaries_list(articles_list)

## Generate all graphs from ranking, documents and sentences

In [5]:
# Generate the graphs from the configured term count, distance limit,
# query-term inclusion flag and summary statistic.
# NOTE(review): the section title says this covers the ranking, document and
# sentence levels -- confirm in Ranking.generate_all_graphs.
ranking.generate_all_graphs(nr_of_graph_terms, limit_distance, include_query_terms, summarize)

## Imprimir texto crudo y texto preprocesado de cada documento

In [6]:
# For every document in the ranking, print its id, its raw title and the
# preprocessed text of each of its sentences, followed by a blank line.
# The documents are iterated directly: the positional index previously
# produced by enumerate() was never used.
for doc in ranking.get_documents():
    print(f"Document {doc.get_doc_id()}")
    print(f"raw text: {doc.get_title()}")
    for sentence in doc.get_sentences():
        print(f"preprocessed text: {sentence.get_preprocessed_text()}")
    print()

Document 1
raw text: In scientific research, sensors are considered as a prospective field for innovation.
preprocessed text: scientific research sensor considered prospective field innovation

Document 2
raw text: IoT sensors are efficiently used in various IoT applications for creating a smart environment by collecting real time data.
preprocessed text: iot sensor efficiently used various iot application creating smart environment collecting real time data

Document 3
raw text: Internet of Things (IoT) is revolutionizing our world with trillions of sensors and actuators by creating a smart environment around us.
preprocessed text: internet thing iot revolutionizing world trillion sensor actuator creating smart environment around



## Imprimir diccionario de posiciones de términos por cada documento

In [7]:
def custom_json_format(data):
    """Render a dict as a brace-delimited block with one entry per line.

    Each entry is written as ``  'key': <json.dumps(value)>``; entries are
    separated by a comma and a newline, and the last entry has no trailing
    comma. An empty dict renders as an opening brace, a newline and a
    closing brace.

    Args:
        data: Mapping whose values can be serialized by ``json.dumps``.

    Returns:
        The formatted multi-line string.
    """
    entries = [
        f"  '{name}': {json.dumps(payload)}"
        for name, payload in data.items()
    ]
    if not entries:
        # No entries: just the two braces separated by a single newline.
        return "{\n}"
    return "{\n" + ",\n".join(entries) + "\n}"


# Print, for every sentence of every document, the dictionary of term
# positions and the dictionary of query-term positions.
for doc in ranking.get_documents():
    print(f"Document {doc.get_doc_id()}")
    for sentence in doc.get_sentences():
        term_positions_dict = sentence.get_term_positions_dict()
        query_tree = sentence.get_query_tree()
        query_terms = query_tree.get_query_terms_str_list_with_underscores()
        query_term_positions_dict = sentence.get_query_term_positions_dict(
            term_positions_dict, query_terms
        )

        # One print call; sep="\n" yields the same stream as four separate prints.
        print(
            '\nPosiciones sintácticas de términos',
            custom_json_format(term_positions_dict),
            '\nPosiciones sintácticas de query terms',
            f"{custom_json_format(query_term_positions_dict)}\n",
            sep="\n",
        )
    print()

Document 1

Posiciones sintácticas de términos
{
  'scientific': [0],
  'research': [1],
  'sensor': [2],
  'considered': [3],
  'prospective': [4],
  'field': [5],
  'innovation': [6]
}

Posiciones sintácticas de query terms
{
  'sensor': [2]
}


Document 2

Posiciones sintácticas de términos
{
  'iot': [0, 5],
  'sensor': [1],
  'efficiently': [2],
  'used': [3],
  'various': [4],
  'application': [6],
  'creating': [7],
  'smart': [8],
  'environment': [9],
  'collecting': [10],
  'real': [11],
  'time': [12],
  'data': [13]
}

Posiciones sintácticas de query terms
{
  'iot': [0, 5],
  'sensor': [1]
}


Document 3

Posiciones sintácticas de términos
{
  'internet': [0],
  'thing': [1],
  'iot': [2],
  'revolutionizing': [3],
  'world': [4],
  'trillion': [5],
  'sensor': [6],
  'actuator': [7],
  'creating': [8],
  'smart': [9],
  'environment': [10],
  'around': [11]
}

Posiciones sintácticas de query terms
{
  'iot': [2],
  'sensor': [6]
}




## Imprimir Vicinity Matrix (MAT3D) de cada documento

In [8]:
# Print the Vicinity Matrix (MAT3D) of every sentence, grouped by document.
for doc in ranking.get_documents():
    print(f"Document {doc.get_doc_id()}")
    # Lazy generator: each matrix is fetched right before it is printed.
    matrices = (sentence.get_vicinity_matrix() for sentence in doc.get_sentences())
    for matrix in matrices:
        print(custom_json_format(matrix))
    print()

Document 1
{
  'scientific': {"sensor": [0, 1, 0, 0]},
  'research': {"sensor": [1, 0, 0, 0]},
  'considered': {"sensor": [1, 0, 0, 0]},
  'prospective': {"sensor": [0, 1, 0, 0]},
  'field': {"sensor": [0, 0, 1, 0]},
  'innovation': {"sensor": [0, 0, 0, 1]}
}

Document 2
{
  'efficiently': {"iot": [0, 1, 1, 0], "sensor": [1, 0, 0, 0]},
  'used': {"iot": [0, 1, 1, 0], "sensor": [0, 1, 0, 0]},
  'various': {"iot": [1, 0, 0, 1], "sensor": [0, 0, 1, 0]},
  'application': {"iot": [1, 0, 0, 0]},
  'creating': {"iot": [0, 1, 0, 0]},
  'smart': {"iot": [0, 0, 1, 0]},
  'environment': {"iot": [0, 0, 0, 1]}
}

Document 3
{
  'internet': {"iot": [0, 1, 0, 0]},
  'thing': {"iot": [1, 0, 0, 0]},
  'revolutionizing': {"iot": [1, 0, 0, 0], "sensor": [0, 0, 1, 0]},
  'world': {"iot": [0, 1, 0, 0], "sensor": [0, 1, 0, 0]},
  'trillion': {"iot": [0, 0, 1, 0], "sensor": [1, 0, 0, 0]},
  'actuator': {"sensor": [1, 0, 0, 0]},
  'creating': {"sensor": [0, 1, 0, 0]},
  'smart': {"sensor": [0, 0, 1, 0]},
  'e

## Imprimir Proximity Scores y Frequency Scores por cada término de los documentos

In [9]:
# Print, for every sentence of every document, the graph associated with the
# configured query; its printed form lists each term with its frequency
# score, proximity score and criteria.
# The subquery comes from `query` (set in the configuration cell) instead of a
# repeated string literal, so changing the configuration cannot leave this
# cell looking up a stale subquery.
# NOTE(review): assumes the full query string is a valid subquery key, as it
# is for 'iot OR sensor' -- confirm for compound queries.
for doc in ranking.get_documents():
    print(f"Document {doc.get_doc_id()}")
    for sentence in doc.get_sentences():
        print(sentence.get_graph_by_subquery(query))
    print()

Document 1
SUBQUERY: iot OR sensor
TERM: research ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: considered ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: prospective ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: scientific ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: field ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.25 ; CRITERIA: proximity
TERM: innovation ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.125 ; CRITERIA: proximity

Document 2
SUBQUERY: iot OR sensor
TERM: various ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.84375 ; CRITERIA: proximity
TERM: efficiently ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: application ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: used ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.5625 ; CRITERIA: proximity
TERM: creating ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.375 ; CRITERIA: 

## Imprimir Proximity Scores y Frequency Scores del grafo asociado al Ranking

In [10]:
# Print the graph attached to the ranking; its printed form lists each term
# with its frequency score, proximity score and criteria.
print(ranking.get_graph())

SUBQUERY: iot OR sensor
TERM: research ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: considered ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: various ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.84375 ; CRITERIA: proximity
TERM: application ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: efficiently ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: creating ; FREQUENCY_SCORE: 1.25 ; PROXIMITY_SCORE: 0.625 ; CRITERIA: proximity
TERM: used ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.5625 ; CRITERIA: proximity
TERM: prospective ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: scientific ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: trillion ; FREQUENCY_SCORE: 0.5 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: thing ; FREQUENCY_SCORE: 0.5 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: actuator ; FREQUENCY_SCORE: 0.5 ; PR

## Comparar Grafo del Usuario con los documentos del ránking

### Generar Grafo del Usuario

In [11]:
# This graph stands for the user graph after the user modified it, starting
# from the initial graph that was built from the initial ranking.

user_graph = VicinityGraph(query, nr_of_graph_terms, limit_distance, include_query_terms, summarize)

# One node per user-selected term, each with frequency and proximity scores of 1.0.
vicinity_node_1, vicinity_node_2, vicinity_node_3 = (
    VicinityNode(term=term, frequency_score=1.0, proximity_score=1.0, criteria='proximity')
    for term in ('environment', 'smart', 'scientific')
)
for node in (vicinity_node_1, vicinity_node_2, vicinity_node_3):
    user_graph.add_node(node)
print(user_graph)

SUBQUERY: iot OR sensor
TERM: environment ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: smart ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: scientific ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity


### Comparar Documento 1 con Grafo del Usuario

In [12]:
# Fetch the graph of document '1' and compute its similarity score against
# user_graph (the method name suggests user_graph is the base of the comparison).
doc1_graph = ranking.get_document_by_id('1').get_graph()
doc1_similarity_score = user_graph.get_similarity_score_as_base_graph(doc1_graph)
# NOTE(review): the recorded output also contains "cosine_of_prox_angle" and
# "cosine_of_freq_angle" lines that this cell does not print itself --
# presumably emitted inside get_similarity_score_as_base_graph; confirm.
print(doc1_similarity_score)

cosine_of_prox_angle: 0.1797866299901979
cosine_of_freq_angle: 0.0
0.1797866299901979


### Comparar Documento 2 con Grafo del Usuario

In [13]:
# Fetch the graph of document '2' and compute its similarity score against
# user_graph (the method name suggests user_graph is the base of the comparison).
doc2_graph = ranking.get_document_by_id('2').get_graph()
doc2_similarity_score = user_graph.get_similarity_score_as_base_graph(doc2_graph)
# NOTE(review): the recorded output also contains "cosine_of_prox_angle" and
# "cosine_of_freq_angle" lines that this cell does not print itself --
# presumably emitted inside get_similarity_score_as_base_graph; confirm.
print(doc2_similarity_score)

cosine_of_prox_angle: 0.10619884881071831
cosine_of_freq_angle: 0.0
0.10619884881071832


### Comparar Documento 3 con Grafo del Usuario

In [14]:
# Fetch the graph of document '3' and compute its similarity score against
# user_graph (the method name suggests user_graph is the base of the comparison).
doc3_graph = ranking.get_document_by_id('3').get_graph()
doc3_similarity_score = user_graph.get_similarity_score_as_base_graph(doc3_graph)
# NOTE(review): the recorded output also contains "cosine_of_prox_angle" and
# "cosine_of_freq_angle" lines that this cell does not print itself --
# presumably emitted inside get_similarity_score_as_base_graph; confirm.
print(doc3_similarity_score)

cosine_of_prox_angle: 0.09853292781642932
cosine_of_freq_angle: 0.0
0.09853292781642932
