# Testing GraphSearch Distance Concepts

## Load Libraries

In [1]:
import numpy as np
import os
import sys
import json
# Put the notebook's parent directory on sys.path (only once) so that the
# project packages imported right after this (models, utils) can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from models.srex.ranking import Ranking
from utils.data_utils import DataUtils

## Initialize some variables

In [2]:
# Load the stop words through the project's DataUtils helper; the result is
# handed to the Ranking constructor further down.
# NOTE(review): the exact return type of load_stopwords() is not visible here — confirm.
stop_words_list = DataUtils.load_stopwords()

In [3]:
# --- Search settings (passed to Ranking) ---
query = 'iot AND sensor'
nr_search_results = 3
# Accepted values: 'none', 'linear' or 'inverse'.
ranking_weight_type = 'linear'
lema = True
stem = False

# --- Graph settings (passed to generate_all_graphs) ---
# Accepted values: 'mean' or 'median'.
summarize = 'mean'
nr_of_graph_terms = 5
limit_distance = 4
include_query_terms = False

## Calculate Ranking data structure

In [4]:
# Three short sample texts used as the titles of the toy articles.
document_1 = 'Furthermore, after analyzing different sensor applications, this article enlightens which IoT application requires which type of sensor'
document_2 = 'IoT sensors are efficiently used in various IoT applications for creating a smart environment by collecting real time data.'
document_3 = 'Internet of Things (IoT) is revolutionizing our world with trillions of sensors and actuators by creating a smart environment around us.'

# One dictionary per article; article numbers are the strings '1', '2', '3'.
articles_list = [
    {'title': text, 'article_number': str(number)}
    for number, text in enumerate((document_1, document_2, document_3), start=1)
]

# Create the Ranking for the configured query and options, then feed it the
# sample articles defined above.
ranking = Ranking(
    query,
    nr_search_results,
    ranking_weight_type,
    stop_words_list,
    lema,
    stem,
)
ranking.build_article_dictionaries_list(articles_list)

## Generate all graphs from ranking, documents and sentences

In [5]:
# Generate the graphs for the ranking, its documents and their sentences,
# using the graph settings from the configuration cell.
ranking.generate_all_graphs(
    nr_of_graph_terms,
    limit_distance,
    include_query_terms,
    summarize,
)

## Print the raw text and the preprocessed text of each document

In [6]:
# For each document, print its raw title followed by the preprocessed text of
# every sentence. The documents are iterated directly (no enumerate) because the
# position index was never used, matching the other print loops in this notebook.
for document in ranking.documents:
    print(f"Document {document.doc_id}")
    print(f"raw text: {document.title}")
    for sentence in document.sentences:
        print(f"preprocessed text: {sentence.preprocessed_text}")
    print()

Document 1
raw text: Furthermore, after analyzing different sensor applications, this article enlightens which IoT application requires which type of sensor
preprocessed text: analyzing different sensor application article enlightens iot application requires type sensor

Document 2
raw text: IoT sensors are efficiently used in various IoT applications for creating a smart environment by collecting real time data.
preprocessed text: iot sensor efficiently used various iot application creating smart environment collecting real time data

Document 3
raw text: Internet of Things (IoT) is revolutionizing our world with trillions of sensors and actuators by creating a smart environment around us.
preprocessed text: internet thing iot revolutionizing world trillion sensor actuator creating smart environment around



## Print the term positions dictionary of each document

In [7]:
def custom_json_format(data):
    """Render a dictionary as a brace-delimited block with one entry per line.

    Each entry is written as two spaces, the key wrapped in single quotes, a
    colon, and the value serialized with ``json.dumps``. Entries are separated
    by a comma and a newline; the last entry has no trailing comma.

    Args:
        data: Dictionary whose values are JSON-serializable.

    Returns:
        The formatted multi-line string. An empty dictionary yields "{\\n}".
    """
    entries = [f"  '{key}': {json.dumps(value)}" for key, value in data.items()]
    if not entries:
        # Nothing to list: only the opening and closing braces remain.
        return "{\n}"
    return "{\n" + ",\n".join(entries) + "\n}"


# For every sentence of every document, print the term -> positions dictionary
# and then the positions dictionary computed for the query terms only.
for document in ranking.documents:
    print(f"Document {document.doc_id}")
    for sentence in document.sentences:
        all_positions = sentence.get_term_positions_dict()
        query_term_names = sentence.query_tree.get_query_terms_str_list_with_underscores()
        query_positions = sentence.get_query_term_positions_dict(all_positions, query_term_names)

        print('\nPosiciones sintácticas de términos')
        print(custom_json_format(all_positions))
        print('\nPosiciones sintácticas de query terms')
        print(f"{custom_json_format(query_positions)}\n")
    print()

Document 1

Posiciones sintácticas de términos
{
  'analyzing': [0],
  'different': [1],
  'sensor': [2, 10],
  'application': [3, 7],
  'article': [4],
  'enlightens': [5],
  'iot': [6],
  'requires': [8],
  'type': [9]
}

Posiciones sintácticas de query terms
{
  'iot': [6],
  'sensor': [2, 10]
}


Document 2

Posiciones sintácticas de términos
{
  'iot': [0, 5],
  'sensor': [1],
  'efficiently': [2],
  'used': [3],
  'various': [4],
  'application': [6],
  'creating': [7],
  'smart': [8],
  'environment': [9],
  'collecting': [10],
  'real': [11],
  'time': [12],
  'data': [13]
}

Posiciones sintácticas de query terms
{
  'iot': [0, 5],
  'sensor': [1]
}


Document 3

Posiciones sintácticas de términos
{
  'internet': [0],
  'thing': [1],
  'iot': [2],
  'revolutionizing': [3],
  'world': [4],
  'trillion': [5],
  'sensor': [6],
  'actuator': [7],
  'creating': [8],
  'smart': [9],
  'environment': [10],
  'around': [11]
}

Posiciones sintácticas de query terms
{
  'iot': [2],
  'se

## Print the Vicinity Matrix (MAT3D) of each document

In [8]:
# Print the vicinity matrix (MAT3D) of every sentence, grouped by document.
for document in ranking.documents:
    print(f"Document {document.doc_id}")
    for sentence in document.sentences:
        matrix_text = custom_json_format(sentence.vicinity_matrix)
        print(matrix_text)
    print()

Document 1
{
  'analyzing': {"sensor": [0, 1, 0, 0]},
  'different': {"sensor": [1, 0, 0, 0]},
  'application': {"iot": [1, 0, 1, 0], "sensor": [1, 0, 1, 0]},
  'article': {"iot": [0, 1, 0, 0], "sensor": [0, 1, 0, 0]},
  'enlightens': {"iot": [1, 0, 0, 0], "sensor": [0, 0, 1, 0]},
  'requires': {"iot": [0, 1, 0, 0], "sensor": [0, 1, 0, 0]},
  'type': {"iot": [0, 0, 1, 0], "sensor": [1, 0, 0, 0]}
}

Document 2
{
  'efficiently': {"iot": [0, 1, 1, 0], "sensor": [1, 0, 0, 0]},
  'used': {"iot": [0, 1, 1, 0], "sensor": [0, 1, 0, 0]},
  'various': {"iot": [1, 0, 0, 1], "sensor": [0, 0, 1, 0]},
  'application': {"iot": [1, 0, 0, 0]},
  'creating': {"iot": [0, 1, 0, 0]},
  'smart': {"iot": [0, 0, 1, 0]},
  'environment': {"iot": [0, 0, 0, 1]}
}

Document 3
{
  'internet': {"iot": [0, 1, 0, 0]},
  'thing': {"iot": [1, 0, 0, 0]},
  'revolutionizing': {"iot": [1, 0, 0, 0], "sensor": [0, 0, 1, 0]},
  'world': {"iot": [0, 1, 0, 0], "sensor": [0, 1, 0, 0]},
  'trillion': {"iot": [0, 0, 1, 0], "sens

## Print the Proximity Scores and Frequency Scores of each term in the documents

In [9]:
# Print the graph of every sentence, grouped by document; the printed graph
# lists each term with its frequency score and proximity score.
for document in ranking.documents:
    print(f"Document {document.doc_id}")
    for sentence in document.sentences:
        sentence_graph = sentence.get_graph()
        print(sentence_graph)
    print()

Document 1
SUBQUERY: iot AND sensor
TERM: application ; FREQUENCY_SCORE: 2.0 ; PROXIMITY_SCORE: 1.25 ; CRITERIA: proximity
TERM: enlightens ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: type ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: different ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: article ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: requires ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: analyzing ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity

Document 2
SUBQUERY: iot AND sensor
TERM: various ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.84375 ; CRITERIA: proximity
TERM: efficiently ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: application ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: used ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.5625 ; CRITERIA: proximi

## Print the Proximity Scores and Frequency Scores of the graph associated with the Ranking

In [10]:
# Print the graph held by the ranking itself (as opposed to the per-sentence graphs).
ranking_graph = ranking.get_graph()
print(ranking_graph)

SUBQUERY: iot AND sensor
TERM: application ; FREQUENCY_SCORE: 2.75 ; PROXIMITY_SCORE: 2.0 ; CRITERIA: proximity
TERM: enlightens ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: type ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: different ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: various ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.84375 ; CRITERIA: proximity
TERM: efficiently ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: creating ; FREQUENCY_SCORE: 1.25 ; PROXIMITY_SCORE: 0.625 ; CRITERIA: proximity
TERM: used ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.5625 ; CRITERIA: proximity
TERM: article ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: requires ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: analyzing ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: thing ; FREQUENCY_SCORE: 0.5 ; PROXIMITY_