# Testing GraphSearch Distance Concepts

## Load Libraries

In [1]:
import numpy as np
import os
import sys
import json
import copy
# Put the parent directory on sys.path (once) so the project-local packages
# imported below (models.*, utils.*) can be resolved from this notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from models.srex.ranking import Ranking
from models.srex.vicinity_graph import VicinityGraph, VicinityNode
from utils.data_utils import DataUtils

## Initialize some variables

In [2]:
# Load the stop-word list through the project's DataUtils helper.
# It is handed to the Ranking constructor further down in this notebook.
stop_words_list = DataUtils.load_stopwords()

In [3]:
# Parameters used to build the ranking and its vicinity graphs.
# NOTE(review): meanings marked "presumably" are inferred from names and from
# how the values are used in this notebook — confirm against models.srex.
query                    = 'iot'    # query string handed to Ranking and VicinityGraph
nr_search_results        = 3        # passed to Ranking; equals the number of sample articles defined below
ranking_weight_type      = 'linear' # it can be: 'none', 'linear' or 'inverse'
lema                     = True     # presumably enables lemmatization during preprocessing — TODO confirm
stem                     = False    # presumably enables stemming during preprocessing — TODO confirm
summarize                = 'mean'   # it can be: 'mean' or 'median'
nr_of_graph_terms        = 5        # passed to generate_all_graphs and VicinityGraph
limit_distance           = 4        # presumably the maximum distance to a query term that is counted — TODO confirm
include_query_terms      = False    # passed to generate_all_graphs and VicinityGraph

## Calculate Ranking data structure

In [4]:
# Three sample titles that make up the whole corpus; every one mentions "IoT".
document_1 = 'Securing IoT Devices and Connecting the Dots Using REST API and Middleware'
document_2 = 'Therefore it is required to introduce a secure IoT system which doesnt allow attackers infiltration in the network through IoT devices and also to secure data in transit from IoT devices to cloud.'
document_3 = 'Internet of Things (IoT) is a fairly disruptive technology with inconceivable growth, impact, and capability.'

# One dictionary per title; article numbers are 1-based strings in listing order.
articles_list = [
    {'title': title, 'article_number': str(number)}
    for number, title in enumerate((document_1, document_2, document_3), start=1)
]

# Create the ranking for the query and feed it the sample articles.
ranking = Ranking(query, nr_search_results, ranking_weight_type, stop_words_list, lema, stem)
ranking.build_article_dictionaries_list(articles_list)

## Generate all graphs from ranking, documents and sentences

In [5]:
# Build the graphs for the ranking, its documents and their sentences,
# using the parameters set in the configuration cell above.
ranking.generate_all_graphs(nr_of_graph_terms, limit_distance, include_query_terms, summarize)

## Imprimir texto crudo y texto preprocesado de cada documento

In [6]:
# For every document, show its raw title followed by the preprocessed text
# of each of its sentences. (The enumerate index was never used, so the
# documents are iterated directly.)
for d in ranking.documents:
    print(f"Document {d.doc_id}")
    print(f"raw text: {d.title}")
    for s in d.sentences:
        print(f"preprocessed text: {s.preprocessed_text}")
    print()

Document 1
raw text: Securing IoT Devices and Connecting the Dots Using REST API and Middleware
preprocessed text: securing iot device connecting dot rest api middleware

Document 2
raw text: Therefore it is required to introduce a secure IoT system which doesnt allow attackers infiltration in the network through IoT devices and also to secure data in transit from IoT devices to cloud.
preprocessed text: required introduce secure iot system allow attacker infiltration network iot device secure data transit iot device cloud

Document 3
raw text: Internet of Things (IoT) is a fairly disruptive technology with inconceivable growth, impact, and capability.
preprocessed text: internet thing iot fairly disruptive technology inconceivable growth impact capability



## Imprimir diccionario de posiciones de términos por cada documento

In [7]:
# Custom helper that controls the printed layout of a dictionary
def custom_json_format(data):
    """Render a dict as a brace-delimited block with one entry per line.

    Each entry is written as ``  'key': <json.dumps(value)>``; entries are
    separated by a comma and a newline, and the last entry has no trailing
    comma. An empty dict is rendered as an opening and a closing brace on
    consecutive lines.

    Args:
        data: mapping whose values are serializable by ``json.dumps``.

    Returns:
        The formatted multi-line string.
    """
    entries = [f"  '{key}': {json.dumps(value)}" for key, value in data.items()]
    if not entries:
        # Nothing to list: only the two braces
        return "{\n}"
    return "{\n" + ",\n".join(entries) + "\n}"


# Print, for each document, the term-position dictionary of every sentence,
# followed by the positions of the query terms only.
for d in ranking.documents:
    print(f"Document {d.doc_id}")
    for s in d.sentences:
        # Positions of every term in the sentence
        term_positions_dict = s.get_term_positions_dict()
        # Query terms as strings, taken from the sentence's query tree
        query_terms = s.query_tree.get_query_terms_str_list_with_underscores()
        # Positions restricted to the query terms
        query_term_positions_dict = s.get_query_term_positions_dict(term_positions_dict, query_terms)
        
        print('\nPosiciones sintácticas de términos')
        print(custom_json_format(term_positions_dict))
        print('\nPosiciones sintácticas de query terms')
        print(f"{custom_json_format(query_term_positions_dict)}\n")
    print()

Document 1

Posiciones sintácticas de términos
{
  'securing': [0],
  'iot': [1],
  'device': [2],
  'connecting': [3],
  'dot': [4],
  'rest': [5],
  'api': [6],
  'middleware': [7]
}

Posiciones sintácticas de query terms
{
  'iot': [1]
}


Document 2

Posiciones sintácticas de términos
{
  'required': [0],
  'introduce': [1],
  'secure': [2, 11],
  'iot': [3, 9, 14],
  'system': [4],
  'allow': [5],
  'attacker': [6],
  'infiltration': [7],
  'network': [8],
  'device': [10, 15],
  'data': [12],
  'transit': [13],
  'cloud': [16]
}

Posiciones sintácticas de query terms
{
  'iot': [3, 9, 14]
}


Document 3

Posiciones sintácticas de términos
{
  'internet': [0],
  'thing': [1],
  'iot': [2],
  'fairly': [3],
  'disruptive': [4],
  'technology': [5],
  'inconceivable': [6],
  'growth': [7],
  'impact': [8],
  'capability': [9]
}

Posiciones sintácticas de query terms
{
  'iot': [2]
}




## Imprimir Vicinity Matrix (MAT3D) de cada documento

In [8]:
# Print the Vicinity Matrix (MAT3D) of every sentence, grouped by document
for d in ranking.documents:
    print(f"Document {d.doc_id}")
    for s in d.sentences:
        # vicinity_matrix is rendered with the dict formatter defined earlier
        print(custom_json_format(s.vicinity_matrix))
    print()

Document 1
{
  'securing': {"iot": [1, 0, 0, 0]},
  'device': {"iot": [1, 0, 0, 0]},
  'connecting': {"iot": [0, 1, 0, 0]},
  'dot': {"iot": [0, 0, 1, 0]},
  'rest': {"iot": [0, 0, 0, 1]}
}

Document 2
{
  'required': {"iot": [0, 0, 1, 0]},
  'introduce': {"iot": [0, 1, 0, 0]},
  'secure': {"iot": [1, 1, 1, 0]},
  'system': {"iot": [1, 0, 0, 0]},
  'allow': {"iot": [0, 1, 0, 1]},
  'attacker': {"iot": [0, 0, 2, 0]},
  'infiltration': {"iot": [0, 1, 0, 1]},
  'network': {"iot": [1, 0, 0, 0]},
  'device': {"iot": [2, 0, 0, 1]},
  'data': {"iot": [0, 1, 1, 0]},
  'transit': {"iot": [1, 0, 0, 1]},
  'cloud': {"iot": [0, 1, 0, 0]}
}

Document 3
{
  'internet': {"iot": [0, 1, 0, 0]},
  'thing': {"iot": [1, 0, 0, 0]},
  'fairly': {"iot": [1, 0, 0, 0]},
  'disruptive': {"iot": [0, 1, 0, 0]},
  'technology': {"iot": [0, 0, 1, 0]},
  'inconceivable': {"iot": [0, 0, 0, 1]}
}



## Imprimir Proximity Scores y Frequency Scores por cada término de los documentos

In [9]:
# Print the proximity and frequency scores of each term: one graph per
# sentence, grouped by document
for d in ranking.documents:
    print(f"Document {d.doc_id}")
    for s in d.sentences:
        # Printing the graph relies on its string representation
        print(s.get_graph())
    print()

Document 1
SUBQUERY: iot
TERM: securing ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: device ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: connecting ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: dot ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.25 ; CRITERIA: proximity
TERM: rest ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.125 ; CRITERIA: proximity
TERM: api ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.0 ; CRITERIA: frequency
TERM: middleware ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.0 ; CRITERIA: frequency

Document 2
SUBQUERY: iot
TERM: device ; FREQUENCY_SCORE: 1.5 ; PROXIMITY_SCORE: 1.59375 ; CRITERIA: proximity
TERM: secure ; FREQUENCY_SCORE: 1.5 ; PROXIMITY_SCORE: 1.3125 ; CRITERIA: proximity
TERM: transit ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.84375 ; CRITERIA: proximity
TERM: system ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: network ; FREQUENCY_SCORE: 0.75 

## Imprimir Proximity Scores y Frequency Scores del grafo asociado al Ranking

In [10]:
# Print the graph attached to the ranking as a whole
print(ranking.get_graph())

SUBQUERY: iot
TERM: device ; FREQUENCY_SCORE: 2.5 ; PROXIMITY_SCORE: 2.59375 ; CRITERIA: proximity
TERM: secure ; FREQUENCY_SCORE: 1.5 ; PROXIMITY_SCORE: 1.3125 ; CRITERIA: proximity
TERM: securing ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: transit ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.84375 ; CRITERIA: proximity
TERM: network ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: system ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.75 ; CRITERIA: proximity
TERM: data ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.5625 ; CRITERIA: proximity
TERM: connecting ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: thing ; FREQUENCY_SCORE: 0.5 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: fairly ; FREQUENCY_SCORE: 0.5 ; PROXIMITY_SCORE: 0.5 ; CRITERIA: proximity
TERM: infiltration ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.46875 ; CRITERIA: proximity
TERM: allow ; FREQUENCY_SCORE: 0.75 ; PROXIMITY_SCORE: 0.46

## Comparar Grafo del Usuario con los documentos del ranking

### Generar Grafo del Usuario

In [11]:
# This graph stands for the User Graph after the user modified it, starting
# from the Initial Graph that was built from the initial ranking.
user_graph = VicinityGraph(query, nr_of_graph_terms, limit_distance, include_query_terms, summarize)

vicinity_node_1 = VicinityNode(
    term='technology', frequency_score=1.0, proximity_score=1.0, criteria='proximity'
)
vicinity_node_2 = VicinityNode(
    term='device', frequency_score=1.0, proximity_score=1.0, criteria='proximity'
)
vicinity_node_3 = VicinityNode(
    term='disruptive', frequency_score=1.0, proximity_score=1.0, criteria='frequency'
)

# Add the nodes in the same order in which they were created
for user_node in (vicinity_node_1, vicinity_node_2, vicinity_node_3):
    user_graph.add_node(user_node)

print(user_graph)

SUBQUERY: iot
TERM: technology ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: device ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: proximity
TERM: disruptive ; FREQUENCY_SCORE: 1.0 ; PROXIMITY_SCORE: 1.0 ; CRITERIA: frequency


### Comparar Documento 1 con Grafo del Usuario

In [12]:
# Fetch the graph of document '1' and score it against the user graph.
# NOTE(review): per the method name, user_graph presumably acts as the base
# (reference) graph of the comparison — confirm in VicinityGraph.
doc1_graph = ranking.get_document_by_id('1').get_graph()
doc1_similarity_score = user_graph.get_similarity_score_as_base_graph(doc1_graph)
print(doc1_similarity_score)

0.30895170472719585


### Comparar Documento 2 con Grafo del Usuario

In [13]:
# Fetch the graph of document '2' and score it against the user graph.
# NOTE(review): per the method name, user_graph presumably acts as the base
# (reference) graph of the comparison — confirm in VicinityGraph.
doc2_graph = ranking.get_document_by_id('2').get_graph()
doc2_similarity_score = user_graph.get_similarity_score_as_base_graph(doc2_graph)
print(doc2_similarity_score)

0.2778311914457607


### Comparar Documento 3 con Grafo del Usuario

In [14]:
# Fetch the graph of document '3' and score it against the user graph.
# NOTE(review): per the method name, user_graph presumably acts as the base
# (reference) graph of the comparison — confirm in VicinityGraph.
doc3_graph = ranking.get_document_by_id('3').get_graph()
doc3_similarity_score = user_graph.get_similarity_score_as_base_graph(doc3_graph)
print(doc3_similarity_score)

0.1845086954528685
