# Testing GraphSearch Distance Concepts

## Load Libraries

In [1]:
import numpy as np
import os
import sys
import json
# Make the parent of the current working directory importable so that the
# project packages imported below (`models`, `utils`) can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from models.srex.ranking import Ranking
from utils.data_utils import DataUtils

## Initialize some variables

In [2]:
# Load the stop-word list (passed to Ranking as `stop_words` further below)
stop_words_list = DataUtils.load_stopwords()
# JSON file holding the search results to analyse; later cells read its
# "query_terms", "topic" and "results" entries.
results_filename = 'example_test_sim-367-lmsingletermsmartuser.json'
results_data: dict = DataUtils.load_json_file(results_filename)

In [3]:
# Parameters used below to build the Ranking and its graphs.
# - query, nr_search_results, ranking_weight_type, lema and stem are passed to
#   the Ranking constructor (as query_text, nr_search_results,
#   ranking_weight_type, lemmatization and stemming).
# - nr_of_graph_terms, limit_distance, include_query_terms and summarize are
#   passed, in that order, to ranking.generate_all_graphs.
query                    = results_data["query_terms"]
nr_search_results        = 100
ranking_weight_type      = 'linear' # it can be: 'none', 'linear' or 'inverse'
lema                     = True
stem                     = False
summarize                = 'mean'   # it can be: 'mean' or 'median'
nr_of_graph_terms        = 5
limit_distance           = 4 
include_query_terms      = False

## Calculate Ranking data structure

In [4]:
# Pull the article entries out of the loaded results dict.
articles_list = DataUtils.extract_articles_from_json_results(results_data)
# Debug aid: uncomment to inspect the extracted articles.
# print(json.dumps(articles_list, indent=2))

# Create the Ranking from the configuration above, then feed it the articles.
ranking = Ranking(
    query_text=query,
    nr_search_results=nr_search_results,
    ranking_weight_type=ranking_weight_type,
    stop_words=stop_words_list,
    lemmatization=lema,
    stemming=stem,
)
ranking.build_article_dictionaries_list(articles_list)

## Generate all graphs from ranking, documents and sentences

In [5]:
# Build the graphs of the ranking with the parameters configured above.
# Arguments are positional: nr_of_graph_terms, limit_distance,
# include_query_terms, summarize.
ranking.generate_all_graphs(nr_of_graph_terms, limit_distance, include_query_terms, summarize)

## Imprimir texto crudo y texto preprocesado de cada documento

In [6]:
# Print, for every document of the ranking, its id, its raw text and the
# preprocessed text of each of its sentences, followed by a blank line.
# The loop index was never used, so the documents are iterated directly.
for d in ranking.documents:
    print(f"Document {d.doc_id}")
    # NOTE(review): the "raw text" line prints `d.title`; the captured output
    # suggests this attribute holds the full article text -- confirm.
    print(f"raw text: {d.title}")
    for s in d.sentences:
        print(f"preprocessed text: {s.preprocessed_text}")
    print()

Document XIE19990612.0182
raw text: 1999-06-12 06:19  UAE Music Piracy Level Lowest in Middle East ABU DHABI, June 12 (Xinhua) -- The United Arab Emirates (UAE)has the lowest music piracy level in the Middle East following acontinuous crackdown against counterfeiters in the country.The latest report published by the London-based InternationalFederation of the Phonographic Industry has said that piracyacross the Middle East is over 20 percent, and much higher in manycases.The exception is the UAE, where sustained anti-piracyactivity has brought the level down to under 10 percent, theDubai-based Khaleej Times quoted the report as saying Saturday.The Music Piracy '98 report said that music piracy level was10-25 percent in Qatar and 25-50 percent in the rest of GulfCooperation Council countries, such as Saudi Arabia, Oman, Kuwaitand Bahrain.Anti-piracy campaigners in Dubai, the second largest emirateof the UAE, said that in the Middle East, music cassette piracywas especially high, but aud

## Imprimir Proximity Scores y Frequency Scores del grafo asociado al Ranking

In [7]:
# Print the first 20 nodes returned by get_proximity_nodes_sorted() for the
# ranking-level graph (presumably ordered by descending proximity score, as
# the captured output suggests -- confirm against VicinityGraph).
for node in ranking.get_graph().get_proximity_nodes_sorted()[:20]:
    print(node)

TERM: software ; FREQUENCY_SCORE: 230.328283 ; PROXIMITY_SCORE: 66.625631 ; CRITERIA: proximity
TERM: anti ; FREQUENCY_SCORE: 49.848484 ; PROXIMITY_SCORE: 43.445075 ; CRITERIA: proximity
TERM: rate ; FREQUENCY_SCORE: 36.590911 ; PROXIMITY_SCORE: 25.219699 ; CRITERIA: proximity
TERM: said ; FREQUENCY_SCORE: 158.742423 ; PROXIMITY_SCORE: 15.861111 ; CRITERIA: proximity
TERM: copyright ; FREQUENCY_SCORE: 62.868686 ; PROXIMITY_SCORE: 13.693814 ; CRITERIA: proximity
TERM: hong ; FREQUENCY_SCORE: 74.212122 ; PROXIMITY_SCORE: 13.348487 ; CRITERIA: proximity
TERM: effort ; FREQUENCY_SCORE: 37.186871 ; PROXIMITY_SCORE: 12.077652 ; CRITERIA: proximity
TERM: china ; FREQUENCY_SCORE: 54.621213 ; PROXIMITY_SCORE: 11.59091 ; CRITERIA: proximity
TERM: level ; FREQUENCY_SCORE: 19.343434 ; PROXIMITY_SCORE: 11.10543 ; CRITERIA: proximity
TERM: xinhua ; FREQUENCY_SCORE: 61.833334 ; PROXIMITY_SCORE: 11.056819 ; CRITERIA: proximity
TERM: case ; FREQUENCY_SCORE: 22.934343 ; PROXIMITY_SCORE: 10.805554 ; CRIT

## Ejecución de modelo Langchain Llama 3.1, para decisión del par término-criterio más eficaz para el re-ranking de resultados

In [None]:
# ---------------------------------------------------------------------------------
# Requisitos:
#   pip install langchain-community langchain-core  # (y, por supuesto, Ollama corriendo)
#   ollama pull llama3.1                            # modelo local
# ---------------------------------------------------------------------------------

from langchain_community.chat_models import ChatOllama
from langchain_core.messages import SystemMessage, HumanMessage
import json, re

# --------------------------------------------------------------------
# 1. Input data (adjust the variables if they already exist in your environment)
# --------------------------------------------------------------------

# Topic description read from the loaded results file
topic: str = results_data["topic"]

# First 20 terms of the ranking-level graph, as returned by
# get_terms_from_proximity_nodes_sorted()
top_sorted_proximity_terms = ranking.get_graph().get_terms_from_proximity_nodes_sorted()[:20]
# Example value:
# top_sorted_proximity_terms = ['software', 'anti', 'rate', 'said', 'copyright', 'hong', 'effort', 'china', 'level', 'xinhua', 
# 'case', 'combat', 'fight', 'percent', 'kong', 'music', 'crackdown', 'cost', 'campaign', 'dollar']

# --------------------------------------------------------------------
# 2. Build the prompt
# --------------------------------------------------------------------
# The proximity criterion is described relative to the actual query (`query`,
# read from results_data["query_terms"]) instead of a hard-coded term, so the
# prompt stays correct when a different results file is loaded.
# Doubled braces ({{ }}) are literal braces inside the f-string; they render
# the JSON template the model is asked to fill in.
prompt = f"""
You are assisting an interactive search system.

CONTEXT
-------
• Current query topic (verbatim): {topic.strip()}

• Candidate re-ranking terms (sorted by descending score): {', '.join(top_sorted_proximity_terms)}

• Available criteria:
  - proximity  → the term should occur near "{query}" in relevant documents
  - frequency  → the term should occur frequently in relevant documents
  - exclusion  → the term should be absent from relevant documents

TASK
----
Pick **one** term from the list and **one** criterion that best match that term
to maximise the efficiency of retrieving relevant results that satisfy the topic.

RESPONSE FORMAT (strict JSON):
{{
  "chosen_term": "<term from list>",
  "chosen_criterion": "<proximity|frequency|exclusion>",
  "justification": "<brief rationale>"
}}

Respond only with the JSON object.
"""

# --------------------------------------------------------------------
# 3. Call the local llama3.1 model
# --------------------------------------------------------------------
chat = ChatOllama(model="llama3.1", temperature=0.1)  # adjust the temperature as preferred
# The whole prompt is sent as a single human message
response_msg = chat.invoke([HumanMessage(content=prompt)])
response_text = response_msg.content.strip()

print("Raw model output:\n", response_text)

# --------------------------------------------------------------------
# 4. Try to parse the JSON returned by the model
# --------------------------------------------------------------------
def safe_parse_json(text: str):
    """Parse the JSON payload contained in a model response.

    The response may be bare JSON, JSON wrapped in a markdown code fence
    (```json ... ``` or ``` ... ```), or JSON surrounded by extra prose.
    Each of these forms is tried in turn and the first one that parses wins.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON value, or a dict with the keys ``"error"`` and
        ``"raw"`` (the untouched input) when no candidate could be parsed.
    """
    # 1) The text as-is (json.loads tolerates surrounding whitespace anyway).
    candidates = [text.strip()]

    # 2) The body of the first markdown code fence, with or without a
    #    "json" language tag.
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.IGNORECASE | re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))

    # 3) Everything between the first "{" and the last "}", which covers
    #    answers such as 'Sure! {...} Hope it helps'.
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return {"error": "No se pudo parsear JSON", "raw": text}

# Parse the raw model answer; on failure `parsed` is the error dict returned
# by safe_parse_json.
parsed = safe_parse_json(response_text)
# Bare expression: shows the parsed value as the notebook cell output.
parsed

# Example of a parsed result (the same value is hard-coded in the next cell):
# parsed = {'chosen_term': 'crackdown',
# 'chosen_criterion': 'proximity',
# 'justification': "A crackdown on piracy is a modern instance of old fashioned piracy, and proximity to the query topic 'piracy' increases the likelihood of relevant results."}

Raw model output:
 {
  "chosen_term": "crackdown",
  "chosen_criterion": "proximity",
  "justification": "A crackdown is a modern instance of taking control, implying a direct action against piracy. Proximity criterion ensures that this term occurs near 'piracy' in relevant documents."
}


{'chosen_term': 'crackdown',
 'chosen_criterion': 'proximity',
 'justification': "A crackdown is a modern instance of taking control, implying a direct action against piracy. Proximity criterion ensures that this term occurs near 'piracy' in relevant documents."}

In [8]:
# Hard-coded term/criterion decision, so the re-ranking cells below can run
# without calling the LLM.
parsed = {
    "chosen_term": "crackdown",
    "chosen_criterion": "proximity",
    "justification": (
        "A crackdown on piracy is a modern instance of old fashioned piracy, "
        "and proximity to the query topic 'piracy' increases the likelihood of relevant results."
    ),
}

## Re-rank the previous results based on the LLM's term-criterion pair decision

In [9]:
from models.srex.vicinity_graph import VicinityGraph, VicinityNode
from utils.vector_utils import VectorUtils

def initialize_visible_graph(original_node: VicinityNode, chosen_criterion: str) -> VicinityGraph:
    """Create a one-node graph holding a copy of ``original_node``.

    Args:
        original_node: Node whose term, proximity score and frequency score
            are copied into the new node.
        chosen_criterion: Value stored as the ``criteria`` of the new node.

    Returns:
        A new ``VicinityGraph`` (created with ``subquery="new"``) containing
        only the copied node.
    """
    graph = VicinityGraph(subquery="new")
    # Same term and scores as the original node; only the criterion is
    # replaced by the chosen one.
    seed_node = VicinityNode(
        term=original_node.term,
        proximity_score=original_node.proximity_score,
        frequency_score=original_node.frequency_score,
        criteria=chosen_criterion,
    )
    graph.add_node(seed_node)
    return graph

def get_doc_weight_graph_excluded_tuple_list(ranking: Ranking, excluded_vicinity_terms: list[str]) -> list[tuple[float, VicinityGraph, bool]]:
    """Collect weight, graph and exclusion flag for every document of a ranking.

    Args:
        ranking: Object exposing ``documents``; each document provides
            ``weight`` and ``get_graph()``.
        excluded_vicinity_terms: Terms that mark a document as excluded.

    Returns:
        One ``(weight, graph, is_excluded)`` tuple per document, in the order
        of ``ranking.documents``. ``is_excluded`` is True when at least one
        term of the document graph is among ``excluded_vicinity_terms``.
    """
    # Build the lookup set once: membership tests become O(1) instead of a
    # list scan repeated for every term of every document.
    excluded_terms = frozenset(excluded_vicinity_terms)

    document_weight_graph_excluded_tuple_list = []
    for document in ranking.documents:
        document_graph = document.get_graph()
        document_is_excluded: bool = not excluded_terms.isdisjoint(document_graph.get_terms_from_all_nodes())
        document_weight_graph_excluded_tuple_list.append((document.weight, document_graph, document_is_excluded))
    return document_weight_graph_excluded_tuple_list

def calculate_similarity_scores(document_weight_graph_tuple_list: list[tuple[float, VicinityGraph, bool]], visible_graph: VicinityGraph ) -> list[float]:
    """Score every document against ``visible_graph``.

    Args:
        document_weight_graph_tuple_list: ``(weight, graph, is_excluded)``
            tuples, one per document.
        visible_graph: Graph used as the base of the comparison; its
            ``get_similarity_score_as_base_graph`` is called once per
            non-excluded document.

    Returns:
        One score per input tuple, in the same order:

        * excluded document:      ``weight * 1e-10``
        * non-excluded document:  ``1.0 + weight * 1e-10 + weight * similarity``

        Presumably the ``1.0`` offset is what keeps non-excluded documents
        above excluded ones (this holds while weights and similarities are
        non-negative and ``weight * 1e-10`` stays below 1) -- confirm.
    """
    weight_epsilon = 0.0000000001  # factor applied to the document weight alone

    def score_document(doc_weight, doc_graph, doc_is_excluded):
        if doc_is_excluded:
            # Excluded documents get only the weight-scaled epsilon term.
            return doc_weight * weight_epsilon
        graph_similarity = visible_graph.get_similarity_score_as_base_graph(doc_graph)
        return 1.0 + (doc_weight * weight_epsilon) + (doc_weight * graph_similarity)

    return [
        score_document(doc_weight, doc_graph, doc_is_excluded)
        for doc_weight, doc_graph, doc_is_excluded in document_weight_graph_tuple_list
    ]

def calculate_new_ranking_positions(similarity_scores) -> list[int]:
    """Return the 1-based rank of each score when sorted in descending order.

    The result is aligned with the input: element ``i`` is the new position
    of the item originally at index ``i``. Equal scores keep their original
    relative order (the sort is stable), so the earlier item gets the better
    rank.
    """
    scores = list(similarity_scores)
    # Original (0-based) indices ordered from the best to the worst score.
    best_first = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    new_rank_positions = [0] * len(scores)
    for new_position, original_index in enumerate(best_first, 1):
        new_rank_positions[original_index] = new_position
    return new_rank_positions


# Term and criterion selected by the LLM (or hard-coded in the previous cell)
chosen_term: str = parsed['chosen_term']
chosen_criterion: str = parsed['chosen_criterion']

# Get the chosen term node from the original ranking graph
# NOTE(review): what get_node_by_term returns for an unknown term is not
# visible here; the node's attributes are read right below -- confirm.
original_node = ranking.get_graph().get_node_by_term(chosen_term)

# Initialize the visible graph: a single node copying the chosen term and its
# scores, tagged with the chosen criterion
visible_graph = initialize_visible_graph(original_node, chosen_criterion)

# Get the vicinity terms with 'exclusion' criteria from the visible graph
excluded_vicinity_terms: list[str] = visible_graph.get_terms_from_exclusion_nodes()

# Build one (weight, graph, is_excluded) tuple for each document of the ranking
document_weight_graph_excluded_tuple_list = get_doc_weight_graph_excluded_tuple_list(ranking, excluded_vicinity_terms)

# Create similarity scores list (one score per document, in ranking order)
similarity_scores: list[float] = calculate_similarity_scores(document_weight_graph_excluded_tuple_list, visible_graph)

# Calculate new ranking positions list (1-based, aligned with the documents)
new_rank_positions = calculate_new_ranking_positions(similarity_scores)

# Reassign the "score" and "rank" values of each result from the JSON file.
# NOTE(review): results are matched to documents purely by position, so this
# assumes results_data["results"] has the same order as ranking.documents and
# no more entries than it -- confirm.
for idx, result in enumerate(results_data["results"]):
    result["rank"] = new_rank_positions[idx]
    result["score"] = similarity_scores[idx]

# Sort the results in place by their new rank (ascending)
results_data["results"].sort(key=lambda r: r["rank"])


In [10]:
# Write the results dict (with the updated "rank"/"score" values, sorted by
# rank in the previous cell) to a new JSON file.
DataUtils.write_dict_to_json(results_data, "example_rerank_sim-367-lmsingletermsmartuser.json")