# Testing GraphSearch Distance Concepts

## Load Libraries

In [1]:
import numpy as np
import os
import sys
import json
# Make the parent directory importable so that the project-local imports
# that follow (models.*, utils.*) can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from models.srex.ranking import Ranking
from utils.data_utils import DataUtils

## Initialize some variables

In [2]:
# Load the stop words; they are passed to the Ranking constructor further down
stop_words_list = DataUtils.load_stopwords()
# File-name prefixes: the input prefix names the results file read in this cell,
# the output prefix names the re-ranked file written in the last cell
simulation_output_config_filename = 'simulation_rerank_first_query'
simulation_input_config_filename = 'simulation_first_query'
# Topic and user identifiers; both are embedded in the input and output file names
topic_id = '367'
user_id = 'trec_llm_llama31_result_user'
results_filename = f"{simulation_input_config_filename}-{topic_id}-{user_id}.json"

# Parsed contents of the results file; later cells read its "query_terms", "topic" and "results" keys
results_data: dict = DataUtils.load_json_file(results_filename)

In [3]:
# Join the whitespace-separated query terms with " AND " to form the query text
query                    = " AND ".join(results_data["query_terms"].split())
# Options passed to the Ranking constructor:
# nr_search_results, ranking_weight_type, lema (lemmatization) and stem (stemming)
nr_search_results        = 100
ranking_weight_type      = 'linear' # it can be: 'none', 'linear' or 'inverse'
lema                     = True
stem                     = False
# Options passed to ranking.generate_all_graphs(), in the order
# (nr_of_graph_terms, limit_distance, include_query_terms, summarize).
# NOTE(review): their exact semantics are defined by Ranking.generate_all_graphs - confirm there
summarize                = 'mean'   # it can be: 'mean' or 'median'
nr_of_graph_terms        = 5
limit_distance           = 4 
include_query_terms      = False

## Calculate Ranking data structure

In [4]:
# Extract the list of articles from the loaded results
articles_list = DataUtils.extract_articles_from_json_results(results_data)
# Debug aid: uncomment to inspect the extracted articles
#print(json.dumps(articles_list, indent=2))

# Create the Ranking for the query with the configured weighting and text-preprocessing
# options, then feed it the extracted articles.
# NOTE(review): what build_article_dictionaries_list stores on the ranking is not visible here;
# later cells rely on ranking.documents being available afterwards - confirm.
ranking = Ranking(query_text=query, nr_search_results=nr_search_results, ranking_weight_type=ranking_weight_type, 
                  stop_words=stop_words_list, lemmatization=lema, stemming=stem)
ranking.build_article_dictionaries_list(articles_list)

## Generate all graphs from ranking, documents and sentences

In [5]:
# Generate the graphs with the options configured above; later cells read them through
# ranking.get_graph() and document.get_graph() (presumably populated by this call - confirm)
ranking.generate_all_graphs(nr_of_graph_terms, limit_distance, include_query_terms, summarize)

## Print the raw text and the preprocessed text of each document

In [6]:
# Dump every document of the ranking: its id, its title field (labelled "raw text")
# and the preprocessed text of each of its sentences, followed by a blank line.
for document in ranking.documents:
    print(f"Document {document.doc_id}")
    print(f"raw text: {document.title}")
    for sentence in document.sentences:
        print(f"preprocessed text: {sentence.preprocessed_text}")
    print()

Document XIE19990612.0182
raw text: 1999-06-12 06:19   UAE Music Piracy Level Lowest in Middle East  The United Arab Emirates (UAE) has the lowest music piracy level in the Middle East following a continuous crackdown against counterfeiters in the country. The latest report published by the London-based International Federation of the Phonographic Industry has said that piracy across the Middle East is over 20 percent, and much higher in many cases. The exception is the UAE, where sustained anti-piracy activity has brought the level down to under 10 percent, the Dubai-based Khaleej Times quoted the report as saying Saturday. The Music Piracy '98 report said that music piracy level was 10-25 percent in Qatar and 25-50 percent in the rest of Gulf Cooperation Council countries, such as Saudi Arabia, Oman, Kuwait and Bahrain. Anti-piracy campaigners in Dubai, the second largest emirate of the UAE, said that in the Middle East, music cassette piracy was especially high, but audio-piracy rat

## Print the proximity scores and frequency scores of the graph associated with the ranking

In [7]:
# Print one line per node of the ranking graph, in the order returned by
# get_proximity_nodes_sorted().
proximity_nodes_sorted = ranking.get_graph().get_proximity_nodes_sorted()
for proximity_node in proximity_nodes_sorted:
    print(proximity_node)

TERM: software ; FREQUENCY_SCORE: 284.651514 ; PROXIMITY_SCORE: 85.560605 ; CRITERIA: proximity
TERM: anti ; FREQUENCY_SCORE: 56.661616 ; PROXIMITY_SCORE: 56.271466 ; CRITERIA: proximity
TERM: rate ; FREQUENCY_SCORE: 41.838385 ; PROXIMITY_SCORE: 30.440027 ; CRITERIA: proximity
TERM: copyright ; FREQUENCY_SCORE: 78.065657 ; PROXIMITY_SCORE: 22.920454 ; CRITERIA: proximity
TERM: china ; FREQUENCY_SCORE: 66.550505 ; PROXIMITY_SCORE: 16.151516 ; CRITERIA: proximity
TERM: effort ; FREQUENCY_SCORE: 42.702022 ; PROXIMITY_SCORE: 15.650253 ; CRITERIA: proximity
TERM: combat ; FREQUENCY_SCORE: 19.631312 ; PROXIMITY_SCORE: 14.76578 ; CRITERIA: proximity
TERM: hong ; FREQUENCY_SCORE: 69.333333 ; PROXIMITY_SCORE: 13.798612 ; CRITERIA: proximity
TERM: fight ; FREQUENCY_SCORE: 24.227271 ; PROXIMITY_SCORE: 13.693182 ; CRITERIA: proximity
TERM: level ; FREQUENCY_SCORE: 24.308082 ; PROXIMITY_SCORE: 13.570076 ; CRITERIA: proximity
TERM: case ; FREQUENCY_SCORE: 26.111109 ; PROXIMITY_SCORE: 13.31313 ; CRIT

## Run the Llama 3.1 model (via LangChain) to choose the most effective term-criterion pairs for re-ranking the results

In [None]:
from langchain_community.chat_models import ChatOllama
from langchain_core.messages import SystemMessage, HumanMessage
import json, re


# 1. Input data (adjust the variables if they already exist in your environment)

topic: str = results_data["topic"]
# Keep at most the first 100 terms returned by get_terms_from_proximity_nodes_sorted()
top_sorted_proximity_terms = ranking.get_graph().get_terms_from_proximity_nodes_sorted()[:100]
# print(top_sorted_proximity_terms) -> ['software', 'anti', 'rate', 'said', 'copyright', 'hong', 'effort', 'china', 'level', 'xinhua', 
# 'case', 'combat', 'fight', 'percent', 'kong', 'music', 'crackdown', 'cost', 'campaign', 'dollar', ...]


# 2. Build the prompt
#
# Two details are kept consistent on purpose:
#   - the announced number of candidates is the real length of the list (the graph
#     may hold fewer than 100 terms), and
#   - the response format only allows terms from the candidate list, matching task
#     rule 5 (terms that are not in the ranking graph are dropped after parsing).

prompt = f"""
You are assisting an interactive search system.

CONTEXT
-------
• Current query topic (verbatim): 
---- START OF TOPIC ----
{topic.strip()}
---- END OF TOPIC ----

• {len(top_sorted_proximity_terms)} candidate re-ranking terms (sorted by descending score, the first ones have a bigger impact on re-ranking.): [{', '.join(top_sorted_proximity_terms)}]

• Available criteria:
  - proximity  → the term should occur near a query term in relevant documents
  - frequency  → the term should occur frequently in relevant documents
  - exclusion  → the term should be absent from relevant documents

TASK
----

1. Select **between 3 and 5** terms from the candidate re-ranking terms list.
2. Assign to each term one of the three criteria above that best maximizes re-ranking precision and recall for the topic.
3. **At least one** of the terms must use the **exclusion** criterion, based on the topic specifications.
4. **At least one** of the terms must use **either** the **proximity** **or** the **frequency** criterion.
5. Each term **MUST** belong to the list of candidates mentioned above
6. You must return **between 3 and 5** JSON objects, **inside a JSON list**. 

RESPONSE FORMAT (strict JSON list):
[
  {{
    "chosen_term": "<term from candidate re-ranking terms list>",
    "chosen_criterion": "<proximity|frequency|exclusion>",
    "justification": "<brief rationale>"
  }},
  {{ ... }}   # repeat until you get 3-5 objects
]

Respond **only** with the JSON list, no extra text.
"""


# 3. Call the local llama3.1 model

# Low temperature to reduce sampling randomness; adjust it as you prefer.
chat = ChatOllama(model="llama3.1", temperature=0.1)
messages = [HumanMessage(content=prompt)]
response_msg = chat.invoke(messages)
response_text = response_msg.content.strip()

print("Raw model output:\n", response_text)


# 4. Try to parse the JSON returned by the model

def safe_parse_json(text: str):
    """Parse the JSON payload contained in a model answer.

    The model often wraps its answer in a markdown fence (```json ... ```) and
    sometimes adds prose before or after it. Parsing is attempted in this order:

    1. the content of the first markdown fence, if there is one, otherwise the
       whole text with surrounding whitespace removed;
    2. the first JSON array/object that can be decoded starting at a ``[`` or
       ``{`` inside that candidate (tolerates leading/trailing prose).

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON value, or the dict
        ``{"error": "No se pudo parsear JSON", "raw": text}`` when nothing could
        be decoded. Callers must check for that dict before iterating.
    """
    stripped = text.strip()

    # Prefer the fenced block; the language tag after the backticks is optional.
    fence_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", stripped, flags=re.IGNORECASE)
    candidate = fence_match.group(1).strip() if fence_match else stripped

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass

    # Fallback: decode the first array/object embedded in the surrounding prose.
    decoder = json.JSONDecoder()
    for start, char in enumerate(candidate):
        if char in "[{":
            try:
                value, _end = decoder.raw_decode(candidate, start)
                return value
            except json.JSONDecodeError:
                continue

    return {"error": "No se pudo parsear JSON", "raw": text}

# safe_parse_json returns a dict (not a list) when parsing fails, and the model may also
# answer with a single JSON object. Normalise every case to a list of well-formed dicts,
# so that neither the loop below nor the re-ranking cell ends up iterating over dict keys
# or indexing a missing "chosen_term" / "chosen_criterion".
parsed_response = safe_parse_json(response_text)
if isinstance(parsed_response, dict):
    # A lone object is kept only if it looks like a term choice; the error dict is dropped.
    parsed_response = [parsed_response] if "chosen_term" in parsed_response else []
elif not isinstance(parsed_response, list):
    parsed_response = []

parsed_chosen_terms_list: list[dict[str, str]] = [
    term_dict for term_dict in parsed_response
    if isinstance(term_dict, dict) and "chosen_term" in term_dict and "chosen_criterion" in term_dict
]
if not parsed_chosen_terms_list:
    print("Warning: the model answer contains no usable term-criterion pairs.")

# Keep only the choices whose term exists as a node in the ranking graph.
# Note: the re-ranking cell consumes parsed_chosen_terms_list and repeats this
# existence check itself; this list is what gets displayed as the cell result.
ranking_graph = ranking.get_graph()
validated_parsed_chosen_terms_list = [
    term_dict for term_dict in parsed_chosen_terms_list
    if ranking_graph.get_node_by_term(term_dict["chosen_term"]) is not None
]

validated_parsed_chosen_terms_list

Raw model output:
 [
  {
    "chosen_term": "combat",
    "chosen_criterion": "frequency",
    "justification": "Frequent mention of combat in relevant documents related to maritime piracy"
  },
  {
    "chosen_term": "sea",
    "chosen_criterion": "proximity",
    "justification": "Proximity to query term 'piracy' and relevance to maritime context"
  },
  {
    "chosen_term": "software",
    "chosen_criterion": "exclusion",
    "justification": "Exclusion criterion to filter out non-relevant documents discussing software piracy"
  }
]


[{'chosen_term': 'combat',
  'chosen_criterion': 'frequency',
  'justification': 'Frequent mention of combat in relevant documents related to maritime piracy'},
 {'chosen_term': 'sea',
  'chosen_criterion': 'proximity',
  'justification': "Proximity to query term 'piracy' and relevance to maritime context"},
 {'chosen_term': 'software',
  'chosen_criterion': 'exclusion',
  'justification': 'Exclusion criterion to filter out non-relevant documents discussing software piracy'}]

In [None]:
# Manual override examples: hand-written replacements for `parsed_chosen_terms_list`.
# Everything in this cell is commented out, so running it has no effect.
# NOTE(review): the second example assigns a bare dict instead of a list of dicts; the
# re-ranking code iterates over `parsed_chosen_terms_list` and indexes each item with
# "chosen_term", so that example must be wrapped in a list before it is enabled.
# parsed_chosen_terms_list = [
#     {
#     'chosen_term': 'maritime',
#     'chosen_criterion': 'frequency',
#     'justification': "Software is not related to maritime piracy."
#     }
# ]

#parsed_chosen_terms_list = {'chosen_term': 'crackdown',
#'chosen_criterion': 'proximity',
#'justification': "A crackdown on piracy is a modern instance of old fashioned piracy, and proximity to the query topic 'piracy' increases the likelihood of relevant results."}

['software',
 'anti',
 'rate',
 'copyright',
 'hong',
 'effort',
 'china',
 'level',
 'xinhua',
 'case',
 'combat',
 'percent',
 'fight',
 'kong',
 'music',
 'cost',
 'crackdown',
 'dollar',
 'campaign',
 'worldwide',
 'law',
 'operation',
 'activity',
 'beijing',
 'microsoft',
 'first',
 'government',
 'cd',
 'crack',
 'country',
 'rampant',
 'loss',
 'billion',
 'macao',
 'internet',
 'fighting',
 'problem',
 'computer',
 'action',
 'digital',
 'million',
 'year',
 'audio',
 'launched',
 'new',
 'report',
 'film',
 'industry',
 'continues',
 'combating',
 'global',
 'stamp',
 'high',
 'january',
 'estimate',
 'group',
 'africa',
 'video',
 'illegal',
 'last',
 'hit',
 'cracking',
 'measure',
 'may',
 'growing',
 'still',
 'lost',
 'enforcement',
 'estimated',
 'state',
 'asia',
 'charge',
 'major',
 'step',
 'company',
 'win',
 'alliance',
 'challenge',
 'syndicate',
 'highest',
 'issue',
 'wednesday',
 'business',
 'pirated',
 'system',
 'technology',
 'strong',
 'curb',
 'internati

## Re-rank the previous results, based on LLM term-criteria pair decision

In [None]:
from backend.app.models.srex.term_graph import TermGraph, TGNode
from utils.vector_utils import VectorUtils

def initialize_visible_graph(parsed_chosen_terms_list: list[dict[str, str]]) -> TermGraph:
    """Build the graph of chosen terms used as the base graph for re-ranking.

    For every entry whose "chosen_term" exists as a node in the ranking graph
    (module-level `ranking`), a new node is added that copies the term and its
    proximity/frequency scores from that node and takes its criteria from the
    entry's "chosen_criterion".

    The input comes from a model answer, so malformed data is skipped instead
    of raising: a non-list input (for example the error dict returned when JSON
    parsing failed), entries that are not dicts, and entries missing either
    key. A single well-formed dict is treated as a one-element list.

    Args:
        parsed_chosen_terms_list: Dicts with "chosen_term" and "chosen_criterion" keys.

    Returns:
        A TermGraph created with subquery="new"; it has no nodes when nothing
        usable was supplied.
    """
    visible_graph = TermGraph(subquery="new")

    if isinstance(parsed_chosen_terms_list, dict):
        # Either a lone term choice or the parse-error dict (which has no "chosen_term").
        candidates = [parsed_chosen_terms_list] if "chosen_term" in parsed_chosen_terms_list else []
    elif isinstance(parsed_chosen_terms_list, (list, tuple)):
        candidates = parsed_chosen_terms_list
    else:
        candidates = []

    for term in candidates:
        if not isinstance(term, dict):
            continue
        chosen_term = term.get("chosen_term")
        chosen_criterion = term.get("chosen_criterion")
        if chosen_term is None or chosen_criterion is None:
            continue

        # Get the chosen term node from the original ranking graph
        original_node = ranking.get_graph().get_node_by_term(chosen_term)
        if original_node is not None:   # Check if the node exists
            visible_graph.add_node(TGNode(
                term=original_node.term,
                proximity_score=original_node.proximity_score,
                frequency_score=original_node.frequency_score,
                criteria=chosen_criterion)
            )
    return visible_graph

def get_doc_weight_graph_excluded_tuple_list(ranking: Ranking, excluded_vicinity_terms: list[str]) -> list[tuple[float, TermGraph, bool]]:
    """Describe every document of the ranking as a (weight, graph, excluded) tuple.

    A document is flagged as excluded when at least one term of its graph
    (as returned by get_terms_from_all_nodes()) appears in `excluded_vicinity_terms`.

    Args:
        ranking: Object exposing the documents through its `documents` attribute.
        excluded_vicinity_terms: Terms whose presence excludes a document.

    Returns:
        One tuple per document, in the order of `ranking.documents`.
    """
    def _describe(document):
        graph = document.get_graph()
        is_excluded = False
        for graph_term in graph.get_terms_from_all_nodes():
            if graph_term in excluded_vicinity_terms:
                is_excluded = True
                break   # one hit is enough
        return (document.weight, graph, is_excluded)

    return [_describe(document) for document in ranking.documents]

def calculate_similarity_scores(document_weight_graph_tuple_list: list[tuple[float, TermGraph, bool]], visible_graph: TermGraph ) -> list[float]:
    """Score every document against the visible graph.

    Non-excluded documents get
        1.0 + (doc_weight * 10^-10) + (doc_weight * similarity)
    where `similarity` is visible_graph.get_similarity_score_as_base_graph(doc_graph).
    Excluded documents get only the tie-breaking part, doc_weight * 10^-10, and
    their graph is never compared. That keeps them below every non-excluded
    document as long as weights and similarities are non-negative and weights
    are far below 10^10 (assumed, not checked here).

    Args:
        document_weight_graph_tuple_list: (weight, graph, excluded) tuples, one per document.
        visible_graph: Graph used as the base of the similarity comparison.

    Returns:
        The scores, in the same order as the input list.
    """
    def _score(doc_weight, doc_graph, doc_is_excluded):
        if doc_is_excluded:
            return doc_weight * 0.0000000001
        return 1.0 + (doc_weight * 0.0000000001) + (doc_weight * visible_graph.get_similarity_score_as_base_graph(doc_graph))

    return [
        _score(doc_weight, doc_graph, doc_is_excluded)
        for doc_weight, doc_graph, doc_is_excluded in document_weight_graph_tuple_list
    ]

def calculate_new_ranking_positions(similarity_scores) -> list[int]:
    """Convert scores into 1-based rank positions.

    The element at index i of the result is the new position (1 = best) of the
    item that originally sat at index i. Higher scores rank first; items with
    equal scores keep their original relative order.

    Args:
        similarity_scores: Iterable of scores, in original ranking order.

    Returns:
        New 1-based positions, in original ranking order.
    """
    scores = list(similarity_scores)
    # Original (0-based) indices from best to worst score; sorted() is stable,
    # also with reverse=True, so ties stay in original order.
    best_first = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    new_rank_positions = [0] * len(scores)
    for new_position, original_index in enumerate(best_first, 1):
        new_rank_positions[original_index] = new_position
    return new_rank_positions



# Re-ranking driver: builds the visible graph from the chosen terms, scores every
# document of the ranking against it and rewrites "rank"/"score" in results_data.
#
# NOTE(review): scores and positions are computed from ranking.documents but assigned
# to results_data["results"] by list index, so both lists are assumed to have the same
# order and results_data["results"] must not be longer than ranking.documents - TODO confirm.
# NOTE(review): results_data["results"] is sorted in place at the end, so running this
# cell a second time without reloading results_data would pair the positions with an
# already re-ordered list.

# Initialize the visible graph
visible_graph = initialize_visible_graph(parsed_chosen_terms_list)

if len(visible_graph.get_all_nodes_sorted()) > 0:
    # Get the vicinity terms with 'exclusion' criteria from the user graph
    excluded_vicinity_terms: list[str] = visible_graph.get_terms_from_exclusion_nodes()

    # Build one (weight, graph, is_excluded) tuple per document of the ranking
    document_weight_graph_excluded_tuple_list = get_doc_weight_graph_excluded_tuple_list(ranking, excluded_vicinity_terms)

    # Create similarity scores list
    similarity_scores: list[float] = calculate_similarity_scores(document_weight_graph_excluded_tuple_list, visible_graph)

    # Calculate new ranking positions list
    new_rank_positions = calculate_new_ranking_positions(similarity_scores)

    # Reassign "score" and "rank" values to each result from the JSON file
    for idx, result in enumerate(results_data["results"]):
        result["rank"] = new_rank_positions[idx]
        result["score"] = similarity_scores[idx]

    # Sort each result by its rank on the results dict
    results_data["results"].sort(key=lambda r: r["rank"])
    print("Re-ranked results.")

else:
    # No chosen term matched a node of the ranking graph: results_data is left untouched
    print("Warning: The visible graph has no nodes.")

Re-ranked results.


## Export JSON to data/ directory

In [10]:
# Output file name: same "<prefix>-<topic_id>-<user_id>.json" pattern as the input file,
# using the output prefix
reranked_json_results_filename = f"{simulation_output_config_filename}-{topic_id}-{user_id}.json"
# Write results_data (re-ranked in place by the previous cell when the visible graph had nodes).
# NOTE(review): the target directory is decided inside DataUtils.write_dict_to_json - presumably data/, confirm
DataUtils.write_dict_to_json(results_data, reranked_json_results_filename)