In [26]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
import nest_asyncio, os
from dotenv import find_dotenv, load_dotenv
from llama_index.core.evaluation import FaithfulnessEvaluator, AnswerRelevancyEvaluator

#### Read Documents

In [2]:
# Folder that SimpleDirectoryReader scans for the source documents (relative path).
INPUT_DIR = "data/"

In [3]:
# Read everything under INPUT_DIR into a list of Document objects.
reader = SimpleDirectoryReader(
    input_dir=INPUT_DIR,
)
documents = reader.load_data()

In [4]:
# Sanity check: container type, number of loaded documents, and their repr.
# NOTE(review): the last element prints every Document in full, which makes
# the cell output very long.
type(documents), len(documents), documents

(list,
 10,
 [Document(id_='7bd775d9-9277-4e61-bb67-e7992afddcea', embedding=None, metadata={'page_label': '1', 'file_name': 'fake_news_detection_spanish_DL.pdf', 'file_path': 'c:\\Users\\kevin\\Documents\\Agents_MCP_dojo\\llamaindex\\data\\fake_news_detection_spanish_DL.pdf', 'file_type': 'application/pdf', 'file_size': 247472, 'creation_date': '2025-09-08', 'last_modified_date': '2025-09-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='FAKE NEWS DETECTION IN SPANISH USING DEEP LEARNING\nTECHNIQUES\nKevin Martínez-Gallego\nIntelligent Information Systems Lab\nUniversidad de Antioquia\nCalle 67 No. 53 - 108, 050010, Medellín, Colombia

#### Splitting, Indexing and Storing

In [5]:
# On-disk Chroma client; the "nlp_papers" collection is reused if it already exists.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("nlp_papers")
# Wrap the collection so LlamaIndex can use it as a vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

2025-09-07 21:21:57,408 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [None]:
# create the pipeline with transformations
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_overlap=0),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ],
    vector_store=vector_store
)

nodes = await pipeline.arun(documents=documents)

2025-09-07 21:22:15,591 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-09-07 21:22:17,879 - INFO - 1 prompt is loaded, with the key: query


In [7]:
# Sanity check: container type, number of chunks produced, and the nodes themselves.
# NOTE(review): each node repr includes its full embedding vector, so this
# output is very long.
type(nodes), len(nodes), nodes

(list,
 13,
 [TextNode(id_='5ca009f8-a815-4e1d-9117-cb73b2d0e902', embedding=[-0.03555683046579361, 0.00716931838542223, -0.02077505737543106, 0.056294530630111694, 0.08981095254421234, 0.0013609180459752679, 0.007944142445921898, 0.005815392360091209, 0.0328463576734066, 0.0034827161580324173, 0.020707139745354652, -0.014285522513091564, -0.027410371229052544, -0.00921441800892353, 0.04423157870769501, 0.021918274462223053, 0.021748002618551254, -0.07639149576425552, 0.020983949303627014, -0.02315150387585163, 0.06920871138572693, 0.006815837696194649, 0.01937733218073845, 0.02336173504590988, 0.003909441642463207, -0.0072477892972528934, 0.007037547416985035, 0.005949437152594328, -0.04405679553747177, -0.19036178290843964, 0.02314697951078415, -0.016400987282395363, 0.05768774077296257, -0.0362810380756855, -0.002133144298568368, 0.01096255797892809, -0.03683725371956825, -0.04325534403324127, -0.05004477873444557, 0.015324688516557217, -0.011126919649541378, -0.017206808552145958, 

In [8]:
# Length of the first node's embedding vector, i.e. the embedding dimension.
len(nodes[0].embedding)

384

#### Storing and Indexing

#### Querying

In [9]:
# Queries must be embedded with the same model that embedded the chunks at
# ingestion time; otherwise query and chunk vectors are not comparable.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Build the index on top of the existing vector store.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)

2025-09-07 21:22:50,090 - INFO - Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2025-09-07 21:22:52,362 - INFO - 1 prompt is loaded, with the key: query


In [13]:
# Load variables from the located .env file, then read the Hugging Face token.
load_dotenv(find_dotenv())
hf_token = os.environ.get("HF_TOKEN")  # None when the variable is not set

In [14]:
# Patch asyncio so nested event loops are allowed; this is needed to run the
# query engine from inside the notebook.
nest_asyncio.apply()

In [15]:
# Remote LLM accessed through the Hugging Face Inference API.
llm = HuggingFaceInferenceAPI(
    model_name="Qwen/Qwen2.5-Coder-32B-Instruct",
    provider="auto",
    token=hf_token,
    max_tokens=1000,
    temperature=0.7,
)

In [16]:
# Query engine over the index; answers are produced with the "tree_summarize" response mode.
query_engine = index.as_query_engine(response_mode="tree_summarize", llm=llm)

In [18]:
# Smoke test: send a trivial message through the query engine and show the full Response.
response = query_engine.query("Hello!")
response

Response(response='Hello! How can I assist you today based on the provided information?', source_nodes=[NodeWithScore(node=TextNode(id_='94b2caa9-bf7a-474b-8c6c-34d6641ef012', embedding=None, metadata={'page_label': '10', 'file_name': 'fake_news_detection_spanish_DL.pdf', 'file_path': 'c:\\Users\\kevin\\Documents\\Agents_MCP_dojo\\llamaindex\\data\\fake_news_detection_spanish_DL.pdf', 'file_type': 'application/pdf', 'file_size': 247472, 'creation_date': '2025-09-08', 'last_modified_date': '2025-09-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a2db6b71-67ad-45d1-ba9f-d797826c194b', node_type='4', metadata={'page_label': '10', 'file_name': 'fake_news_detection_spanish_DL.pdf', 'file_path': 'c:\\Users\

In [37]:
# Ask a real question about the indexed paper. `query` is kept in its own
# variable because the relevancy evaluation needs it again.
query = "What is the main contribution of Kevin Martinez to NLP?"
response = query_engine.query(query)
response

Response(response="Kevin Martinez's main contribution to NLP, as detailed in the provided information, involves developing and experimenting with various models for detecting fake news in Spanish. His work includes preprocessing text data through normalization, stop-word removal, stemming, tokenization, and padding. Additionally, he compared different text representation techniques such as Bag of Words, TF-IDF, and word embeddings. Martinez also implemented and evaluated both classical machine learning models (SVM, Random Forest, Gradient Boosting Tree, and MLP) and deep learning models (LSTM-RNN and CNN) for this task. His research includes using both trainable and fixed embedding layers, including a Transfer Learning approach with pre-trained GloVe embeddings. The experiments were conducted on Spanish datasets and also involved validating models with a translated English dataset.", source_nodes=[NodeWithScore(node=TextNode(id_='c7da3fe9-da44-4066-9869-ee57e9b1a4aa', embedding=None, m

In [38]:
# Generated answer text only.
response.response

"Kevin Martinez's main contribution to NLP, as detailed in the provided information, involves developing and experimenting with various models for detecting fake news in Spanish. His work includes preprocessing text data through normalization, stop-word removal, stemming, tokenization, and padding. Additionally, he compared different text representation techniques such as Bag of Words, TF-IDF, and word embeddings. Martinez also implemented and evaluated both classical machine learning models (SVM, Random Forest, Gradient Boosting Tree, and MLP) and deep learning models (LSTM-RNN and CNN) for this task. His research includes using both trainable and fixed embedding layers, including a Transfer Learning approach with pre-trained GloVe embeddings. The experiments were conducted on Spanish datasets and also involved validating models with a translated English dataset."

In [39]:
# Metadata of the retrieved source chunks, keyed by node id.
response.metadata

{'c7da3fe9-da44-4066-9869-ee57e9b1a4aa': {'page_label': '3',
  'file_name': 'fake_news_detection_spanish_DL.pdf',
  'file_path': 'c:\\Users\\kevin\\Documents\\Agents_MCP_dojo\\llamaindex\\data\\fake_news_detection_spanish_DL.pdf',
  'file_type': 'application/pdf',
  'file_size': 247472,
  'creation_date': '2025-09-08',
  'last_modified_date': '2025-09-08'},
 'a6e03e5f-90b9-4634-8c3f-228020d9629e': {'page_label': '6',
  'file_name': 'fake_news_detection_spanish_DL.pdf',
  'file_path': 'c:\\Users\\kevin\\Documents\\Agents_MCP_dojo\\llamaindex\\data\\fake_news_detection_spanish_DL.pdf',
  'file_type': 'application/pdf',
  'file_size': 247472,
  'creation_date': '2025-09-08',
  'last_modified_date': '2025-09-08'}}

In [40]:
# Retrieved chunks (NodeWithScore objects) attached to the response.
response.source_nodes

[NodeWithScore(node=TextNode(id_='c7da3fe9-da44-4066-9869-ee57e9b1a4aa', embedding=None, metadata={'page_label': '3', 'file_name': 'fake_news_detection_spanish_DL.pdf', 'file_path': 'c:\\Users\\kevin\\Documents\\Agents_MCP_dojo\\llamaindex\\data\\fake_news_detection_spanish_DL.pdf', 'file_type': 'application/pdf', 'file_size': 247472, 'creation_date': '2025-09-08', 'last_modified_date': '2025-09-08'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b51881e0-a7b5-40eb-be92-7d99bde1a4e7', node_type='4', metadata={'page_label': '3', 'file_name': 'fake_news_detection_spanish_DL.pdf', 'file_path': 'c:\\Users\\kevin\\Documents\\Agents_MCP_dojo\\llamaindex\\data\\fake_news_detection_spanish_DL.pdf', 'file_type': '

#### Evaluation

In [27]:
# LLM-based evaluators for the generated answer.
# NOTE(review): both reuse the same `llm` that produced the answer, which was
# configured with temperature=0.7, so verdicts may vary between runs —
# consider a dedicated judge model; confirm.
faithfulness_evaluator = FaithfulnessEvaluator(llm=llm)
relevancy_evaluator = AnswerRelevancyEvaluator(llm=llm)

In [30]:
# Run the faithfulness evaluator on the most recent `response`.
# No query is passed here, so the result carries query=None.
faithfulness_eval = faithfulness_evaluator.evaluate_response(response=response)
faithfulness_eval

EvaluationResult(query=None, contexts=['3 Methods\n3.1 Preprocessing steps\nIn order to obtain consistent results, a data standardization process known as Text Normalization was performed, which,\nin addition to eliminating non-alphanumeric characters in the text, includes some of the most commonly used techniques\nin NLP:\n• Stop Words:we removed words there is an agreement they do not contribute to the models learning process in\nthe context of the problem addressed; for instance, articles and prepositions.\n• Stemming: this technique was used to reduce words to their root.\n• Tokenization and Padding:as usual in text processing tasks, we performed tokenization and padding, when\nrequired, for words and sentences representation.\nSubsequently, we decided to compare some of the most common techniques regarding text representation: BoW, which\nprovides the number of occurrences of each word in the text corpus; term frequency-inverse document frequency(tf-idf),\nwhich provides a weighte

In [35]:
# Pass/fail verdict, numeric score, and the context passages stored on the result.
faithfulness_eval.passing, faithfulness_eval.score, faithfulness_eval.contexts

(True,
 1.0,
 ['3 Methods\n3.1 Preprocessing steps\nIn order to obtain consistent results, a data standardization process known as Text Normalization was performed, which,\nin addition to eliminating non-alphanumeric characters in the text, includes some of the most commonly used techniques\nin NLP:\n• Stop Words:we removed words there is an agreement they do not contribute to the models learning process in\nthe context of the problem addressed; for instance, articles and prepositions.\n• Stemming: this technique was used to reduce words to their root.\n• Tokenization and Padding:as usual in text processing tasks, we performed tokenization and padding, when\nrequired, for words and sentences representation.\nSubsequently, we decided to compare some of the most common techniques regarding text representation: BoW, which\nprovides the number of occurrences of each word in the text corpus; term frequency-inverse document frequency(tf-idf),\nwhich provides a weighted measure of the importa

In [45]:
# Run the answer-relevancy evaluator on the same `response`, this time together with `query`.
relevancy_eval = relevancy_evaluator.evaluate_response(query=query, response=response)
relevancy_eval

EvaluationResult(query='What is the main contribution of Kevin Martinez to NLP?', contexts=None, response="Kevin Martinez's main contribution to NLP, as detailed in the provided information, involves developing and experimenting with various models for detecting fake news in Spanish. His work includes preprocessing text data through normalization, stop-word removal, stemming, tokenization, and padding. Additionally, he compared different text representation techniques such as Bag of Words, TF-IDF, and word embeddings. Martinez also implemented and evaluated both classical machine learning models (SVM, Random Forest, Gradient Boosting Tree, and MLP) and deep learning models (LSTM-RNN and CNN) for this task. His research includes using both trainable and fixed embedding layers, including a Transfer Learning approach with pre-trained GloVe embeddings. The experiments were conducted on Spanish datasets and also involved validating models with a translated English dataset.", passing=None, f

In [46]:
# Relevancy score and the evaluator's written feedback.
relevancy_eval.score, relevancy_eval.feedback

(1.0,
 "1. Does the provided response match the subject matter of the user's query?\n   - The response does match the subject matter of the user's query. It specifically focuses on Kevin Martinez's contributions to Natural Language Processing (NLP) by detailing his work on detecting fake news in Spanish, which is a relevant and specific area within NLP.\n\n2. Does the provided response attempt to address the focus or perspective on the subject matter taken on by the user's query?\n   - The response attempts to address the focus of the user's query by providing detailed information about Kevin Martinez's research methods, the models he used, the datasets he experimented with, and the techniques he applied. This gives a comprehensive view of his contributions to the field of NLP, specifically in the context of fake news detection.\n\nFeedback: The response is detailed and relevant, providing specific information about Kevin Martinez's contributions to NLP, particularly in the area of fak