In [51]:
%load_ext autoreload
%autoreload 2

import qdrant_client
from utils import get_milelion_urls, extract_milelion
import glob

from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings, StorageContext, VectorStoreIndex, PromptTemplate, get_response_synthesizer
from llama_index.core.readers.json import JSONReader
from llama_index.llms.openai import OpenAI
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
# process the urls - take 
def get_milelion_urls_for_year(year: int) -> list[str]:
    all_urls = get_milelion_urls()
    year_urls = [url for url in all_urls if f"/{year}/" in url and not url.endswith((".jpg", ".png", ".webp", ".jpeg"))]
    return year_urls

urls_2024 = get_milelion_urls_for_year(2024)
print(len(urls_2024))
print(urls_2024)

Getting Sitemap for https://milelion.com/post-sitemap1.xml
Found 16007 links
594
['https://milelion.com/2024/10/17/qatar-airways-offering-up-to-60-bonus-on-avios-purchases/', 'https://milelion.com/2024/10/18/accor-plus-member-for-a-day-how-it-works/', 'https://milelion.com/2024/10/18/what-happens-to-my-credit-card-points-when-i-die/', 'https://milelion.com/2024/10/18/milelion-weekly-deal-summary-12-18-oct-24/', 'https://milelion.com/2024/07/11/world-of-hyatt-offering-2x-points-at-hyatt-place-and-hyatt-house/', 'https://milelion.com/2024/10/17/review-maybank-horizon-visa-signature-card/', 'https://milelion.com/2024/01/09/whats-the-best-credit-card-for-hospital-or-medical-bills/', 'https://milelion.com/2024/10/16/2024-edition-best-credit-cards-for-pharmacies/', 'https://milelion.com/2024/10/16/full-list-of-priority-pass-dragonpass-restaurants-at-changi-airport/', 'https://milelion.com/2024/10/10/dcs-flex-card-review-weird-flex-but-ok/', 'https://milelion.com/2024/10/15/krisflyer-spontane

In [25]:
# chunk the urls in batches of 10
def chunk_list(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

url_chunks = list(chunk_list(urls_2024, 10))

for i, chunk in enumerate(url_chunks):
    print(f"Chunk {i}: {chunk}")
    await extract_milelion(chunk)


Chunk 0: ['https://milelion.com/2024/10/17/qatar-airways-offering-up-to-60-bonus-on-avios-purchases/', 'https://milelion.com/2024/10/18/accor-plus-member-for-a-day-how-it-works/', 'https://milelion.com/2024/10/18/what-happens-to-my-credit-card-points-when-i-die/', 'https://milelion.com/2024/10/18/milelion-weekly-deal-summary-12-18-oct-24/', 'https://milelion.com/2024/07/11/world-of-hyatt-offering-2x-points-at-hyatt-place-and-hyatt-house/', 'https://milelion.com/2024/10/17/review-maybank-horizon-visa-signature-card/', 'https://milelion.com/2024/01/09/whats-the-best-credit-card-for-hospital-or-medical-bills/', 'https://milelion.com/2024/10/16/2024-edition-best-credit-cards-for-pharmacies/', 'https://milelion.com/2024/10/16/full-list-of-priority-pass-dragonpass-restaurants-at-changi-airport/', 'https://milelion.com/2024/10/10/dcs-flex-card-review-weird-flex-but-ok/']
[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🕸️ Crawling https://milelion.com/2

  Expected `PromptTokensDetails` but got `dict` with value `{'audio_tokens': None, 'cached_tokens': 0}` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


[LOG] Extracted 4 blocks from URL: https://milelion.com/2024/06/02/guide-how-to-do-a-oneworld-lounge-safari-at-changi-airport/ block index: 0
[LOG] 🚀 Extraction done for https://milelion.com/2024/06/02/guide-how-to-do-a-oneworld-lounge-safari-at-changi-airport/, time taken: 8.91 seconds.
[LOG] 🚀 Content extracted for https://milelion.com/2024/08/03/dragonpass-adds-7-new-restaurants-at-singapore-changi-airport/, success: True, time taken: 8.85 seconds
[LOG] 🔥 Extracting semantic blocks for https://milelion.com/2024/08/03/dragonpass-adds-7-new-restaurants-at-singapore-changi-airport/, Strategy: AsyncWebCrawler
[LOG] 🚀 Extraction done for https://milelion.com/2024/08/03/dragonpass-adds-7-new-restaurants-at-singapore-changi-airport/, time taken: 18.25 seconds.
[LOG] ✅ Crawled https://milelion.com/2024/03/03/the-uob-prvi-miles-mastercard-visas-airport-limo-benefit/ successfully!
[LOG] 🚀 Crawling done for https://milelion.com/2024/03/03/the-uob-prvi-miles-mastercard-visas-airport-limo-benefi

In [39]:
# Initialize Qdrant client
client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333
)

# Create a collection
collection_name = "milelion_docs"
vectors_size = 768  # Dimensions of vectors to store in Qdrant
distance = qdrant_client.models.Distance.COSINE

client.recreate_collection(
    collection_name=collection_name,
    vectors_config=qdrant_client.models.VectorParams(size=vectors_size, distance=distance),
)

# Set up vector store
Settings.embed_model = FastEmbedEmbedding(model_name="BAAI/bge-base-en-v1.5")
Settings.llm = OpenAI()
vector_store = QdrantVectorStore(client=client, collection_name=collection_name)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Load JSON and create an index
def load_and_index_json(directory_path):
    reader = JSONReader(
        levels_back=0,
        collapse_length=None,
        ensure_ascii=False,
        is_jsonl=False,
        clean_json=True
    )

    # Find all JSON files in the specified directory
    json_files = glob.glob(os.path.join(directory_path, "*.json"))

    # Load the data from each JSON file
    documents = []
    for json_file in json_files:
        documents.extend(reader.load_data(input_file=json_file, extra_info={}))

    # Create an index for querying
    index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
    return index

# Specify the directory containing your JSON files
json_directory = ".data/"

# Load JSON and create an index
index = load_and_index_json(json_directory)
print("Indexing completed!")

  client.recreate_collection(
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 73584.28it/s]


Indexing completed!


In [52]:
qa_prompt_tmpl = (
    "Context information is below.\n"
    "-------------------------------"
    "{context_str}\n"
    "-------------------------------"
    "Given the context information and not prior knowledge,"
    "answer the query. Please be concise, and complete.\n"
    "If the context does not contain an answer to the query,"
    "respond with \"I don't know!\"."
    "Query: {query_str}\n"
    "Answer: "
)
qa_prompt = PromptTemplate(qa_prompt_tmpl)

retriever = VectorIndexRetriever(index=index, similarity_top_k=5, sparse_top_k=12)
response_synthesizer = get_response_synthesizer(
    llm = OpenAI(),
    text_qa_template = qa_prompt
)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)


In [58]:
from pprint import pprint

pprint(query_engine.query(
    """
    I want to buy a ticket to Tokyo. What are the best options for credit cards? Why?
    """
).response)

('The best options for credit cards to buy a ticket to Tokyo would be those '
 'that offer high miles per dollar (mpd) rewards for airline ticket purchases. '
 'Cards like UOB Lady’s Card, DBS WWMC, HSBC Revolution, and Maybank Visa '
 'Infinite are good choices. These cards can help you earn more miles for your '
 'ticket purchase, allowing you to accumulate rewards for future trips.')
