In [None]:
import os
import sys
from pathlib import Path
import re

import requests

import faiss
import numpy as np
import pickle

from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

import ollama

# Put the directory one level above the current working directory on the
# import path so the project-local `src` package can be imported below.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, ".."))
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# NOTE(review): the output recorded for this cell ends with
# "ModuleNotFoundError: No module named 'src'", i.e. `parent_dir` did not
# contain a `src` package on that run -- confirm the notebook is started from
# a directory directly below the one that holds `src/`.
from src.utils import IATAProcessor
processor = IATAProcessor()


  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'src'

****
# Chunking without cleaning stopwords
****

In [None]:
# Redo chunking without cleaning stopwords
# data_dir = Path("../data")
# pdf_files = sorted(data_dir.glob("iata-annual-review-*.pdf"))

# all_chunks= []

# for pdf in pdf_files:
#     year_match = re.search(r"(\d{4})", pdf.stem)
#     if not year_match:
#         continue
#     year = int(year_match.group(1))

#     print(f"Processing {pdf.name} for year {year}")
#     raw_text = processor.extract_text_from_pdf(pdf)
#     chunks = processor.chunk_text(raw_text, chunk_size=5)
#     all_chunks.append(chunks)


Processing iata-annual-review-2005.pdf for year 2005
Processing iata-annual-review-2006.pdf for year 2006
Processing iata-annual-review-2007.pdf for year 2007
Processing iata-annual-review-2008.pdf for year 2008
Processing iata-annual-review-2009.pdf for year 2009
Processing iata-annual-review-2010.pdf for year 2010
Processing iata-annual-review-2011.pdf for year 2011
Processing iata-annual-review-2012.pdf for year 2012
Processing iata-annual-review-2013.pdf for year 2013
Processing iata-annual-review-2014.pdf for year 2014
Processing iata-annual-review-2015.pdf for year 2015
Processing iata-annual-review-2016.pdf for year 2016
Processing iata-annual-review-2017.pdf for year 2017
Processing iata-annual-review-2018.pdf for year 2018
Processing iata-annual-review-2019.pdf for year 2019
Processing iata-annual-review-2020.pdf for year 2020
Processing iata-annual-review-2021.pdf for year 2021
Processing iata-annual-review-2022.pdf for year 2022
Processing iata-annual-review-2023.pdf for yea

In [None]:
#Flatten the list
# from itertools import chain
# flat_chunks = list(chain.from_iterable(all_chunks))

In [None]:
# Path("../artifacts").mkdir(exist_ok=True)
# with open("../artifacts/raw_chunks.pkl", "wb") as f:
#     pickle.dump(flat_chunks, f)
# print("Saved yearly_topic_data to artifacts/raw_chunks.pkl")

Saved yearly_topic_data to artifacts/raw_chunks.pkl.pkl


In [None]:
# Load the previously pickled text chunks from <parent_dir>/artifacts.
# NOTE: unpickling can execute arbitrary code -- only load artifacts that were
# produced by this project.
chunk_file_path = os.path.join(parent_dir, "artifacts", "raw_chunks.pkl")
flat_chunks = pickle.loads(Path(chunk_file_path).read_bytes())

****
# Embedding chunks + FAISS index
****

In [24]:
# Embed every text chunk with the SentenceTransformer `model` created in the
# first cell; the FAISS index cell that follows consumes `embeddings`.
embeddings = model.encode(flat_chunks)

In [None]:
# Build FAISS index
# FAISS's Python bindings expect a C-contiguous float32 matrix of shape
# (n_vectors, dimension). Normalise once and reuse the matrix for both the
# dimension lookup and the insert, instead of trusting the encoder's dtype.
embedding_matrix = np.ascontiguousarray(embeddings, dtype=np.float32)
dimension = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)  # Store in-memory vector DB

# Retrieval maps FAISS row ids back to positions in `flat_chunks`, so the two
# must have the same length.
assert index.ntotal == len(flat_chunks), "FAISS index size does not match number of chunks"

faiss.write_index(index, "../artifacts/chunk_faiss_index.faiss")

****
# Query function
****

In [32]:
def retrieve_similar_chunks(query, model, index, chunks, k=5):
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Search string; it is embedded with ``model``.
        model: Object exposing ``encode(list_of_str)`` that returns one
            embedding per input string.
        index: Object exposing a FAISS-style ``search(vectors, k)`` that
            returns a ``(distances, indices)`` pair.
        chunks: Sequence of chunks, position-aligned with the rows of ``index``.
        k: Maximum number of chunks to return.

    Returns:
        A list of at most ``k`` items from ``chunks``, in the order the index
        returned them. Fewer than ``k`` items are returned when the index
        holds fewer than ``k`` vectors.
    """
    # FAISS expects float32, C-contiguous query vectors.
    query_vec = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
    _distances, indices = index.search(query_vec, k)
    # FAISS pads the result with -1 when it finds fewer than k neighbours;
    # indexing chunks[-1] would silently return the last chunk as a bogus hit.
    return [chunks[i] for i in indices[0] if i >= 0]

****
# Use ollama for answer generation
****

In [None]:
def generate_answer_with_ollama(query, context_chunks, model="mistral"):
    """Answer ``query`` with an Ollama chat model, grounded on report excerpts.

    Args:
        query: The question to answer.
        context_chunks: Iterable of excerpt strings; they are joined with a
            blank line between them and placed in the prompt's context section.
        model: Name of the Ollama model passed to ``ollama.chat``.

    Returns:
        The ``["message"]["content"]`` field of the chat response.
    """
    excerpts = "\n\n".join(context_chunks)
    user_message = {
        "role": "user",
        "content": f"""You are an aviation strategy analyst. Based on the following IATA report excerpts, answer the question:

Context:
{excerpts}

Question: {query}

Answer:""",
    }

    # The whole prompt is sent as a single user turn.
    reply = ollama.chat(model=model, messages=[user_message])
    return reply["message"]["content"]


****
# Full query flow
****

In [41]:
# Pull the 'mistral' model through the ollama client; 'mistral' is also the
# default model name used by generate_answer_with_ollama.
ollama.pull('mistral')

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [43]:
# Reload the persisted index and chunks (if needed) so this cell can run
# without re-embedding.
# NOTE: unpickling can execute arbitrary code -- only load artifacts that were
# produced by this project.
index = faiss.read_index("../artifacts/chunk_faiss_index.faiss")
chunks = pickle.loads(Path("../artifacts/raw_chunks.pkl").read_bytes())

# Ask a question: retrieve the five closest chunks, then let the LLM answer
# from them.
query = "What are the major sustainability initiatives"
top_chunks = retrieve_similar_chunks(
    query=query,
    model=model,
    index=index,
    chunks=chunks,
    k=5,
)
answer = generate_answer_with_ollama(query, context_chunks=top_chunks)

print("Generated Answer:\n", answer)

Generated Answer:
 1. Aviation Industry's Commitment to Carbon Neutral Growth (CNG): The aviation industry has committed to achieve carbon neutral growth from 2020, and already has a plan in place to achieve this goal by 2050.

2. Development of Sustainable Biofuels: Aviation is actively working towards the development of sustainable biofuels as an alternative to traditional fossil fuels. The industry is exploring three possible synthetic fuels derived from coal, natural gas, and biomass, with biomass offering the best emissions reductions.

3. Four-Pillar Emissions Strategy: This strategy, adopted by the entire aviation industry in 2008, focuses on reducing emissions through four key pillars: operational efficiency, infrastructure, market-based measures, and alternative jet fuels. The strategy is estimated to save $5 billion in fuel costs.

4. Infrastructure Improvements: Governments are urged to support essential infrastructure improvements that could help reduce aviation emissions. 