In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq


# Read the Groq API key from the environment so that no secret is stored in
# the notebook. A missing GROQ_API_KEY fails here with a KeyError naming the
# variable, rather than later with an authentication error.
client = Groq(api_key=os.environ["GROQ_API_KEY"])




# <span style="color: aquamarine;">I. Text Retrieval</span>

### <span style="color: yellow;">Load Documents</span>

In [3]:
# Company whose chunk file is loaded; it is also the sub-folder name under data/.
company_name = "delta"
chunks_path = f"data/{company_name}/chunks.csv"

# Table of text chunks that the retrieval step searches over.
document_store = pd.read_csv(chunks_path)
document_store.head()

Unnamed: 0,chunk_id,company,source_file,chunk_text,word_count
0,chunk_001,delta,annual_report_2024,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,600
1,chunk_002,delta,annual_report_2024,statements of the registrant included in the f...,600
2,chunk_003,delta,annual_report_2024,actual results to differ materially from histo...,600
3,chunk_004,delta,annual_report_2024,six continents. Our domestic network is center...,600
4,chunk_005,delta,annual_report_2024,"consumer brand, including: + Recognized as the...",600


In [4]:
# Print column dtypes / non-null counts, then preview the identifying columns.
document_store.info()
document_store[['chunk_id', 'company', 'source_file']].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   chunk_id     159 non-null    object
 1   company      159 non-null    object
 2   source_file  159 non-null    object
 3   chunk_text   159 non-null    object
 4   word_count   159 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 6.3+ KB


Unnamed: 0,chunk_id,company,source_file
0,chunk_001,delta,annual_report_2024
1,chunk_002,delta,annual_report_2024
2,chunk_003,delta,annual_report_2024
3,chunk_004,delta,annual_report_2024
4,chunk_005,delta,annual_report_2024


### <span style="color: yellow;">Fit TF-IDF</span>

In [5]:
# Fit a TF-IDF vocabulary on the chunk texts (English stop words removed) and
# keep the resulting matrix; retrieve_chunks compares questions against it.
vectorizer = TfidfVectorizer(stop_words="english")
chunk_tfidf = vectorizer.fit_transform(document_store["chunk_text"])
chunk_tfidf.shape

(159, 6100)

### <span style="color: yellow;">Retrieval</span>

In [6]:
def retrieve_chunks(question, top_k=7):
    """Return the chunks of ``document_store`` most similar to ``question``.

    The question is transformed with the fitted ``vectorizer`` and compared
    against every row of ``chunk_tfidf`` using cosine similarity.

    Parameters
    ----------
    question : str
        The user question to search for.
    top_k : int or None, default 7
        Maximum number of chunks to return. ``None`` returns every chunk.

    Returns
    -------
    pandas.DataFrame
        A copy of the selected ``document_store`` rows, ordered from highest
        to lowest similarity, with an added ``score`` column.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    # A negative top_k would silently slice off the *last* |top_k| entries and
    # return almost every chunk, so reject it explicitly.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    q_vec = vectorizer.transform([question])
    sims = cosine_similarity(q_vec, chunk_tfidf).flatten()

    # Stable sort on the negated scores: highest similarity first, and chunks
    # with equal scores keep their document order, so results are reproducible.
    top_idx = np.argsort(-sims, kind="stable")[:top_k]

    results = document_store.iloc[top_idx].copy()
    results["score"] = sims[top_idx]
    return results


### <span style="color: yellow;">Prompt Builder</span>

In [7]:
def build_prompt(question, retrieved):
    """Assemble the LLM prompt from the question and the retrieved chunks.

    Each row of ``retrieved`` contributes one section showing its
    ``chunk_id``, ``source_file`` and ``chunk_text``. The sections are placed
    under a CONTEXT heading, after the instructions and the question.
    Leading and trailing whitespace of the final prompt is stripped.
    """
    sections = [
        f"\n\n---\nChunk ID: {chunk['chunk_id']}\n"
        f"Source: {chunk['source_file']}\n"
        f"Text:\n{chunk['chunk_text']}\n"
        for _, chunk in retrieved.iterrows()
    ]
    context = "".join(sections)

    instructions = (
        "You are an AI assistant answering ONLY from provided chunks.\n"
        "If not in chunks, say: 'The documents do not contain this information.'\n"
        "Cite chunks using (chunk: chunk_XXXX).\n"
    )
    prompt = f"\n{instructions}\nQUESTION:\n{question}\n\nCONTEXT:\n{context}\n"
    return prompt.strip()


### <span style="color: yellow;">LLM QA Function</span>

In [8]:
def answer_question(question, top_k=5, model="llama-3.3-70b-versatile"):
    """Answer ``question`` from the document chunks using the Groq chat API.

    Retrieves the ``top_k`` most similar chunks, builds a prompt around them
    with ``build_prompt``, and sends that prompt as a single user message to
    ``model`` at temperature 0.1.

    Returns a tuple ``(answer, retrieved)``: the content of the first
    completion choice and the DataFrame of retrieved chunks.
    """
    context_chunks = retrieve_chunks(question, top_k)
    chat_messages = [
        {"role": "user", "content": build_prompt(question, context_chunks)}
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0.1,
    )

    return completion.choices[0].message.content, context_chunks


# <span style="color: aquamarine;">II. Testing</span>

In [9]:
# Question sets used by the demo cells below, grouped by topic.
company_snap_questions = [
    "What is Delta Airlines?",
    "What are the main business segments?",
    "How does Delta generate revenue?",
]

risk_factors_questions = ["What are the main risks?"]

esg_questions = [
    "What environmental goals does the Delta describe?",
    "What sustainability targets does it mention?",
]

# Every entry needs its own trailing comma: adjacent string literals without
# one are concatenated by Python into a single question.
custom_questions = [
    "How is Delta's DEI strategy?",
    "What is the major holder breakdown?",
    "What are some awards that Delta achieves?",
    "What is Delta known for",
]

for_presentation = [
    "How is Delta's DEI strategy?",
    "What environmental goals does the Delta describe?",
]



### <span style="color: yellow;">Presentation Demo</span>

In [110]:
# Ask each presentation question and print the question, the model's answer,
# and the chunk_id / source_file / score of the chunks that were retrieved.
for q in for_presentation:
    print("="*80)
    print("QUESTION:", q)
    answer, chunks = answer_question(q)
    print("\nANSWER:\n", answer)
    print("\nTop Chunks:\n", chunks[["chunk_id", "source_file", "score"]])

QUESTION: How is Delta's DEI strategy?

ANSWER:
 Delta's DEI strategy is focused on three main areas: valuing diversity, pursuing equity, and promoting inclusion. The company aims to create a work environment where all employees feel valued, respected, and empowered to contribute their best work (chunk_128). Delta's approach to DEI includes initiatives such as expanding recruitment efforts, seeking qualified interview panelists from diverse backgrounds, and promoting leadership development programs to reflect the diversity of its workforce (chunk_128). The company also strives to create merit-based access to career opportunities by considering skills as equivalencies to education (chunk_128). Additionally, Delta partners with organizations to support underrepresented groups, such as the Partnership for Your Success program with the U.S. Army, to connect enlisting soldiers and Reserve Officers' Training Corps cadets to interviews and potential full-time jobs following their military ser

### <span style="color: yellow;">Company Snapshot</span>

In [107]:
# Ask each company-snapshot question and print the question, the model's
# answer, and the chunk_id / source_file / score of the retrieved chunks.
for q in company_snap_questions:
    print("="*80)
    print("QUESTION:", q)
    answer, chunks = answer_question(q)
    print("\nANSWER:\n", answer)
    print("\nTop Chunks:\n", chunks[["chunk_id", "source_file", "score"]])

QUESTION: What is Delta Airlines?

ANSWER:
 Delta Airlines is an airline company that operates in the global airline industry, providing passenger and cargo transportation services. (chunk_032)

Top Chunks:
      chunk_id         source_file     score
31  chunk_032  annual_report_2024  0.200086
12  chunk_013  annual_report_2024  0.197075
7   chunk_008  annual_report_2024  0.194218
13  chunk_014  annual_report_2024  0.187220
4   chunk_005  annual_report_2024  0.154742
QUESTION: What are the main business segments?

ANSWER:
 The main business segments are our airline segment and our refinery segment (chunk: chunk_100).

Top Chunks:
      chunk_id         source_file     score
99  chunk_100  annual_report_2024  0.071287
22  chunk_023  annual_report_2024  0.069085
4   chunk_005  annual_report_2024  0.066864
44  chunk_045  annual_report_2024  0.066633
30  chunk_031  annual_report_2024  0.057390
QUESTION: How does Delta generate revenue?

ANSWER:
 Delta generates revenue from the following s

### <span style="color: yellow;">Risks</span>

In [10]:
# Ask each risk-factor question and print the question, the model's answer,
# and the chunk_id / source_file / score of the retrieved chunks.
for q in risk_factors_questions:
    print("="*80)
    print("QUESTION:", q)
    answer, chunks = answer_question(q)
    print("\nANSWER:\n", answer)
    print("\nTop Chunks:\n", chunks[["chunk_id", "source_file", "score"]])

QUESTION: What are the main risks?

ANSWER:
 The main risks are climate-related risks, which include physical risks and transition risks. (chunk_149)

Top Chunks:
       chunk_id         source_file     score
148  chunk_149     esg_report_2024  0.129929
147  chunk_148     esg_report_2024  0.110561
119  chunk_120     esg_report_2024  0.095101
44   chunk_045  annual_report_2024  0.077588
37   chunk_038  annual_report_2024  0.070753


### <span style="color: yellow;">ESG Goals</span>

In [109]:
# Ask each ESG question and print the question, the model's answer, and the
# chunk_id / source_file / score of the retrieved chunks.
for q in esg_questions:
    print("="*80)
    print("QUESTION:", q)
    answer, chunks = answer_question(q)
    print("\nANSWER:\n", answer)
    print("\nTop Chunks:\n", chunks[["chunk_id", "source_file", "score"]])
    

QUESTION: What environmental goals does the Delta describe?

ANSWER:
 The Delta describes the following environmental goals: 
- Achieving net-zero emissions by 2050 (chunk_140)
- Reducing greenhouse gas emissions (chunk_030)
- Decarbonizing airline operations and value chain (chunk_140)
- Transitioning the fleet to more fuel-efficient aircraft (chunk_140)
- Adopting new technologies and procedures to improve fuel efficiency (chunk_140)
- Collaborating to scale supply and reduce the cost of sustainable aviation fuel (SAF) (chunk_140)
- Achieving 100% net-zero supply chain by 2050 (chunk_140) 
(chunk_030, chunk_140, chunk_157, chunk_158)

Top Chunks:
       chunk_id         source_file     score
29   chunk_030  annual_report_2024  0.132841
156  chunk_157     esg_report_2024  0.121763
28   chunk_029  annual_report_2024  0.101059
157  chunk_158     esg_report_2024  0.083307
139  chunk_140     esg_report_2024  0.082636
QUESTION: What sustainability targets does it mention?

ANSWER:
 Delta's

### <span style="color: yellow;">Custom Questions</span>

In [97]:
# Ask each custom question and print the question, the model's answer, and
# the chunk_id / source_file / score of the retrieved chunks.
for q in custom_questions:
    print("="*80)
    print("QUESTION:", q)
    answer, chunks = answer_question(q)
    print("\nANSWER:\n", answer)
    print("\nTop Chunks:\n", chunks[["chunk_id", "source_file", "score"]])

QUESTION: How is Delta's DEI strategy?

ANSWER:
 Delta's DEI strategy is focused on three main areas: valuing diversity, pursuing equity, and promoting inclusion. The company aims to create a work environment where all employees feel valued, respected, and empowered to contribute their best work. (chunk_128) 

Delta's approach to DEI includes efforts to reinforce its talent pipeline, expand recruitment efforts, and seek qualified interview panelists from diverse backgrounds. The company also strives to create merit-based access to career opportunities by considering skills as equivalencies to education. (chunk_128)

Additionally, Delta promotes inclusion by building on its culture of listening and engagement, fostering greater understanding and a sense of belonging for all employees. The company also partners with organizations to support underrepresented groups, such as women and minorities, in pursuing careers in aviation. (chunk_128)

Top Chunks:
       chunk_id      source_file    

In [None]:
# Scratch cell for a one-off question: fill in `question` and run.
question = ''

# Skip the API call while the question is blank. A blank question shares no
# terms with any chunk, so retrieval could only hand the model arbitrary
# chunks and the request would be wasted.
if question.strip():
    print("="*80)
    print("QUESTION:", question)
    answer, chunks = answer_question(question)
    print("\nANSWER:\n", answer)
    print("\nTop Chunks:\n", chunks[["chunk_id", "source_file", "score"]])
else:
    print("Set `question` to a non-empty string before running this cell.")