# Environment setup

In [1]:
# Install necessary libraries
!pip install -q openai langchain langchain-openai langchain-community openai-whisper sentence-transformers pdf2image
!apt-get install poppler-utils
!pip install --upgrade Pillow

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/800.5 kB[0m [31m25.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.5/383.5 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.7/49.7 kB[0m [31m4.9 MB/s[0

In [3]:
# Switch the working directory to the project folder so the relative paths used
# in later cells (audio file, PDF, transcript/ and images/ folders) resolve there.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — no mount cell is visible here; confirm.
%cd /content/drive/MyDrive/GenAI/RAG/CAPSTONE PROJECT - MultiModal Starbucks Finance

/content/drive/MyDrive/GenAI/RAG/CAPSTONE PROJECT - MultiModal Starbucks Finance


In [4]:
# Fetch the OpenAI API key from the Colab secrets store instead of hardcoding it
from google.colab import userdata

api_key = userdata.get("genai_course")

In [5]:
# Import libraries
from langchain_openai import ChatOpenAI
from openai import OpenAI
from IPython.display import display, Markdown
from sentence_transformers import SentenceTransformer
import whisper
import pandas as pd
import base64
from pdf2image import convert_from_path
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import os
import torch

  from tqdm.autonotebook import tqdm, trange


# Audio Transcription

In [6]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"
print(f"Using device: {device}")

Using device: cuda


In [7]:
# List the Whisper model names this installed version knows about;
# the name passed to whisper.load_model below is taken from this list.
whisper.available_models()

['tiny.en',
 'tiny',
 'base.en',
 'base',
 'small.en',
 'small',
 'medium.en',
 'medium',
 'large-v1',
 'large-v2',
 'large-v3',
 'large',
 'large-v3-turbo',
 'turbo']

In [10]:
# Load the Whisper checkpoint onto the selected device, then transcribe the earnings-call audio
input_file = "starbucks-q3.mp3"
model = whisper.load_model("large-v3-turbo", device=device)
result = model.transcribe(input_file)

100%|█████████████████████████████████████| 1.51G/1.51G [00:24<00:00, 65.9MiB/s]
  checkpoint = torch.load(fp, map_location=device)


In [13]:
# Pull the plain-text transcript out of the Whisper result and print it under a heading
transcription_text = result["text"]
print("Transcription text:", transcription_text, sep="\n")

Transcription text:
 2024. And with that, I'll now tell you the call over to Lakshman. Thank you, Tiffany, and thank you for joining us this afternoon. Let me start by laying out our results for this quarter. Our Q3 total company revenue was $9.1 billion, up 1% year-over-year, and 6% over Q2. Our global comparable store sales declined 3% year-over-year, driven by a negative 2% comp growth in North America and a negative 14% comp growth in China, and partially offset by strong performance in Japan. Our global operating margins contracted by 70 basis points to 16.7%, and overall earnings per share for the quarter was $0.93. Our total company results were in line with guidance, but international performance, particularly in China, was challenged. We are not satisfied with the results, but our actions are making an impact. Leading business and operational indicators are trending in the right direction ahead of our financial results, and our runway for improvement is long. We see green shoo

In [14]:
# Persist the transcript to transcript/transcript.txt so later cells can reload it
# from disk.
os.makedirs("transcript", exist_ok=True)
# Write with an explicit UTF-8 encoding: the transcript may contain non-ASCII
# characters, and the platform default encoding is not guaranteed to handle them.
with open("transcript/transcript.txt", "w", encoding="utf-8") as f:
  f.write(transcription_text)

# Embedding the Audio

In [22]:
# Load the saved transcript and split it into fixed-size chunks of `chunk_size`
# characters (the final chunk may be shorter).
# NOTE(review): fixed-width slicing cuts words and sentences at chunk boundaries.
with open("transcript/transcript.txt", "r", encoding="utf-8") as f:
  text = f.read()

chunk_size = 250
audio_chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# Summary of the chunks
print(f"Number of chunks: {len(audio_chunks)}")
# Preview one chunk, labelled with its real index. The index is clamped so a
# short (or empty) transcript does not raise IndexError.
if audio_chunks:
  sample_idx = min(3, len(audio_chunks) - 1)
  print(f"Chunk {sample_idx}: {audio_chunks[sample_idx]}")

Number of chunks: 58
First chunk: fied with the results, but our actions are making an impact. Leading business and operational indicators are trending in the right direction ahead of our financial results, and our runway for improvement is long. We see green shoots in our U.S. busin


In [23]:
# Instantiate the CLIP sentence-transformer on the chosen device and embed every transcript chunk
clip_model = SentenceTransformer("clip-ViT-B-32", device=device)
audio_embeddings = clip_model.encode(audio_chunks)

In [25]:
# Sanity check: print the .shape of the transcript-chunk embeddings
print(f"Shape of embeddings: {audio_embeddings.shape}")

Shape of embeddings: (58, 512)


# PDF to image transformation

In [27]:
# Input PDF and the directory that will hold the rendered page images
pdf_path, output_folder = "3Q24-Earnings-Release.pdf", "images"
os.makedirs(output_folder, exist_ok=True)

In [28]:
# Render every PDF page, write each one to disk as a JPEG, and remember the
# file paths in page order
images = convert_from_path(pdf_path)
image_paths = []

for page_number, page_image in enumerate(images, start=1):
  page_file = os.path.join(output_folder, f"page_{page_number}.jpg")
  page_image.save(page_file, "JPEG")
  image_paths.append(page_file)

print(f"Number of images/pages is {len(image_paths)}")
print(f"Examples of 3 image paths {image_paths[:3]}")

Number of images/pages is 17
Examples of 3 image paths ['images/page_1.jpg', 'images/page_2.jpg', 'images/page_3.jpg']


# Embedding the images

In [30]:
# Embed each page image with the CLIP model.
# Iterate over image_paths (not os.listdir) so that image_embeddings[i] always
# corresponds to image_paths[i]. os.listdir returns entries in arbitrary order
# and would also pick up stale .jpg files left in the folder, so indices coming
# out of the similarity search would point at the wrong pages when they are
# later looked up in image_paths.
image_embeddings = []

for image_path in image_paths:
  # The context manager closes the file handle once the page has been encoded
  with Image.open(image_path) as image:
    embedding = clip_model.encode(image)
  image_embeddings.append(embedding)

In [33]:
# Sanity check on the image embeddings.
# Note: despite its "Shape" label, the first line prints the NUMBER of
# embeddings (len of the list); the second prints the shape of one embedding.
print(f"Shape of image embeddings: {len(image_embeddings)}")
print(f"The shape of embeddings is {image_embeddings[0].shape}")

Shape of image embeddings: 17
The shape of embeddings is (512,)


# Retrieval system

In [35]:
# Define the user's question and embed it with the same CLIP model that was
# used for the transcript chunks and the page images
query = "what are the short term risks for the company"
query_embeddings = clip_model.encode(query)

In [40]:
# Cosine similarity between the query vector and every transcript-chunk vector.
# The query is presented as a single-row matrix; the one result row is flattened to 1-D.
audio_similarities = cosine_similarity(query_embeddings.reshape(1, -1), audio_embeddings).ravel()

In [44]:
# Indices of the k (20) highest-scoring transcript chunks, best match first.
# Despite its name, top_k_audio_similarities holds chunk indices, not scores.
k = 20
top_k_audio_similarities = audio_similarities.argsort()[::-1][:k]

In [48]:
# Show the indices of the 5 best-matching transcript chunks and their similarity scores
print(f"The top 5 most similar chunks are {top_k_audio_similarities[:5]}")
print(f"The corresponding similarities are {audio_similarities[top_k_audio_similarities[:5]]}")

The top 5 most similar chunks are [33 49 50 46 48]
The corresponding similarities are [0.8553849  0.84822136 0.84667623 0.8423403  0.8272289 ]


In [66]:
# Cosine similarity between the query vector and every page-image vector.
# The query is presented as a single-row matrix; the one result row is flattened to 1-D.
image_similarities = cosine_similarity(query_embeddings.reshape(1, -1), image_embeddings).ravel()

In [68]:
# Indices of the k (5) highest-scoring page images, best match first.
# Despite its name, top_k_image_similarities holds image indices, not scores.
k = 5
top_k_image_similarities = image_similarities.argsort()[::-1][:k]

In [69]:
# Print the indices (not the images themselves) of the 5 best-matching pages
# and their similarity scores
print(f"The top 5 most similar images are {top_k_image_similarities[:5]}")
print(f"The corresponding similarities are {image_similarities[top_k_image_similarities[:5]]}")

The top 5 most similar images are [14 16  1 13  6]
The corresponding similarities are [0.2644042  0.2597667  0.25643957 0.2551114  0.25334686]


# Prepare the context

In [71]:
# Build the textual context: the k best transcript chunks, joined with single
# spaces in similarity order (best match first)
k = 5
text_context = " ".join(audio_chunks[chunk_id] for chunk_id in top_k_audio_similarities[:k])

In [73]:
# Read the top-k retrieved page images from disk and base64-encode them so they
# can be embedded in the API request.
# NOTE: idx is a position in image_embeddings; the lookup in image_paths is only
# correct when the embeddings were built in image_paths order.
base64frame = []
for idx in top_k_image_similarities[:k]:
  image_path = image_paths[idx]
  with open(image_path, "rb") as image_file:
    base64frame.append(base64.b64encode(image_file.read()).decode('utf-8'))
# Show only how many images were encoded; echoing the raw base64 strings floods
# the notebook output.
len(base64frame)

Output hidden; open in https://colab.research.google.com to view.

# Generative System

In [82]:
# Create the OpenAI API client, authenticated with the key read from the Colab secrets
client = OpenAI(api_key = api_key)

In [97]:
# Assemble the system prompt line by line. The leading empty entry reproduces
# the newline that opens the prompt; the user's question is interpolated into
# the second instruction line.
system_prompt = "\n".join([
    "",
    "You are a financial advisor expert in publicly traded companies.",
    f"you must answer the {query}",
    "You explain in clear terms with the data available only",
])

In [98]:
# Prepare the list of image parts for the chat request: one "image_url" entry
# per retrieved page, carrying the image inline as a base64 data URL.
# The pages were saved as JPEG files, so the data URL must declare image/jpeg.
image_data_list = [
    {"type": "image_url",
     "image_url": {"url": f"data:image/jpeg;base64,{img}",
                   "detail": "high"}}
    for img in base64frame
]
# Show only the number of entries; echoing the full payload floods the notebook output.
len(image_data_list)

Output hidden; open in https://colab.research.google.com to view.

In [99]:
# User message = the retrieved transcript text first, followed by every retrieved page image
user_message_content = [{"type": "text", "text": text_context}]
user_message_content.extend(image_data_list)

In [100]:
# Ask the chat model for an answer: the system prompt carries the instructions
# and the question, the user message carries the retrieved text and images.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.2,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message_content},
    ],
)

In [101]:
# Render the first choice's message text as Markdown in the notebook
display(Markdown(response.choices[0].message.content))

Based on the provided financial data and context, here are some short-term risks for Starbucks Corporation:

1. **International Market Challenges**: The company is experiencing mixed performance internationally, with notable weaknesses in regions like the Middle East, Southeast Asia, and parts of Europe. Misperceptions about the brand and economic pressures on consumers in these areas could hinder growth.

2. **Declining Comparable Store Sales**: In the North America segment, comparable store sales decreased by 2%, and in the International segment, they declined by 7%. This trend could indicate weakening customer demand and may affect overall revenue growth.

3. **Increased Operating Expenses**: Operating income has decreased due to rising store operating expenses and investments in wages and benefits. This could pressure margins if revenue growth does not keep pace.

4. **Currency Fluctuations**: The company’s performance is affected by foreign currency exchange rates, which can lead to unfavorable impacts on revenues and profits, particularly in international markets.

5. **Competitive Landscape**: Increased competition in the coffee and beverage market may lead to pricing pressures and reduced market share, especially in key international markets like China.

6. **Operational Efficiency**: The contraction in operating margins suggests potential inefficiencies or increased costs that could impact profitability if not addressed.

7. **Consumer Spending Trends**: Economic conditions affecting consumer spending, particularly in Europe, could lead to reduced discretionary spending on premium products like those offered by Starbucks.

These risks could impact Starbucks' short-term financial performance and overall market position.