# About Project

**This project on Starbucks Beverage Components is forked from the [original notebook](https://www.kaggle.com/code/ahmedredaahmedali/starbucks-visualisation) provided by [@ahmedredaahmedali](https://www.kaggle.com/ahmedredaahmedali)**

The current version is tailored for Exploratory Data Analysis and includes a full listing of beverages at the end.



**Dataset**

**[The Starbucks](https://www.kaggle.com/datasets/henryshan/starbucks/)** dataset contains nutritional information for 242 listed beverages.

This dataset serves as a comprehensive guide to the nutritional content of Starbucks beverages, making it a valuable resource for researchers, dietitians, and health-conscious consumers.

# 1. Libraries and Parameters

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots

from warnings import simplefilter
# Install a blanket "ignore" filter: every warning category is silenced for the
# rest of the session, including deprecation notices from the plotting libraries.
simplefilter("ignore")

# Hex colour strings. Each entry must be exactly "#RRGGBB": a stray trailing
# space makes the value an invalid colour for strict parsers such as matplotlib's.
colors = ["#8c0404", "#f25ed0", "#000000", "#16A085", "#34495E",
          "#21618C", "#512E5F", "#45B39D", "#AAB7B8", "#20B2AA",
          "#FF69B4", "#00CED1", "#FF7F50", "#7FFF00", "#DA70D6"]


# Note: A few additional libraries are imported in later cells, at the point where they are first needed.

# 2. Dataset 

## Importing Dataset

In [None]:
# Load the Starbucks beverage CSV from the Kaggle input directory
df = pd.read_csv(filepath_or_buffer="/kaggle/input/starbucks/starbucks.csv")

## Dataset Structure

In [None]:
# Data structure: print pandas' built-in summary of the frame
df.info()

In [None]:
# Dataset dimensions: df.shape is (rows, columns); columns are reported first
n_rows, n_cols = df.shape
print(f"Number of Columns: {n_cols}")
print(f"Number of Rows: {n_rows}")

In [None]:
# Preview the first five rows of the raw data
df.head(5)

In [None]:
# Summary statistics restricted to the non-numeric columns
df.describe(exclude = np.number)

In [None]:
# For every column, report the number of distinct values and show a sample of them
for name, series in df.items():
    distinct = series.unique()
    print(f"Column: {name}")
    print(f"Unique values ({len(distinct)}): {distinct[:10]}")  # sample capped at the first 10
    print("-" * 50)

## Data Preprocessing

In [None]:
# Remove leading and trailing whitespace from every column label
cols = df.columns  # handle on the labels as they were before stripping
df.columns = [label.strip() for label in df.columns]
df.dtypes

In [None]:
# Keep only rows with a usable caffeine value: drop NaN and the textual
# placeholders "Varies" / "varies".
caffeine_raw = df['Caffeine (mg)']
has_caffeine_value = caffeine_raw.notna() & ~caffeine_raw.isin(['Varies', 'varies'])

# .copy() yields an independent frame, so later column assignments on `df` do
# not write into a filtered slice (which raises SettingWithCopyWarning on
# pandas < 3.0 -- a warning the notebook's blanket "ignore" filter would hide).
df = df[has_caffeine_value].copy()

In [None]:
# Parse the listed measurement columns as numbers; any value that cannot be
# parsed becomes NaN (errors='coerce')
for numeric_col in ('Total Fat (g)', 'Caffeine (mg)'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

df.dtypes

In [None]:
# Preview the first five rows after preprocessing
df.head(5)

In [None]:
# Show every feature (column) name as a plain Python list
print(df.columns.tolist())

# 4. Data Visualisation

## Beverage Categories

In [None]:
# List the distinct values found in the Beverage_category column
df['Beverage_category'].unique()

In [None]:
# Number of beverages in each category, drawn as horizontal bars.
# The title is set on the axes seaborn returns, so it is guaranteed to belong
# to the same axes as the bars.
ax = sns.countplot(data=df, y='Beverage_category')
ax.set_title('Beverage categories count')
plt.show()

## Average Nutritional Content in Each Beverage Category

In [None]:
import textwrap

# Nutrient columns to chart -- one subplot per entry
nutrients = ['Calories', 'Total Fat (g)', 'Sugars (g)', 'Protein (g)']

# Mean of each nutrient within every beverage category
average_nutrients = df.groupby('Beverage_category')[nutrients].mean().reset_index()
average_nutrients['Beverage_category'] = average_nutrients['Beverage_category'].astype('category')

# Sorted category order for the bars, plus line-wrapped copies used as tick labels
sorted_categories = sorted(average_nutrients['Beverage_category'].unique())
wrapped_labels = [textwrap.fill(label, width=12) for label in sorted_categories]

# One row of axes per nutrient, stacked vertically
fig, axes = plt.subplots(len(nutrients), 1, figsize=(10, 10))
axes = axes.flatten()
last_row = len(nutrients) - 1

for row, (ax, nutrient) in enumerate(zip(axes, nutrients)):
    sns.barplot(
        data=average_nutrients,
        x='Beverage_category',
        y=nutrient,
        order=sorted_categories,
        palette='coolwarm',
        ax=ax,
    )
    ax.set_title(f'Average {nutrient} per Beverage Category', fontsize=12)
    ax.set_ylabel(nutrient)

    # Category names are shown only beneath the bottom chart
    if row != last_row:
        ax.set_xticklabels([])
        ax.set_xlabel('')
    else:
        ax.set_xticklabels(wrapped_labels, rotation=45, ha='right')
        ax.set_xlabel('Beverage Category')

    # Annotate every bar with its value, one decimal place
    for bars in ax.containers:
        ax.bar_label(bars, fmt='%.1f', label_type='edge', fontsize=8, padding=3)

# Let matplotlib resolve the spacing between the stacked charts
plt.tight_layout()
plt.show()


## Pair Plot of Nutritional Metrics:

**Calories, Total Fat, Sugars, & Protein**

In [None]:
# Nutritional columns to compare, plus the category column used for colouring
nutri_col = ['Calories', 'Total Fat (g)', 'Sugars (g)', 'Protein (g)', 'Beverage_category']
# Independent copy of the data restricted to those columns
correl_df = df.loc[:, nutri_col].copy()

# Pairwise plots coloured by beverage category; corner=True is passed so only
# one triangle of the grid is drawn
sns.pairplot(correl_df, hue='Beverage_category', corner=True, height=2)
plt.show()

## Dietary Fiber Content of Each Beverage Category

In [None]:
# Distribution of dietary fibre within each beverage category.
# Draw the boxes first and rotate the tick labels afterwards, so the rotation
# is applied to the category ticks seaborn has actually placed.
sns.boxplot(data=df, x='Beverage_category', y='Dietary Fibre (g)')
plt.xticks(rotation=45, ha='right')
# Render explicitly, like the other plotting cells, rather than leaving the
# Axes object as the cell's displayed value.
plt.show()

## Scatter Plot for Total Carbohydrates vs. Sugars

In [None]:
# Sugars against total carbohydrates, one colour per beverage category
scatterplot = sns.scatterplot(
    x='Total Carbohydrates (g)',
    y='Sugars (g)',
    hue='Beverage_category',
    data=df,
)
# Place the legend outside the plot area, anchored to its right edge
scatterplot.legend(title="Beverage Category", loc='center left', bbox_to_anchor=(1, 0.5))

plt.show()

## Nutritional Contribution (% DV and mg) Across Beverage Types

In [None]:
# Columns with % DV values stored as strings containing a '%' sign
vitamin_cols = ['Vitamin A (% DV)', 'Vitamin C (% DV)', 'Calcium (% DV)', 'Iron (% DV)']

# Remove '%' and convert to numbers.
# astype(str) comes first so the cell can be re-run: once a column is numeric,
# the .str accessor on it would raise AttributeError.
for col in vitamin_cols:
    as_text = df[col].astype(str).str.replace('%', '', regex=False)
    df[col] = pd.to_numeric(as_text, errors='coerce')  # unparseable text -> NaN

In [None]:
# Extend the % DV columns with the two mg-based measures.
# Only names not already present are appended, so re-running this cell does not
# pile up duplicate columns in the summary table.
vitamin_cols = vitamin_cols + [
    extra for extra in ('Sodium (mg)', 'Caffeine (mg)') if extra not in vitamin_cols
]

# Group by Beverage Category and calculate the mean, rounded to one decimal
avg_vitamins = df.groupby('Beverage_category')[vitamin_cols].mean().round(1)

# Display as table
print("✅ Average values (% Daily Value or mg) by Beverage Category:\n")

print(avg_vitamins)

# 5. Full List of Beverages

In [None]:
from IPython.display import display, Markdown

# Sort rows by Beverage Category. kind='stable' keeps rows of the same category
# in their existing relative order; the default quicksort gives no such
# guarantee, so the listing order inside a category could otherwise change.
df_sorted = df.sort_values('Beverage_category', kind='stable')

# Print one bold heading per category followed by a bullet per beverage
for category, group in df_sorted.groupby('Beverage_category'):
    display(Markdown(f"**Beverage Category:** **{category}**"))        # bold heading via Markdown

    # Walk the three needed columns in lockstep instead of building a Series
    # for every row with iterrows()
    for beverage, prep, caffeine in zip(
        group['Beverage'], group['Beverage_prep'], group['Caffeine (mg)']
    ):
        print(f"• {beverage} — {prep} (Caffeine:{caffeine} mg)")


# Credits

**This project about Starbucks Beverage Components is forked from [original notebook](https://www.kaggle.com/code/ahmedredaahmedali/starbucks-visualisation) provided by [@ahmedredaahmedali](https://www.kaggle.com/ahmedredaahmedali)**

**If you find this notebook helpful, feel free to fork it OR give it an upvote!**