# Question 1
Create a comprehensive revenue trend analysis showing yearly revenue growth from 2015 to 2025. Include percentage growth rates, trend lines, and highlight key growth periods with annotations.
# Question 2
Analyze seasonal patterns in sales data. Create monthly sales heatmaps and identify peak selling months. Compare seasonal trends across different years and categories.
# Question 3
Build a customer segmentation analysis using RFM (Recency, Frequency, Monetary) methodology. Create scatter plots and segment customers into meaningful groups with actionable insights.
# Question 4
Visualize the evolution of payment methods from 2015 to 2025. Show the rise of UPI, decline of COD, and create stacked area charts to demonstrate market share changes over time.
# Question 5
Perform category-wise performance analysis. Create treemaps, bar charts, and pie charts showing revenue contribution, growth rates, and market share for each product category.
# Question 6
Analyze Prime membership impact on customer behavior. Compare average order values, order frequency, and category preferences between Prime and non-Prime customers using multiple visualization types.
# Question 7
Create geographic analysis of sales performance across Indian cities and states. Build choropleth maps and bar charts showing revenue density and growth patterns by tier (Metro/Tier1/Tier2/Rural).
# Question 8
Study festival sales impact using before/during/after analysis. Visualize revenue spikes during Diwali, Prime Day, and other festivals with detailed time series analysis.
# Question 9
Analyze customer age group behavior and preferences. Create demographic analysis with category preferences, spending patterns, and shopping frequency across different age segments.
# Question 10
Build price vs demand analysis using scatter plots and correlation matrices. Analyze how pricing strategies affect sales volumes across different categories and customer segments.
# Question 11
Create delivery performance analysis showing delivery days distribution, on-time performance, and customer satisfaction correlation with delivery speed across different cities and customer tiers.
# Question 12
Analyze return patterns and customer satisfaction using return rates, reasons, and correlation with product ratings, prices, and categories through multiple visualization techniques.
# Question 13
Study brand performance and market share evolution. Create brand comparison charts, market share trends, and competitive positioning analysis across different categories.
# Question 14
Build customer lifetime value (CLV) analysis using cohort analysis, retention curves, and CLV distribution across different customer segments and acquisition years.
# Question 15
Analyze discount and promotional effectiveness. Create discount impact analysis showing correlation between discount percentages, sales volumes, and revenue across categories and time periods.
# Question 16
Study product rating patterns and their impact on sales. Analyze rating distributions, correlation with sales performance, and identify patterns across categories and price ranges.
# Question 17
Create customer journey analysis showing purchase frequency patterns, category transitions, and customer evolution from first purchase to loyal customers using flow diagrams and transition matrices.
# Question 18
Analyze inventory and product lifecycle patterns. Study product launch success, decline phases, and category evolution over the decade with detailed trend analysis.
# Question 19
Build competitive pricing analysis comparing brand positioning, price ranges, and market penetration strategies across different product categories using box plots and competitive matrices.
# Question 20
Create a comprehensive business health dashboard combining key metrics like revenue growth, customer acquisition, retention rates, and operational efficiency using multi-panel visualizations with executive summary insights.


In [None]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import squarify
from sklearn.cluster import KMeans
from datetime import datetime

# ----------------------
# Helper Function: Load Local CSVs
# ----------------------
# def load_amazon_data(data_dir):
#     import os, glob, pandas as pd

#     pattern = os.path.join(data_dir, "amazon_india_20*.csv")
#     all_files = sorted(glob.glob(pattern))
#     df_list = []

#     for f in all_files:
#         try:
#             df = pd.read_csv(f)
#             df_list.append(df)
#         except Exception as e:
#             print(f"Error reading {f}: {e}")

#     if not df_list:
#         raise FileNotFoundError(f"No amazon_india_20xx.csv files found in {data_dir}")

#     data = pd.concat(df_list, ignore_index=True)
#     data.columns = [c.strip().lower() for c in data.columns]

#     # Ensure order_date is datetime
#     data["order_date"] = pd.to_datetime(data["order_date"], errors="coerce")

#     # Revenue column
#     if "final_amount_inr" in data.columns:
#         data.rename(columns={"final_amount_inr": "revenue"}, inplace=True)
#     else:
#         raise KeyError("'final_amount_inr' column missing. Available columns: "
#                        f"{list(data.columns)}")

#     # Ensure order_id column exists
#     if "transaction_id" in data.columns:
#         data.rename(columns={"transaction_id": "order_id"}, inplace=True)
#     else:
#         raise KeyError("'transaction_id' column missing. Cannot derive order_id.")

#     return data

# Lower-cased text tokens that count as "is a Prime member".
_PRIME_TRUE_TOKENS = frozenset({"true", "t", "yes", "y", "1", "1.0"})


def _prime_flag_to_bool(flags):
    """Convert a raw Prime-membership column into a plain boolean Series.

    A bare ``astype(bool)`` treats every non-empty string (including "No" and
    "False") and every NaN as True, so the conversion is done per dtype:

    * boolean columns are kept, with missing values treated as False;
    * numeric columns are True where the value is non-zero, missing -> False;
    * anything else is compared, case-insensitively and stripped of
      whitespace, against ``_PRIME_TRUE_TOKENS``.
    """
    if pd.api.types.is_bool_dtype(flags):
        return flags.fillna(False).astype(bool)
    if pd.api.types.is_numeric_dtype(flags):
        return flags.fillna(0).astype(bool)
    tokens = flags.astype(str).str.strip().str.lower()
    return tokens.isin(_PRIME_TRUE_TOKENS)


def load_amazon_data(data_dir):
    """Load and combine every ``amazon_india_20*.csv`` file found in *data_dir*.

    The combined frame is normalised as follows:

    * column names are stripped and lower-cased;
    * ``order_date`` is parsed to datetime (unparseable values become NaT);
    * ``final_amount_inr`` is renamed to ``revenue``;
    * ``transaction_id`` is renamed to ``order_id``;
    * ``is_prime_member`` is renamed to ``prime_member`` and converted to a
      real boolean; when the column is absent ``prime_member`` is all False.

    Args:
        data_dir: Directory containing the yearly CSV files.

    Returns:
        A single DataFrame with the rows of every readable file.

    Raises:
        FileNotFoundError: If no matching file could be read.
        KeyError: If ``final_amount_inr`` or ``transaction_id`` is missing.
    """
    pattern = os.path.join(data_dir, "amazon_india_20*.csv")
    df_list = []

    for f in sorted(glob.glob(pattern)):
        # Deliberate best-effort: one unreadable file is reported and skipped
        # instead of aborting the whole load.
        try:
            df_list.append(pd.read_csv(f))
        except Exception as e:
            print(f"Error reading {f}: {e}")

    if not df_list:
        raise FileNotFoundError(f"No amazon_india_20xx.csv files found in {data_dir}")

    data = pd.concat(df_list, ignore_index=True)
    data.columns = [c.strip().lower() for c in data.columns]

    # Convert order_date
    data["order_date"] = pd.to_datetime(data["order_date"], errors="coerce")

    # Revenue column
    if "final_amount_inr" not in data.columns:
        raise KeyError("'final_amount_inr' column missing. Available columns: "
                       f"{list(data.columns)}")
    data.rename(columns={"final_amount_inr": "revenue"}, inplace=True)

    # Ensure order_id column exists
    if "transaction_id" not in data.columns:
        raise KeyError("'transaction_id' column missing. Cannot derive order_id.")
    data.rename(columns={"transaction_id": "order_id"}, inplace=True)

    # Normalize prime membership column
    if "is_prime_member" in data.columns:
        data.rename(columns={"is_prime_member": "prime_member"}, inplace=True)
        data["prime_member"] = _prime_flag_to_bool(data["prime_member"])
    else:
        # If column missing, create default (all False)
        data["prime_member"] = False

    return data


# ----------------------
# Question 1: Revenue Trend Analysis
# ----------------------
def revenue_trend_analysis(df):
    """Plot total revenue per year and annotate year-over-year growth.

    Adds a ``year`` column to *df* in place, sums ``revenue`` per year and
    computes the percentage change against the previous year. The line chart
    is written to ``outputs/yearly_revenue_trend.png`` and the underlying
    table to ``outputs/yearly_revenue_trend.csv``.

    Args:
        df: Order data with a datetime ``order_date`` column and a numeric
            ``revenue`` column.
    """
    os.makedirs("outputs", exist_ok=True)

    df['year'] = df['order_date'].dt.year
    yearly_revenue = df.groupby('year', as_index=False)['revenue'].sum()
    yearly_revenue['growth_%'] = yearly_revenue['revenue'].pct_change().mul(100)

    plt.figure(figsize=(10, 6))
    sns.lineplot(data=yearly_revenue, x='year', y='revenue', marker='o')

    # The first year has no predecessor, so its growth is NaN and gets no label.
    labelled = yearly_revenue[yearly_revenue['growth_%'].notna()]
    for yr, total, growth in zip(labelled['year'], labelled['revenue'], labelled['growth_%']):
        plt.text(yr, total, f"{growth:.1f}%", ha='center', va='bottom')

    plt.title("Yearly Revenue Trend (2015-2025)")
    plt.ylabel("Revenue")
    plt.xlabel("Year")
    plt.tight_layout()
    plt.savefig("outputs/yearly_revenue_trend.png")
    plt.close()

    yearly_revenue.to_csv("outputs/yearly_revenue_trend.csv", index=False)

# ----------------------
# Question 2: Seasonal Pattern Analysis
# ----------------------
def seasonal_pattern_analysis(df):
    """Draw a month-by-year heatmap of total revenue.

    Adds ``year`` and ``month`` columns to *df* in place, sums ``revenue`` per
    (year, month) and saves the heatmap (months as rows, years as columns) to
    ``outputs/monthly_sales_heatmap.png``.

    Args:
        df: Order data with a datetime ``order_date`` column and a numeric
            ``revenue`` column.
    """
    df['year'] = df['order_date'].dt.year
    df['month'] = df['order_date'].dt.month
    monthly_sales = df.groupby(['year', 'month'])['revenue'].sum().reset_index()
    pivot_table = monthly_sales.pivot(index='month', columns='year', values='revenue')
    plt.figure(figsize=(12, 6))
    sns.heatmap(pivot_table, cmap='YlGnBu')
    plt.title("Monthly Sales Heatmap")
    # Create the output folder here so the function also works when it is
    # called on its own, not only after revenue_trend_analysis().
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/monthly_sales_heatmap.png")
    plt.close()

# ----------------------
# Question 3: Customer Segmentation (RFM)
# ----------------------
def rfm_customer_segmentation(df):
    """Cluster customers on Recency/Frequency/Monetary and save a 3D scatter.

    Rows missing ``customer_id``, ``order_id``, ``order_date`` or ``revenue``
    are ignored. Per customer the function computes:

    * recency   - days between the customer's last order and the day after
      the latest order in the data;
    * frequency - number of distinct ``order_id`` values;
    * monetary  - summed ``revenue``.

    Customers whose monetary or frequency value is not positive are dropped.
    The remaining values are standardised and clustered with KMeans (up to 4
    clusters), and an interactive plot is written to
    ``outputs/rfm_customer_segmentation.html``. If no customer is left to
    cluster a warning is printed and nothing is written.

    Args:
        df: Order data; it is not modified.
    """
    # Drop rows with missing critical values
    df = df.dropna(subset=["customer_id", "order_id", "order_date", "revenue"])
    if df.empty:
        print("Warning: no complete order rows available. Skipping RFM segmentation.")
        return

    snapshot_date = df['order_date'].max() + pd.Timedelta(days=1)

    rfm = df.groupby('customer_id').agg({
        'order_date': lambda x: (snapshot_date - x.max()).days,
        'order_id': 'nunique',
        'revenue': 'sum'
    }).reset_index()

    rfm.rename(columns={
        'order_date': 'recency',
        'order_id': 'frequency',
        'revenue': 'monetary'
    }, inplace=True)

    # Remove rows with missing or zero values
    rfm = rfm.fillna(0)
    rfm = rfm[(rfm['monetary'] > 0) & (rfm['frequency'] > 0)]
    if rfm.empty:
        print("Warning: no customers with positive spend. Skipping RFM segmentation.")
        return

    # Normalize data before clustering
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    rfm_scaled = scaler.fit_transform(rfm[['recency', 'frequency', 'monetary']])

    # KMeans cannot fit more clusters than there are samples.
    n_clusters = min(4, len(rfm))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

    # Visualization
    fig = px.scatter_3d(
        rfm, x='recency', y='frequency', z='monetary',
        color='Cluster', title="RFM Customer Segmentation"
    )
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    fig.write_html("outputs/rfm_customer_segmentation.html")


# ----------------------
# Question 4: Payment Method Evolution
# ----------------------
def payment_method_evolution(df):
    """Plot a stacked area chart of order counts per payment method and year.

    Adds a ``year`` column to *df* in place, counts ``order_id`` per
    (year, payment_method) and saves the chart to
    ``outputs/payment_method_evolution.png``.

    Args:
        df: Order data with ``order_date``, ``payment_method`` and
            ``order_id`` columns.
    """
    df['year'] = df['order_date'].dt.year
    payment_trend = df.groupby(['year', 'payment_method'])['order_id'].count().reset_index()
    # Methods with no orders in a given year would be NaN after the pivot;
    # they are plotted as zero.
    pivot = payment_trend.pivot(index='year', columns='payment_method', values='order_id').fillna(0)
    pivot.plot.area(figsize=(12,6))
    # NOTE(review): the title says "Market Share" but the values plotted are
    # raw order counts, not percentages - confirm which one is intended.
    plt.title("Payment Method Market Share (2015-2025)")
    plt.ylabel("Number of Orders")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/payment_method_evolution.png")
    plt.close()

# ----------------------
# Question 5: Category-wise Performance
# ----------------------
def category_performance(df):
    """Save a treemap and a horizontal bar chart of revenue per category.

    Sums ``revenue`` per ``category`` and writes
    ``outputs/category_treemap.png`` and ``outputs/category_bar_chart.png``.

    Args:
        df: Order data with ``category`` and ``revenue`` columns.
    """
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)

    cat_rev = df.groupby('category')['revenue'].sum().reset_index()

    # Treemap: rectangle area is proportional to category revenue.
    plt.figure(figsize=(12, 6))
    squarify.plot(sizes=cat_rev['revenue'], label=cat_rev['category'], alpha=0.8)
    plt.axis('off')
    plt.title("Category Revenue Contribution Treemap")
    plt.savefig("outputs/category_treemap.png")
    plt.close()

    # Bar chart: sorted ascending so the largest category ends up on top.
    cat_rev.sort_values('revenue').plot(kind='barh', x='category', y='revenue', figsize=(10,6))
    plt.title("Category Revenue Bar Chart")
    plt.savefig("outputs/category_bar_chart.png")
    plt.close()

# ----------------------
# Question 6: Prime Membership Impact
# ----------------------
def prime_impact(df):
    """Compare Prime and non-Prime customers on revenue and order count.

    Groups by ``prime_member`` and computes the mean ``revenue`` per row and
    the number of distinct ``order_id`` values. The table is written to
    ``outputs/prime_impact.csv`` and a bar chart of the mean revenue to
    ``outputs/prime_vs_nonprime_aov.png``.

    Args:
        df: Order data with ``prime_member``, ``revenue`` and ``order_id``
            columns.
    """
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    prime_stats = df.groupby('prime_member').agg({'revenue':'mean','order_id':'nunique'}).reset_index()
    prime_stats.to_csv("outputs/prime_impact.csv", index=False)
    sns.barplot(x='prime_member', y='revenue', data=prime_stats)
    plt.title("Average Order Value: Prime vs Non-Prime")
    plt.savefig("outputs/prime_vs_nonprime_aov.png")
    plt.close()

# ----------------------
# Question 7: Geographic Analysis
# ----------------------
def geographic_analysis(df):
    """Save a bar chart of the 20 cities with the highest total revenue.

    Sums ``revenue`` per ``customer_city`` and writes the chart to
    ``outputs/top_cities_revenue.png``.

    Args:
        df: Order data with ``customer_city`` and ``revenue`` columns.
    """
    city_rev = df.groupby('customer_city')['revenue'].sum().reset_index()
    top_cities = city_rev.sort_values('revenue', ascending=False).head(20)
    plt.figure(figsize=(12,6))
    sns.barplot(y='customer_city', x='revenue', data=top_cities)
    plt.title("Top 20 Cities by Revenue")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/top_cities_revenue.png")
    plt.close()

# ----------------------
# Question 8: Festival Impact
# ----------------------
def festival_impact(df, festival_dates):
    """Plot daily revenue inside each festival window.

    For every festival, the orders whose ``order_date`` lies between the
    window's ``start`` and ``end`` (both inclusive) are summed per calendar
    day and the line chart is saved to ``outputs/<festival>_trend.png``.
    A festival without any order in its window is reported and skipped.

    Args:
        df: Order data with a datetime ``order_date`` column and a numeric
            ``revenue`` column.
        festival_dates: Mapping of festival name to a dict with ``start`` and
            ``end`` keys, e.g.
            ``{"Diwali": {"start": "2025-10-15", "end": "2025-11-15"}}``.
    """
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    for fest, dates in festival_dates.items():
        mask = (df['order_date'] >= dates['start']) & (df['order_date'] <= dates['end'])
        fest_df = df.loc[mask]
        if fest_df.empty:
            # Nothing to plot for this window; move on to the next festival.
            print(f"Warning: no orders found for {fest} between {dates['start']} and {dates['end']}. Skipping.")
            continue
        # Group on the filtered frame's own dates instead of recomputing the
        # dates of the whole data set for every festival.
        daily = fest_df.groupby(fest_df['order_date'].dt.date)['revenue'].sum()
        daily.plot(figsize=(10,4), marker='o')
        plt.title(f"Revenue Trend During {fest}")
        plt.savefig(f"outputs/{fest}_trend.png")
        plt.close()

# ----------------------
# Question 9: Age Group Behavior
# ----------------------
def age_group_behavior(df):
    """Save a bar chart of total revenue per customer age group.

    Sums ``revenue`` per ``customer_age_group`` and writes the chart to
    ``outputs/age_group_revenue.png``.

    Args:
        df: Order data with ``customer_age_group`` and ``revenue`` columns.
    """
    age_rev = df.groupby('customer_age_group')['revenue'].sum().reset_index()
    sns.barplot(x='customer_age_group', y='revenue', data=age_rev)
    plt.title("Revenue by Age Group")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/age_group_revenue.png")
    plt.close()

# ----------------------
# Question 10: Price vs Demand
# ----------------------
def price_vs_demand(df):
    """Save a scatter plot of ``subtotal_inr`` against ``quantity``.

    The chart is written to ``outputs/price_vs_demand.png``.

    Args:
        df: Order data with ``subtotal_inr`` and ``quantity`` columns.
    """
    plt.figure(figsize=(8,6))
    # NOTE(review): the x axis is labelled "Price" but plots subtotal_inr;
    # confirm whether a per-unit price column was intended.
    plt.scatter(df['subtotal_inr'], df['quantity'], alpha=0.5)
    plt.xlabel("Price")
    plt.ylabel("Quantity")
    plt.title("Price vs Demand")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/price_vs_demand.png")
    plt.close()

# ----------------------
# Question 11: Delivery Performance
# ----------------------
def delivery_performance(df):
    """Save a histogram (20 bins, with KDE curve) of ``delivery_days``.

    The chart is written to ``outputs/delivery_days_distribution.png``.

    Args:
        df: Order data with a ``delivery_days`` column.
    """
    sns.histplot(df['delivery_days'], bins=20, kde=True)
    plt.title("Delivery Days Distribution")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/delivery_days_distribution.png")
    plt.close()

# ----------------------
# Question 12: Return Patterns
# ----------------------
# def return_patterns(df):
#     return_rate = df['return_status'].mean() * 100
#     print(f"Overall Return Rate: {return_rate:.2f}%")
#     sns.barplot(x='category', y='return_status', data=df)
#     plt.xticks(rotation=90)
#     plt.title("Return Rate by Category")
#     plt.savefig("outputs/return_rate_by_category.png")
#     plt.close()

def return_patterns(df):
    """Print the overall return rate and chart the return rate per category.

    ``return_status`` values "Yes"/"No" are mapped to 1/0 and stored in a new
    ``return_status_numeric`` column on *df* (any other value becomes NaN).
    The overall rate is printed; when a ``category`` column exists a bar
    chart is saved to ``outputs/return_rate_by_category.png``.

    The analysis is skipped with a warning when ``return_status`` is missing
    or contains no recognizable value.

    Args:
        df: Order data; modified in place as described above.
    """
    if 'return_status' not in df.columns:
        print("Warning: 'return_status' column not found. Skipping return analysis.")
        return

    # Convert to numeric: Yes -> 1, No -> 0
    df['return_status_numeric'] = df['return_status'].map({'Yes':1, 'No':0})

    if df['return_status_numeric'].isna().all():
        print("Warning: 'return_status' column has no recognizable values.")
        return

    # mean() skips NaN, so unrecognized values do not count towards the rate.
    return_rate = df['return_status_numeric'].mean() * 100
    print(f"Overall Return Rate: {return_rate:.2f}%")

    if 'category' in df.columns:
        sns.barplot(x='category', y='return_status_numeric', data=df)
        plt.xticks(rotation=90)
        plt.title("Return Rate by Category")
        # Create the output folder here so the function works when called alone.
        os.makedirs("outputs", exist_ok=True)
        plt.savefig("outputs/return_rate_by_category.png")
        plt.close()


# ----------------------
# Question 13: Brand Performance
# ----------------------
def brand_performance(df):
    """Save a bar chart of the 15 brands with the highest total revenue.

    Sums ``revenue`` per ``brand`` and writes the chart to
    ``outputs/top_brands.png``.

    Args:
        df: Order data with ``brand`` and ``revenue`` columns.
    """
    brand_share = df.groupby('brand')['revenue'].sum().reset_index()
    brand_share.sort_values('revenue', ascending=False).head(15).plot(kind='bar', x='brand', y='revenue')
    plt.title("Top 15 Brands by Revenue")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/top_brands.png")
    plt.close()

# ----------------------
# Question 14: CLV Analysis
# ----------------------
def clv_analysis(df):
    """Write a per-customer monthly revenue table with cohort labels.

    Sums ``revenue`` per customer and calendar month of ``order_date``. Each
    row also gets a ``cohort`` column holding that customer's earliest order
    month. The table is saved to ``outputs/clv_cohort.csv``.

    Args:
        df: Order data with ``customer_id``, a datetime ``order_date`` and a
            numeric ``revenue`` column; it is not modified.
    """
    cohort = df.groupby(['customer_id', df['order_date'].dt.to_period('M')])['revenue'].sum().reset_index()
    # The first month a customer appears in defines their cohort.
    cohort['cohort'] = cohort.groupby('customer_id')['order_date'].transform('min')
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    cohort.to_csv("outputs/clv_cohort.csv", index=False)

# ----------------------
# Question 15: Discount Effectiveness
# ----------------------
def discount_effectiveness(df):
    """Save a scatter plot of ``discount_percent`` against ``revenue``.

    The chart is written to ``outputs/discount_vs_revenue.png``.

    Args:
        df: Order data with ``discount_percent`` and ``revenue`` columns.
    """
    sns.scatterplot(x='discount_percent', y='revenue', data=df)
    plt.title("Discount % vs Revenue")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/discount_vs_revenue.png")
    plt.close()

# ----------------------
# Question 16: Product Ratings Impact
# ----------------------
def rating_impact(df):
    """Save a box plot of ``revenue`` for each ``customer_rating`` value.

    The chart is written to ``outputs/rating_vs_revenue.png``.

    Args:
        df: Order data with ``customer_rating`` and ``revenue`` columns.
    """
    sns.boxplot(x='customer_rating', y='revenue', data=df)
    plt.title("Revenue by Product Rating")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/rating_vs_revenue.png")
    plt.close()

# ----------------------
# Question 17: Customer Journey
# ----------------------
def customer_journey(df):
    """Write a customer-by-category matrix of order counts.

    Counts ``order_id`` per (customer_id, category); combinations that never
    occur are filled with 0. The matrix (customers as rows, categories as
    columns) is saved to ``outputs/customer_journey_matrix.csv``.

    Args:
        df: Order data with ``customer_id``, ``category`` and ``order_id``
            columns; it is not modified.
    """
    journey = df.groupby(['customer_id', 'category'])['order_id'].count().unstack(fill_value=0)
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    journey.to_csv("outputs/customer_journey_matrix.csv")

# ----------------------
# Question 18: Inventory Lifecycle
# ----------------------
def inventory_lifecycle(df):
    """Write the first/last order date and active span of every product.

    For each ``product_id`` the earliest (``min``) and latest (``max``)
    ``order_date`` are taken, and ``lifecycle_days`` is the number of days
    between them. The table is saved to ``outputs/product_lifecycle.csv``.

    Args:
        df: Order data with ``product_id`` and a datetime ``order_date``
            column; it is not modified.
    """
    prod_life = df.groupby(['product_id'])['order_date'].agg(['min','max']).reset_index()
    prod_life['lifecycle_days'] = (prod_life['max'] - prod_life['min']).dt.days
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    prod_life.to_csv("outputs/product_lifecycle.csv", index=False)

# ----------------------
# Question 19: Competitive Pricing
# ----------------------
def competitive_pricing(df):
    """Save a box plot of ``subtotal_inr`` for each brand.

    The chart is written to ``outputs/brand_price_comparison.png``.

    Args:
        df: Order data with ``brand`` and ``subtotal_inr`` columns.
    """
    sns.boxplot(x='brand', y='subtotal_inr', data=df)
    # Vertical labels keep the brand names on the x axis from overlapping.
    plt.xticks(rotation=90)
    plt.title("Brand Price Comparison")
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    plt.savefig("outputs/brand_price_comparison.png")
    plt.close()

# ----------------------
# Question 20: Business Health Dashboard
# ----------------------
def business_health_dashboard(df):
    """Write a one-row summary of headline business metrics.

    The summary holds the total ``revenue``, the number of distinct
    ``customer_id`` values and the number of distinct ``order_id`` values,
    and is saved to ``outputs/business_health_summary.csv``.

    Args:
        df: Order data with ``revenue``, ``customer_id`` and ``order_id``
            columns; it is not modified.
    """
    metrics = {
        'total_revenue': df['revenue'].sum(),
        'unique_customers': df['customer_id'].nunique(),
        'total_orders': df['order_id'].nunique()
    }
    # Create the output folder here so the function works when called alone.
    os.makedirs("outputs", exist_ok=True)
    pd.DataFrame([metrics]).to_csv("outputs/business_health_summary.csv", index=False)

# ----------------------
# Main Execution
# ----------------------
if __name__ == "__main__":
    # Folder holding the yearly amazon_india_20xx.csv files.
    data_dir = "../dataset"  # update as needed
    data = load_amazon_data(data_dir)

    # Questions 1-7: every analysis takes only the combined data frame.
    for analysis in (
        revenue_trend_analysis,
        seasonal_pattern_analysis,
        rfm_customer_segmentation,
        payment_method_evolution,
        category_performance,
        prime_impact,
        geographic_analysis,
    ):
        analysis(data)

    # Question 8 additionally needs the festival date windows.
    festival_impact(data, {"Diwali": {"start": "2025-10-15", "end": "2025-11-15"}})

    # Questions 9-20, in the same order as the definitions above.
    for analysis in (
        age_group_behavior,
        price_vs_demand,
        delivery_performance,
        return_patterns,
        brand_performance,
        clv_analysis,
        discount_effectiveness,
        rating_impact,
        customer_journey,
        inventory_lifecycle,
        competitive_pricing,
        business_health_dashboard,
    ):
        analysis(data)

In [14]:
data.columns

Index(['order_id', 'order_date', 'customer_id', 'product_id', 'product_name',
       'category', 'subcategory', 'brand', 'original_price_inr',
       'discount_percent', 'discounted_price_inr', 'quantity', 'subtotal_inr',
       'delivery_charges', 'revenue', 'customer_city', 'customer_state',
       'customer_tier', 'customer_spending_tier', 'customer_age_group',
       'payment_method', 'delivery_days', 'delivery_type', 'prime_member',
       'is_festival_sale', 'festival_name', 'customer_rating', 'return_status',
       'order_month', 'order_year', 'order_quarter', 'product_weight_kg',
       'is_prime_eligible', 'product_rating', 'year', 'month'],
      dtype='object')