# Exploratory Data Analysis (EDA)

This notebook focuses on exploring the cleaned financial data to identify patterns, trends, and relationships between variables.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Set plot styling
plt.style.use('seaborn-whitegrid')
sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Add the src directory to the path
sys.path.append(os.path.abspath('../src'))

# Import custom modules
from data_loader import load_raw_data
from eda_utils import generate_summary_statistics, plot_distribution, plot_correlation_matrix, plot_boxplot, plot_time_series

# Create results directory if it doesn't exist
os.makedirs('../results/plots', exist_ok=True)

## 1. Load the Cleaned Data

In [None]:
# Load the cleaned data
cleaned_data_path = '../data/processed/cleaned_data.csv'
df = pd.read_csv(cleaned_data_path)

# Display basic information
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nData Types:")
print(df.dtypes)

# Display first few rows
df.head()

## 2. Summary Statistics

Let's compute summary statistics for the key financial metrics: Revenue, Cost, and Profit.

In [None]:
# Calculate summary statistics for financial metrics
financial_cols = ['Revenue', 'Cost', 'Profit']
financial_summary = df[financial_cols].describe()

# Display the summary statistics
print("Summary Statistics for Financial Metrics:")
financial_summary

In [None]:
# Calculate additional statistics
for col in financial_cols:
    print(f"\n{col} Statistics:")
    print(f"Median: ${df[col].median():,.2f}")
    print(f"Mean: ${df[col].mean():,.2f}")
    print(f"Standard Deviation: ${df[col].std():,.2f}")
    print(f"Minimum: ${df[col].min():,.2f}")
    print(f"Maximum: ${df[col].max():,.2f}")
    print(f"Range: ${df[col].max() - df[col].min():,.2f}")
    print(f"Interquartile Range (IQR): ${df[col].quantile(0.75) - df[col].quantile(0.25):,.2f}")
    print(f"Skewness: {df[col].skew():,.2f}")
    print(f"Kurtosis: {df[col].kurtosis():,.2f}")

### Calculate Return on Assets (ROA)

ROA is typically calculated as Net Income / Total Assets. Since we don't have Total Assets in our dataset, we'll create a proxy ROA using Profit as Net Income and a combination of other available metrics.

In [None]:
# Calculate a proxy for ROA (Return on Assets)
# Since we don't have Total Assets, we'll use Cost as a proxy for assets employed
df['ROA'] = df['Profit'] / df['Cost']

# Handle potential division by zero or infinity
df['ROA'] = df['ROA'].replace([np.inf, -np.inf], np.nan).fillna(0)

# Display ROA summary statistics
print("ROA (Return on Assets) Summary Statistics:")
df['ROA'].describe()

## 3. Distribution Analysis

Let's visualize the distributions of our key financial metrics: Revenue, Cost, Profit, and ROA.

In [None]:
# Plot distributions for financial metrics
for col in ['Revenue', 'Cost', 'Profit', 'ROA']:
    plt.figure(figsize=(12, 6))
    
    # Create subplot with 1 row and 2 columns
    plt.subplot(1, 2, 1)
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.axvline(df[col].mean(), color='red', linestyle='--', label=f'Mean: {df[col].mean():.2f}')
    plt.axvline(df[col].median(), color='green', linestyle='--', label=f'Median: {df[col].median():.2f}')
    plt.legend()
    
    # Add boxplot
    plt.subplot(1, 2, 2)
    sns.boxplot(y=df[col])
    plt.title(f'Boxplot of {col}')
    
    plt.tight_layout()
    plt.savefig(f'../results/plots/{col}_distribution.png')
    plt.show()

In [None]:
# Check for outliers using IQR method
def identify_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Identify outliers for each financial metric
for col in ['Revenue', 'Cost', 'Profit', 'ROA']:
    outliers, lower_bound, upper_bound = identify_outliers(df, col)
    print(f"\nOutliers in {col}:")
    print(f"Number of outliers: {len(outliers)}")
    print(f"Percentage of outliers: {len(outliers) / len(df) * 100:.2f}%")
    print(f"Lower bound: {lower_bound:.2f}")
    print(f"Upper bound: {upper_bound:.2f}")
    
    if len(outliers) > 0:
        print("\nSample of outliers:")
        print(outliers.head(5))

## 4. Sector/Region Trends

Let's analyze trends across different segments and countries.

In [None]:
# Group by Segment and calculate average metrics
segment_analysis = df.groupby('Segment').agg({
    'Revenue': 'mean',
    'Cost': 'mean',
    'Profit': 'mean',
    'ROA': 'mean',
    'Segment': 'count'
}).rename(columns={'Segment': 'Count'}).sort_values('Profit', ascending=False)

print("Segment Analysis:")
segment_analysis

In [None]:
# Visualize segment performance
plt.figure(figsize=(14, 8))

# Create a bar plot for average profit by segment
plt.subplot(1, 2, 1)
sns.barplot(x=segment_analysis.index, y='Profit', data=segment_analysis)
plt.title('Average Profit by Segment')
plt.xticks(rotation=45)
plt.ylabel('Average Profit ($)')

# Create a bar plot for ROA by segment
plt.subplot(1, 2, 2)
sns.barplot(x=segment_analysis.index, y='ROA', data=segment_analysis)
plt.title('Average ROA by Segment')
plt.xticks(rotation=45)
plt.ylabel('Average ROA')

plt.tight_layout()
plt.savefig('../results/plots/segment_performance.png')
plt.show()

In [None]:
# Group by Country and calculate average metrics
country_analysis = df.groupby('Country').agg({
    'Revenue': 'mean',
    'Cost': 'mean',
    'Profit': 'mean',
    'ROA': 'mean',
    'Country': 'count'
}).rename(columns={'Country': 'Count'}).sort_values('Profit', ascending=False)

print("Country Analysis:")
country_analysis.head(10)  # Show top 10 countries by profit

In [None]:
# Visualize country performance (top 10 countries by profit)
top_countries = country_analysis.head(10)

plt.figure(figsize=(14, 8))

# Create a bar plot for average profit by country
plt.subplot(1, 2, 1)
sns.barplot(x=top_countries.index, y='Profit', data=top_countries)
plt.title('Average Profit by Country (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Average Profit ($)')

# Create a bar plot for ROA by country
plt.subplot(1, 2, 2)
sns.barplot(x=top_countries.index, y='ROA', data=top_countries)
plt.title('Average ROA by Country (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Average ROA')

plt.tight_layout()
plt.savefig('../results/plots/country_performance.png')
plt.show()

In [None]:
# Cross-tabulation of Segment and Country
# Let's see which segments are most common in which countries
segment_country_crosstab = pd.crosstab(df['Segment'], df['Country'])

# Display the cross-tabulation
print("Segment-Country Cross-tabulation:")
segment_country_crosstab.head()

In [None]:
# Visualize the segment-country relationship with a heatmap
plt.figure(figsize=(16, 10))
sns.heatmap(segment_country_crosstab.iloc[:, :10], annot=True, cmap='viridis', fmt='d')
plt.title('Segment-Country Distribution (Top 10 Countries)')
plt.tight_layout()
plt.savefig('../results/plots/segment_country_heatmap.png')
plt.show()

In [None]:
# Group by Product and calculate average metrics
product_analysis = df.groupby('Product').agg({
    'Revenue': 'mean',
    'Cost': 'mean',
    'Profit': 'mean',
    'ROA': 'mean',
    'Product': 'count'
}).rename(columns={'Product': 'Count'}).sort_values('Profit', ascending=False)

print("Product Analysis:")
product_analysis.head(10)  # Show top 10 products by profit

In [None]:
# Visualize product performance (top 10 products by profit)
top_products = product_analysis.head(10)

plt.figure(figsize=(14, 8))

# Create a bar plot for average profit by product
plt.subplot(1, 2, 1)
sns.barplot(x=top_products.index, y='Profit', data=top_products)
plt.title('Average Profit by Product (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Average Profit ($)')

# Create a bar plot for ROA by product
plt.subplot(1, 2, 2)
sns.barplot(x=top_products.index, y='ROA', data=top_products)
plt.title('Average ROA by Product (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Average ROA')

plt.tight_layout()
plt.savefig('../results/plots/product_performance.png')
plt.show()

## 5. Correlation Analysis

Let's analyze the correlations between our numeric variables.

In [None]:
# Select numeric columns for correlation analysis
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
print(f"Numeric columns: {numeric_cols}")

# Calculate correlation matrix
correlation_matrix = df[numeric_cols].corr()

# Display the correlation matrix
print("\nCorrelation Matrix:")
correlation_matrix

In [None]:
# Visualize the correlation matrix with a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Variables')
plt.tight_layout()
plt.savefig('../results/plots/correlation_matrix.png')
plt.show()

In [None]:
# Identify highly correlated pairs
def get_highly_correlated_pairs(corr_matrix, threshold=0.5):
    # Get the upper triangle of the correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Find pairs with correlation greater than threshold
    high_corr_pairs = []
    for col in upper.columns:
        for idx in upper.index:
            if abs(upper.loc[idx, col]) > threshold:
                high_corr_pairs.append((idx, col, upper.loc[idx, col]))
    
    # Sort by absolute correlation value
    high_corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)
    return high_corr_pairs

# Get highly correlated pairs
high_corr_pairs = get_highly_correlated_pairs(correlation_matrix, threshold=0.5)

# Display the highly correlated pairs
print("Highly Correlated Pairs (|correlation| > 0.5):")
for var1, var2, corr in high_corr_pairs:
    print(f"{var1} and {var2}: {corr:.2f}")

In [None]:
# Visualize the relationships between highly correlated pairs
for var1, var2, corr in high_corr_pairs[:5]:  # Plot top 5 correlated pairs
    plt.figure(figsize=(12, 6))
    
    # Create scatter plot
    plt.subplot(1, 2, 1)
    sns.scatterplot(x=df[var1], y=df[var2])
    plt.title(f'Scatter Plot: {var1} vs {var2} (corr={corr:.2f})')
    plt.xlabel(var1)
    plt.ylabel(var2)
    
    # Add regression line
    plt.subplot(1, 2, 2)
    sns.regplot(x=df[var1], y=df[var2])
    plt.title(f'Regression Plot: {var1} vs {var2} (corr={corr:.2f})')
    plt.xlabel(var1)
    plt.ylabel(var2)
    
    plt.tight_layout()
    plt.savefig(f'../results/plots/correlation_{var1}_{var2}.png')
    plt.show()

## 6. Time Series Analysis

Let's analyze how our financial metrics change over time.

In [None]:
# Check if we have a date column
date_columns = [col for col in df.columns if 'date' in col.lower()]
print(f"Date columns found: {date_columns}")

# If we have a date column, convert it to datetime
if date_columns:
    date_col = date_columns[0]
    df[date_col] = pd.to_datetime(df[date_col])
    
    # Group by month and calculate average metrics
    df['Month'] = df[date_col].dt.to_period('M')
    monthly_data = df.groupby('Month').agg({
        'Revenue': 'mean',
        'Cost': 'mean',
        'Profit': 'mean',
        'ROA': 'mean'
    })
    
    # Convert Period index to datetime for plotting
    monthly_data.index = monthly_data.index.to_timestamp()
    
    # Plot time series
    plt.figure(figsize=(14, 10))
    
    # Revenue over time
    plt.subplot(2, 2, 1)
    plt.plot(monthly_data.index, monthly_data['Revenue'])
    plt.title('Average Monthly Revenue')
    plt.xlabel('Date')
    plt.ylabel('Revenue ($)')
    plt.grid(True)
    
    # Cost over time
    plt.subplot(2, 2, 2)
    plt.plot(monthly_data.index, monthly_data['Cost'])
    plt.title('Average Monthly Cost')
    plt.xlabel('Date')
    plt.ylabel('Cost ($)')
    plt.grid(True)
    
    # Profit over time
    plt.subplot(2, 2, 3)
    plt.plot(monthly_data.index, monthly_data['Profit'])
    plt.title('Average Monthly Profit')
    plt.xlabel('Date')
    plt.ylabel('Profit ($)')
    plt.grid(True)
    
    # ROA over time
    plt.subplot(2, 2, 4)
    plt.plot(monthly_data.index, monthly_data['ROA'])
    plt.title('Average Monthly ROA')
    plt.xlabel('Date')
    plt.ylabel('ROA')
    plt.grid(True)
    
    plt.tight_layout()
    plt.savefig('../results/plots/time_series_analysis.png')
    plt.show()
    
    # Seasonal analysis
    df['Year'] = df[date_col].dt.year
    df['Month_Num'] = df[date_col].dt.month
    
    # Group by month number and calculate average metrics
    seasonal_data = df.groupby('Month_Num').agg({
        'Revenue': 'mean',
        'Cost': 'mean',
        'Profit': 'mean',
        'ROA': 'mean'
    })
    
    # Plot seasonal patterns
    plt.figure(figsize=(14, 10))
    
    # Revenue by month
    plt.subplot(2, 2, 1)
    plt.plot(seasonal_data.index, seasonal_data['Revenue'], marker='o')
    plt.title('Average Revenue by Month')
    plt.xlabel('Month')
    plt.ylabel('Revenue ($)')
    plt.xticks(range(1, 13))
    plt.grid(True)
    
    # Cost by month
    plt.subplot(2, 2, 2)
    plt.plot(seasonal_data.index, seasonal_data['Cost'], marker='o')
    plt.title('Average Cost by Month')
    plt.xlabel('Month')
    plt.ylabel('Cost ($)')
    plt.xticks(range(1, 13))
    plt.grid(True)
    
    # Profit by month
    plt.subplot(2, 2, 3)
    plt.plot(seasonal_data.index, seasonal_data['Profit'], marker='o')
    plt.title('Average Profit by Month')
    plt.xlabel('Month')
    plt.ylabel('Profit ($)')
    plt.xticks(range(1, 13))
    plt.grid(True)
    
    # ROA by month
    plt.subplot(2, 2, 4)
    plt.plot(seasonal_data.index, seasonal_data['ROA'], marker='o')
    plt.title('Average ROA by Month')
    plt.xlabel('Month')
    plt.ylabel('ROA')
    plt.xticks(range(1, 13))
    plt.grid(True)
    
    plt.tight_layout()
    plt.savefig('../results/plots/seasonal_analysis.png')
    plt.show()
else:
    print("No date column found for time series analysis.")

## 7. Segment-Product Analysis

Let's analyze the relationship between segments and products.

In [None]:
# Cross-tabulation of Segment and Product
segment_product_crosstab = pd.crosstab(df['Segment'], df['Product'])

# Display the cross-tabulation
print("Segment-Product Cross-tabulation:")
segment_product_crosstab.head()

In [None]:
# Visualize the segment-product relationship with a heatmap
# If there are many products, select the top ones by frequency
top_products = df['Product'].value_counts().head(10).index.tolist()
segment_top_products = segment_product_crosstab[top_products]

plt.figure(figsize=(16, 10))
sns.heatmap(segment_top_products, annot=True, cmap='viridis', fmt='d')
plt.title('Segment-Product Distribution (Top 10 Products)')
plt.tight_layout()
plt.savefig('../results/plots/segment_product_heatmap.png')
plt.show()

In [None]:
# Calculate average profit by segment and product
segment_product_profit = df.pivot_table(
    values='Profit',
    index='Segment',
    columns='Product',
    aggfunc='mean'
)

# Select top products for visualization
segment_product_profit_top = segment_product_profit[top_products]

# Visualize average profit by segment and product
plt.figure(figsize=(16, 10))
sns.heatmap(segment_product_profit_top, annot=True, cmap='RdYlGn', fmt='.2f')
plt.title('Average Profit by Segment and Product')
plt.tight_layout()
plt.savefig('../results/plots/segment_product_profit.png')
plt.show()

## 8. Profit Margin Analysis

Let's calculate and analyze profit margins across different dimensions.

In [None]:
# Calculate profit margin
df['Profit_Margin'] = df['Profit'] / df['Revenue']

# Handle potential division by zero or infinity
df['Profit_Margin'] = df['Profit_Margin'].replace([np.inf, -np.inf], np.nan).fillna(0)

# Display profit margin summary statistics
print("Profit Margin Summary Statistics:")
df['Profit_Margin'].describe()

In [None]:
# Visualize profit margin distribution
plt.figure(figsize=(12, 6))

# Create subplot with 1 row and 2 columns
plt.subplot(1, 2, 1)
sns.histplot(df['Profit_Margin'], kde=True)
plt.title('Distribution of Profit Margin')
plt.axvline(df['Profit_Margin'].mean(), color='red', linestyle='--', label=f'Mean: {df["Profit_Margin"].mean():.2f}')
plt.axvline(df['Profit_Margin'].median(), color='green', linestyle='--', label=f'Median: {df["Profit_Margin"].median():.2f}')
plt.legend()

# Add boxplot
plt.subplot(1, 2, 2)
sns.boxplot(y=df['Profit_Margin'])
plt.title('Boxplot of Profit Margin')

plt.tight_layout()
plt.savefig('../results/plots/profit_margin_distribution.png')
plt.show()

In [None]:
# Analyze profit margin by segment
segment_margin = df.groupby('Segment')['Profit_Margin'].mean().sort_values(ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(x=segment_margin.index, y=segment_margin.values)
plt.title('Average Profit Margin by Segment')
plt.xticks(rotation=45)
plt.ylabel('Profit Margin')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/profit_margin_by_segment.png')
plt.show()

In [None]:
# Analyze profit margin by product (top 10 products)
product_margin = df.groupby('Product')['Profit_Margin'].mean().sort_values(ascending=False).head(10)

plt.figure(figsize=(12, 6))
sns.barplot(x=product_margin.index, y=product_margin.values)
plt.title('Average Profit Margin by Product (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Profit Margin')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/profit_margin_by_product.png')
plt.show()

In [None]:
# Analyze profit margin by country (top 10 countries)
country_margin = df.groupby('Country')['Profit_Margin'].mean().sort_values(ascending=False).head(10)

plt.figure(figsize=(12, 6))
sns.barplot(x=country_margin.index, y=country_margin.values)
plt.title('Average Profit Margin by Country (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Profit Margin')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/profit_margin_by_country.png')
plt.show()

## 9. Revenue-Cost-Profit Relationship

Let's analyze the relationship between Revenue, Cost, and Profit.

In [None]:
# Create a scatter plot of Revenue vs. Cost, colored by Profit
plt.figure(figsize=(12, 8))
scatter = plt.scatter(df['Revenue'], df['Cost'], c=df['Profit'], cmap='RdYlGn', alpha=0.7)
plt.colorbar(scatter, label='Profit ($)')
plt.title('Revenue vs. Cost (colored by Profit)')
plt.xlabel('Revenue ($)')
plt.ylabel('Cost ($)')
plt.grid(True)

# Add a diagonal line representing Revenue = Cost (i.e., Profit = 0)
max_val = max(df['Revenue'].max(), df['Cost'].max())
plt.plot([0, max_val], [0, max_val], 'k--', label='Revenue = Cost (Profit = 0)')
plt.legend()

plt.tight_layout()
plt.savefig('../results/plots/revenue_cost_profit_relationship.png')
plt.show()

In [None]:
# Create a 3D scatter plot of Revenue, Cost, and Profit
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(111, projection='3d')

# Sample data points (if dataset is large)
sample_size = min(1000, len(df))
sample_df = df.sample(sample_size, random_state=42)

# Create scatter plot
scatter = ax.scatter(
    sample_df['Revenue'],
    sample_df['Cost'],
    sample_df['Profit'],
    c=sample_df['Profit_Margin'],
    cmap='RdYlGn',
    alpha=0.7
)

# Add labels and title
ax.set_xlabel('Revenue ($)')
ax.set_ylabel('Cost ($)')
ax.set_zlabel('Profit ($)')
ax.set_title('3D Relationship: Revenue, Cost, and Profit')

# Add color bar
cbar = fig.colorbar(scatter, ax=ax, label='Profit Margin')

plt.tight_layout()
plt.savefig('../results/plots/revenue_cost_profit_3d.png')
plt.show()

## 10. Key Insights and Summary

Let's summarize the key insights from our exploratory data analysis.

### Summary Statistics
- The average Revenue is $XXX with a standard deviation of $XXX.
- The average Cost is $XXX with a standard deviation of $XXX.
- The average Profit is $XXX with a standard deviation of $XXX.
- The average ROA is XXX with a standard deviation of XXX.
- The average Profit Margin is XXX with a standard deviation of XXX.

### Distribution Analysis
- Revenue, Cost, and Profit distributions are right-skewed, indicating that a small number of high-value transactions contribute significantly to the overall financial performance.
- ROA distribution shows XXX pattern, suggesting XXX.
- Profit Margin distribution reveals XXX, indicating XXX.

### Sector/Region Trends
- The XXX segment generates the highest average profit, followed by XXX and XXX.
- XXX has the highest ROA, suggesting efficient use of assets in this segment.
- The top-performing countries in terms of profit are XXX, XXX, and XXX.
- XXX products are most profitable, with an average profit of $XXX.

### Correlation Analysis
- Revenue and Cost show a strong positive correlation (XXX), indicating that higher revenue is associated with higher costs.
- Revenue and Profit are also strongly correlated (XXX), suggesting that increasing revenue generally leads to higher profits.
- ROA and Profit Margin have a XXX correlation (XXX), indicating XXX.

### Time Series Analysis
- Financial performance shows XXX trend over time, with XXX periods showing stronger performance.
- Seasonal patterns indicate that XXX months typically have higher revenue and profit.
- The ROA fluctuates throughout the year, with peaks in XXX and troughs in XXX.

### Segment-Product Analysis
- XXX products are most popular in the XXX segment.
- The XXX segment has the highest profit margin for XXX products.
- The combination of XXX segment and XXX product yields the highest average profit.

### Profit Margin Analysis
- The overall average profit margin is XXX%.
- XXX segment has the highest profit margin at XXX%.
- XXX products have the highest profit margin at XXX%.
- XXX countries show the highest profit margins, suggesting more favorable market conditions or pricing strategies in these regions.

### Revenue-Cost-Profit Relationship
- The relationship between Revenue and Cost is XXX, with most data points falling XXX the break-even line.
- Higher Revenue generally leads to higher Profit, but the relationship is not perfectly linear, indicating variations in cost structures across different segments, products, or regions.

### Recommendations
- Focus on expanding the XXX segment in XXX countries, as they show the highest profit margins.
- Consider optimizing the cost structure for XXX products to improve profitability.
- Investigate the factors contributing to the high ROA in XXX segment to potentially apply similar strategies in other segments.
- Develop targeted marketing strategies for XXX products in XXX segment, as this combination yields the highest average profit.
- Address the seasonal fluctuations by implementing dynamic pricing strategies during low-performance months.
- Investigate outliers in the Revenue and Cost distributions to understand exceptional cases and potential opportunities or risks.
- Consider portfolio diversification by increasing focus on high-margin products across different segments.
- Develop region-specific strategies based on the varying profit margins across countries.

## 11. Additional Analysis: Revenue and Profit Quartiles

Let's segment our data into quartiles based on Revenue and Profit to better understand the distribution of financial performance.

In [None]:
# Create Revenue quartiles
df['Revenue_Quartile'] = pd.qcut(df['Revenue'], 4, labels=['Q1 (Low)', 'Q2', 'Q3', 'Q4 (High)'])

# Create Profit quartiles
df['Profit_Quartile'] = pd.qcut(df['Profit'], 4, labels=['Q1 (Low)', 'Q2', 'Q3', 'Q4 (High)'])

# Display the count of records in each quartile
print("Revenue Quartile Distribution:")
print(df['Revenue_Quartile'].value_counts())

print("\nProfit Quartile Distribution:")
print(df['Profit_Quartile'].value_counts())

In [None]:
# Analyze average metrics by Revenue quartile
revenue_quartile_analysis = df.groupby('Revenue_Quartile').agg({
    'Revenue': 'mean',
    'Cost': 'mean',
    'Profit': 'mean',
    'Profit_Margin': 'mean',
    'ROA': 'mean'
})

print("Analysis by Revenue Quartile:")
revenue_quartile_analysis

In [None]:
# Visualize metrics by Revenue quartile
plt.figure(figsize=(14, 10))

# Profit by Revenue quartile
plt.subplot(2, 2, 1)
sns.barplot(x='Revenue_Quartile', y='Profit', data=df)
plt.title('Average Profit by Revenue Quartile')
plt.ylabel('Profit ($)')
plt.grid(axis='y')

# Profit Margin by Revenue quartile
plt.subplot(2, 2, 2)
sns.barplot(x='Revenue_Quartile', y='Profit_Margin', data=df)
plt.title('Average Profit Margin by Revenue Quartile')
plt.ylabel('Profit Margin')
plt.grid(axis='y')

# ROA by Revenue quartile
plt.subplot(2, 2, 3)
sns.barplot(x='Revenue_Quartile', y='ROA', data=df)
plt.title('Average ROA by Revenue Quartile')
plt.ylabel('ROA')
plt.grid(axis='y')

# Cost by Revenue quartile
plt.subplot(2, 2, 4)
sns.barplot(x='Revenue_Quartile', y='Cost', data=df)
plt.title('Average Cost by Revenue Quartile')
plt.ylabel('Cost ($)')
plt.grid(axis='y')

plt.tight_layout()
plt.savefig('../results/plots/metrics_by_revenue_quartile.png')
plt.show()

In [None]:
# Analyze segment distribution across Revenue quartiles
segment_revenue_quartile = pd.crosstab(df['Segment'], df['Revenue_Quartile'], normalize='index') * 100

# Display the percentage distribution
print("Segment Distribution Across Revenue Quartiles (%):\n")
segment_revenue_quartile

In [None]:
# Visualize segment distribution across Revenue quartiles
plt.figure(figsize=(14, 8))
segment_revenue_quartile.plot(kind='bar', stacked=True)
plt.title('Segment Distribution Across Revenue Quartiles')
plt.xlabel('Segment')
plt.ylabel('Percentage (%)')
plt.legend(title='Revenue Quartile')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/segment_revenue_quartile_distribution.png')
plt.show()

In [None]:
# Create a cross-tabulation of Revenue and Profit quartiles
revenue_profit_quartile = pd.crosstab(df['Revenue_Quartile'], df['Profit_Quartile'])

# Display the cross-tabulation
print("Revenue-Profit Quartile Cross-tabulation:\n")
revenue_profit_quartile

In [None]:
# Visualize the Revenue-Profit quartile relationship with a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(revenue_profit_quartile, annot=True, cmap='viridis', fmt='d')
plt.title('Revenue-Profit Quartile Relationship')
plt.tight_layout()
plt.savefig('../results/plots/revenue_profit_quartile_heatmap.png')
plt.show()

## 12. Export Key Findings for Reporting

Let's export some key findings and visualizations for use in reports.

In [None]:
# Create a summary DataFrame for key metrics by segment
segment_summary = df.groupby('Segment').agg({
    'Revenue': 'mean',
    'Cost': 'mean',
    'Profit': 'mean',
    'Profit_Margin': 'mean',
    'ROA': 'mean',
    'Segment': 'count'
}).rename(columns={'Segment': 'Count'}).sort_values('Profit', ascending=False)

# Export to CSV
segment_summary.to_csv('../results/reports/segment_performance_summary.csv')

# Create a summary DataFrame for key metrics by country (top 20)
country_summary = df.groupby('Country').agg({
    'Revenue': 'mean',
    'Cost': 'mean',
    'Profit': 'mean',
    'Profit_Margin': 'mean',
    'ROA': 'mean',
    'Country': 'count'
}).rename(columns={'Country': 'Count'}).sort_values('Profit', ascending=False).head(20)

# Export to CSV
country_summary.to_csv('../results/reports/country_performance_summary.csv')

# Create a summary DataFrame for key metrics by product (top 20)
product_summary = df.groupby('Product').agg({
    'Revenue': 'mean',
    'Cost': 'mean',
    'Profit': 'mean',
    'Profit_Margin': 'mean',
    'ROA': 'mean',
    'Product': 'count'
}).rename(columns={'Product': 'Count'}).sort_values('Profit', ascending=False).head(20)

# Export to CSV
product_summary.to_csv('../results/reports/product_performance_summary.csv')

print("Summary reports exported to the 'results/reports' directory.")

In [None]:
# Create a comprehensive correlation matrix and export it
correlation_matrix.to_csv('../results/reports/correlation_matrix.csv')

# Export the list of highly correlated pairs
high_corr_df = pd.DataFrame(high_corr_pairs, columns=['Variable 1', 'Variable 2', 'Correlation'])
high_corr_df.to_csv('../results/reports/highly_correlated_pairs.csv', index=False)

print("Correlation analysis exported to the 'results/reports' directory.")

## 13. Conclusion

In this notebook, we performed a comprehensive exploratory data analysis of the financial dataset. We examined summary statistics, distributions, segment/region trends, correlations, and time series patterns. The analysis revealed several key insights about the financial performance across different segments, products, and countries.

Key findings include:
- The distribution of financial metrics (Revenue, Cost, Profit) is right-skewed, indicating a concentration of high-value transactions.
- There are strong correlations between Revenue, Cost, and Profit, with Revenue and Cost showing the strongest relationship.
- Segment performance varies significantly, with some segments showing higher profitability and efficiency (ROA) than others.
- Country and product analyses revealed specific high-performing markets and products that could be targeted for growth.
- The relationship between Revenue quartiles and Profit quartiles shows that high revenue doesn't always translate to high profit, highlighting the importance of cost management.

These insights can inform strategic decisions related to market focus, product development, pricing strategies, and resource allocation. The next steps would involve deeper statistical analysis, predictive modeling, and targeted recommendations based on these findings.