# Exploratory Data Analysis (EDA)

This notebook focuses on exploring the cleaned financial data to identify patterns, trends, and relationships between variables.

In [None]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Set plot styling.
# Try the 'seaborn-v0_8-*' style name first and fall back to the legacy name.
# plt.style.use() raises OSError for an unknown style, so only that error is
# caught; anything else (e.g. KeyboardInterrupt) propagates instead of being
# silently swallowed by a bare except.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    try:
        plt.style.use(style_name)  # first name: newer matplotlib, second: older
        break
    except OSError:
        continue
else:
    # Neither style name was recognised; matplotlib's default style stays active
    print("Default style used as seaborn-whitegrid not available")

sns.set_palette('viridis')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# Add the src directory to the path (skipped when already present, so
# re-running this cell does not keep appending duplicate entries)
src_dir = os.path.abspath('../src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import custom modules
from data_loader import load_raw_data
from eda_utils import generate_summary_statistics, plot_distribution, plot_correlation_matrix, plot_boxplot, plot_time_series

# Create the output directories if they don't exist.
# Figures are saved under results/plots and the CSV summaries at the end of
# the notebook are written under results/reports; neither savefig nor to_csv
# creates missing parent directories, so both must exist up front.
for output_dir in ('../results/plots', '../results/reports'):
    os.makedirs(output_dir, exist_ok=True)


## 1. Load the Cleaned Data

In [None]:
# Read the cleaned dataset from disk
cleaned_data_path = '../data/processed/cleaned_data.csv'
df = pd.read_csv(cleaned_data_path)

# Normalise the headers: trim surrounding whitespace first ...
df.columns = df.columns.str.strip()

# ... then replace any remaining inner spaces with underscores, keeping a
# record of which headers actually changed so the renames can be reported
column_mapping = {
    name: name.replace(' ', '_')
    for name in df.columns
    if ' ' in name
}
df = df.rename(columns=column_mapping)

# Report the renames (nothing is printed when no header contained a space)
if column_mapping:
    print("Columns renamed to replace spaces with underscores:")
    for original_name, renamed in column_mapping.items():
        print(f"  '{original_name}' → '{renamed}'")

# Clean financial columns (Sales, COGS, Profit): strip currency formatting
# ($, thousands separators, spaces) while preserving the sign of each amount.
financial_cols = ['Sales', 'COGS', 'Profit']
for col in financial_cols:
    # Columns that pandas already parsed as numbers need no cleaning; pushing
    # them through a string round-trip could only damage the values.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue

    raw_text = df[col].astype(str).str.strip()

    # Accounting notation writes negative amounts in parentheses, e.g. "$(1,234.50)"
    in_parentheses = raw_text.str.contains(r'\(.*\)', regex=True)

    # Keep only digits, the decimal point and a minus sign. Stripping the
    # minus sign as well would silently turn every loss into a gain.
    digits = raw_text.str.replace(r'[^0-9.\-]', '', regex=True)

    # Convert to numeric; empty strings and malformed entries become NaN
    amounts = pd.to_numeric(digits, errors='coerce')

    # Parenthesised amounts are negative regardless of any other sign marker
    df[col] = amounts.where(~in_parentheses, -amounts.abs())

# NOTE(review): a lone "-" (sometimes used for zero in accounting exports)
# still parses to NaN and the row is dropped below — confirm that is intended.

# Drop rows with NaN in financial columns (or fill as needed)
df = df.dropna(subset=financial_cols)

# Verify data types
print("\nData Types After Cleaning:")
print(df[financial_cols].dtypes)

# Print cleaned columns for verification
print("\nCleaned Financial Columns (First 5 Rows):")
print(df[financial_cols].head())

# Display basic information
print(f"\nDataset Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nFull Data Types:")
print(df.dtypes)
df.head()

## 2. Summary Statistics

Let's compute summary statistics for the key financial metrics: Sales (revenue), COGS (cost), and Profit.

In [None]:
# Cell 3: Summary Statistics
# describe() over the three monetary columns yields count, mean, std, min,
# quartiles and max in a single table
financial_cols = ['Sales', 'COGS', 'Profit']
financial_summary = df.loc[:, financial_cols].describe()

# Print a heading, then leave the table as the cell's last expression so the
# notebook renders it
print("Summary Statistics for Financial Metrics:")
financial_summary


In [None]:
# Cell 4: Additional Statistics
# Extra descriptive measures for each financial metric
for col in financial_cols:
    values = df[col]
    lowest, highest = values.min(), values.max()
    lower_quartile, upper_quartile = values.quantile(0.25), values.quantile(0.75)

    print(f"\n{col} Statistics:")
    print(f"Median: ${values.median():,.2f}")
    print(f"Mean: ${values.mean():,.2f}")
    print(f"Standard Deviation: ${values.std():,.2f}")
    print(f"Minimum: ${lowest:,.2f}")
    print(f"Maximum: ${highest:,.2f}")
    print(f"Range: ${highest - lowest:,.2f}")
    print(f"Interquartile Range (IQR): ${upper_quartile - lower_quartile:,.2f}")
    # Shape measures are unit-less, hence no currency prefix
    print(f"Skewness: {values.skew():,.2f}")
    print(f"Kurtosis: {values.kurtosis():,.2f}")


### Calculate Return on Assets (ROA)


In [None]:
# Cell 5: Calculate Return on Assets (ROA)
# ROA is computed here as Profit / COGS, i.e. COGS is used as a proxy for assets
profit_to_cogs = df['Profit'].div(df['COGS'])

# Division by a zero COGS produces +/-inf (or NaN for 0/0); every such value
# is stored as 0 so the column contains only finite numbers
profit_to_cogs = profit_to_cogs.replace([np.inf, -np.inf], np.nan)
df['ROA'] = profit_to_cogs.fillna(0)

# Heading followed by the describe() table as the cell output
print("ROA (Return on Assets) Summary Statistics:")
df['ROA'].describe()


## 3. Distribution Analysis

Let's visualize the distributions of our key financial metrics: Sales, COGS, Profit, and ROA.

In [None]:
# Cell 6: Distribution Analysis
# One figure per metric: histogram with KDE on the left, boxplot on the right
for col in ['Sales', 'COGS', 'Profit', 'ROA']:
    series = df[col]
    mean_value = series.mean()
    median_value = series.median()

    plt.figure(figsize=(12, 6))

    # Left panel: histogram + KDE with the mean and median marked as dashed lines
    plt.subplot(1, 2, 1)
    sns.histplot(series, kde=True)
    plt.title(f'Distribution of {col}')
    plt.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    plt.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.2f}')
    plt.legend()

    # Right panel: boxplot of the same values
    plt.subplot(1, 2, 2)
    sns.boxplot(y=series)
    plt.title(f'Boxplot of {col}')

    # Save before show()
    plt.tight_layout()
    plt.savefig(f'../results/plots/{col}_distribution.png')
    plt.show()


In [None]:
# Cell 7: Check for outliers
# Check for outliers using IQR method
def identify_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of ``df[column]``.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``: the sub-frame of rows
        strictly below the lower fence or strictly above the upper fence,
        followed by the two fence values.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Strict comparisons: values sitting exactly on a fence are not flagged
    beyond_fences = values.lt(lower_bound) | values.gt(upper_bound)
    return df[beyond_fences], lower_bound, upper_bound

# Report the IQR-based outliers for every metric
for col in ['Sales', 'COGS', 'Profit', 'ROA']:
    flagged, low_fence, high_fence = identify_outliers(df, col)
    flagged_count = len(flagged)
    flagged_share = flagged_count / len(df) * 100

    print(f"\nOutliers in {col}:")
    print(f"Number of outliers: {flagged_count}")
    print(f"Percentage of outliers: {flagged_share:.2f}%")
    print(f"Lower bound: {low_fence:.2f}")
    print(f"Upper bound: {high_fence:.2f}")

    # Show at most five flagged rows as an example
    if flagged_count:
        print("\nSample of outliers:")
        print(flagged.head(5))


## 4. Sector/Region Trends

Let's analyze trends across different segments and countries.

In [None]:
# Cell 8: Segment Analysis
# Per-segment averages of the financial metrics plus the number of rows in
# each segment, ordered from most to least profitable
segment_analysis = (
    df.groupby('Segment')
    .agg(
        Sales=('Sales', 'mean'),
        COGS=('COGS', 'mean'),
        Profit=('Profit', 'mean'),
        ROA=('ROA', 'mean'),
        Count=('Segment', 'count'),
    )
    .sort_values('Profit', ascending=False)
)

print("Segment Analysis:")
segment_analysis


In [None]:
# Cell 9: Visualize segment performance
# Side-by-side bar charts of the per-segment averages: (column, title, y-label)
segment_panels = [
    ('Profit', 'Average Profit by Segment', 'Average Profit ($)'),
    ('ROA', 'Average ROA by Segment', 'Average ROA'),
]

plt.figure(figsize=(14, 8))
for panel_number, (metric, panel_title, y_label) in enumerate(segment_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.barplot(x=segment_analysis.index, y=metric, data=segment_analysis)
    plt.title(panel_title)
    plt.xticks(rotation=45)
    plt.ylabel(y_label)

plt.tight_layout()
plt.savefig('../results/plots/segment_performance.png')
plt.show()


In [None]:
# Cell 10: Country Analysis
# Per-country averages of the financial metrics plus the number of rows per
# country, ordered from most to least profitable
country_analysis = (
    df.groupby('Country')
    .agg(
        Sales=('Sales', 'mean'),
        COGS=('COGS', 'mean'),
        Profit=('Profit', 'mean'),
        ROA=('ROA', 'mean'),
        Count=('Country', 'count'),
    )
    .sort_values('Profit', ascending=False)
)

print("Country Analysis:")
country_analysis.head(10)  # the ten countries with the highest average profit


In [None]:
# Cell 11: Visualize country performance
# country_analysis is already sorted by average profit, so head(10) is the top ten
top_countries = country_analysis.head(10)

# (column, title, y-label) for the left and right panels
country_panels = [
    ('Profit', 'Average Profit by Country (Top 10)', 'Average Profit ($)'),
    ('ROA', 'Average ROA by Country (Top 10)', 'Average ROA'),
]

plt.figure(figsize=(14, 8))
for panel_number, (metric, panel_title, y_label) in enumerate(country_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.barplot(x=top_countries.index, y=metric, data=top_countries)
    plt.title(panel_title)
    plt.xticks(rotation=45)
    plt.ylabel(y_label)

plt.tight_layout()
plt.savefig('../results/plots/country_performance.png')
plt.show()


In [None]:
# Cell 12: Cross-tabulation of Segment and Country
# Count how many records fall into each (segment, country) combination:
# segments become the rows, countries the columns
segment_country_crosstab = pd.crosstab(index=df['Segment'], columns=df['Country'])

# Heading, then the first rows of the table as the cell output
print("Segment-Country Cross-tabulation:")
segment_country_crosstab.head()


In [None]:
# Cell 13: Visualize segment-country relationship
# Heatmap of record counts per (segment, country) for the ten countries with
# the most records. Slicing the first ten columns of the crosstab would pick
# countries by column order rather than by size, contradicting the title.
country_totals = segment_country_crosstab.sum(axis=0)
top_country_names = country_totals.nlargest(10).index

plt.figure(figsize=(16, 10))
sns.heatmap(segment_country_crosstab[top_country_names], annot=True, cmap='viridis', fmt='d')
plt.title('Segment-Country Distribution (Top 10 Countries)')
plt.tight_layout()
plt.savefig('../results/plots/segment_country_heatmap.png')
plt.show()


In [None]:
# Cell 14: Product Analysis
# Per-product averages of the financial metrics plus the number of rows per
# product, ordered from most to least profitable
product_analysis = (
    df.groupby('Product')
    .agg(
        Sales=('Sales', 'mean'),
        COGS=('COGS', 'mean'),
        Profit=('Profit', 'mean'),
        ROA=('ROA', 'mean'),
        Count=('Product', 'count'),
    )
    .sort_values('Profit', ascending=False)
)

print("Product Analysis:")
product_analysis.head(10)  # the ten products with the highest average profit


In [None]:
# Cell 15: Visualize product performance
# product_analysis is already sorted by average profit, so head(10) is the top ten
top_products = product_analysis.head(10)

# (column, title, y-label) for the left and right panels
product_panels = [
    ('Profit', 'Average Profit by Product (Top 10)', 'Average Profit ($)'),
    ('ROA', 'Average ROA by Product (Top 10)', 'Average ROA'),
]

plt.figure(figsize=(14, 8))
for panel_number, (metric, panel_title, y_label) in enumerate(product_panels, start=1):
    plt.subplot(1, 2, panel_number)
    sns.barplot(x=top_products.index, y=metric, data=top_products)
    plt.title(panel_title)
    plt.xticks(rotation=45)
    plt.ylabel(y_label)

plt.tight_layout()
plt.savefig('../results/plots/product_performance.png')
plt.show()


## 5. Correlation Analysis

Let's analyze the correlations between our numeric variables.

In [None]:
# Cell 16: Correlation Analysis
# Restrict the frame to its numeric columns; corr() is only defined for those
numeric_frame = df.select_dtypes(include=['number'])
numeric_cols = numeric_frame.columns.tolist()
print(f"Numeric columns: {numeric_cols}")

# Pairwise correlation between every pair of numeric columns
correlation_matrix = numeric_frame.corr()

# Heading, then the matrix itself as the cell output
print("\nCorrelation Matrix:")
correlation_matrix


In [None]:
# Cell 17: Visualize correlation matrix
# Annotated heatmap of the correlation matrix, two decimals per cell
heatmap_style = dict(annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_style)
plt.title('Correlation Matrix of Numeric Variables')
plt.tight_layout()
plt.savefig('../results/plots/correlation_matrix.png')
plt.show()


In [None]:
# Cell 18: Identify highly correlated pairs
# Identify highly correlated pairs
def get_highly_correlated_pairs(corr_matrix, threshold=0.5):
    """List the variable pairs whose absolute correlation exceeds ``threshold``.

    Only the strict upper triangle of ``corr_matrix`` is inspected, so each
    pair is reported once and the diagonal is ignored.

    Returns:
        A list of ``(row_label, column_label, correlation)`` tuples ordered by
        decreasing absolute correlation.
    """
    # Mask out the diagonal and the lower triangle (they become NaN)
    keep = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(keep)

    # NaN cells never pass the comparison, so masked entries drop out here
    strong_pairs = [
        (row_label, col_label, upper.loc[row_label, col_label])
        for col_label in upper.columns
        for row_label in upper.index
        if abs(upper.loc[row_label, col_label]) > threshold
    ]

    # Strongest relationships first, regardless of sign
    return sorted(strong_pairs, key=lambda pair: abs(pair[2]), reverse=True)

# Collect every pair of numeric variables with |correlation| above 0.5
high_corr_pairs = get_highly_correlated_pairs(correlation_matrix, threshold=0.5)

# Print one line per pair, strongest relationship first
print("Highly Correlated Pairs (|correlation| > 0.5):")
for first_variable, second_variable, coefficient in high_corr_pairs:
    print(f"{first_variable} and {second_variable}: {coefficient:.2f}")


In [None]:
# Cell 19: Visualize highly correlated pairs
# For the five strongest pairs, draw a plain scatter plot next to a scatter
# plot with a fitted regression line
strongest_pairs = high_corr_pairs[:5]
for var1, var2, corr in strongest_pairs:
    x_values = df[var1]
    y_values = df[var2]

    plt.figure(figsize=(12, 6))

    # Left panel: raw scatter
    plt.subplot(1, 2, 1)
    sns.scatterplot(x=x_values, y=y_values)
    plt.title(f'Scatter Plot: {var1} vs {var2} (corr={corr:.2f})')
    plt.xlabel(var1)
    plt.ylabel(var2)

    # Right panel: scatter with regression line
    plt.subplot(1, 2, 2)
    sns.regplot(x=x_values, y=y_values)
    plt.title(f'Regression Plot: {var1} vs {var2} (corr={corr:.2f})')
    plt.xlabel(var1)
    plt.ylabel(var2)

    plt.tight_layout()
    plt.savefig(f'../results/plots/correlation_{var1}_{var2}.png')
    plt.show()


## 7. Segment-Product Analysis

Let's analyze the relationship between segments and products.

In [None]:
# Count how many records fall into each (segment, product) combination:
# segments become the rows, products the columns
segment_product_crosstab = pd.crosstab(index=df['Segment'], columns=df['Product'])

# Heading, then the first rows of the table as the cell output
print("Segment-Product Cross-tabulation:")
segment_product_crosstab.head()

In [None]:
# Heatmap of record counts per (segment, product), limited to the ten most
# frequent products so the plot stays readable when there are many products
product_counts = df['Product'].value_counts()
top_products = product_counts.head(10).index.tolist()
segment_top_products = segment_product_crosstab.loc[:, top_products]

plt.figure(figsize=(16, 10))
sns.heatmap(segment_top_products, annot=True, cmap='viridis', fmt='d')
plt.title('Segment-Product Distribution (Top 10 Products)')
plt.tight_layout()
plt.savefig('../results/plots/segment_product_heatmap.png')
plt.show()

In [None]:
# Mean profit for every (segment, product) combination: segments as rows,
# products as columns
segment_product_profit = df.pivot_table(
    index='Segment',
    columns='Product',
    values='Profit',
    aggfunc='mean',
)

# Keep only the columns listed in top_products (set by the preceding heatmap cell)
segment_product_profit_top = segment_product_profit.loc[:, top_products]

# Diverging red-yellow-green colour map
plt.figure(figsize=(16, 10))
sns.heatmap(segment_product_profit_top, annot=True, cmap='RdYlGn', fmt='.2f')
plt.title('Average Profit by Segment and Product')
plt.tight_layout()
plt.savefig('../results/plots/segment_product_profit.png')
plt.show()

## 8. Profit Margin Analysis

Let's calculate and analyze profit margins across different dimensions.

In [None]:
# Profit margin as a fraction of sales (Profit / Sales)
raw_margin = df['Profit'].div(df['Sales'])

# Zero sales produce +/-inf (or NaN for 0/0); every such value is stored as 0
df['Profit_Margin'] = raw_margin.replace([np.inf, -np.inf], np.nan).fillna(0)

# Heading followed by the describe() table
print("Profit Margin Summary Statistics:")
display(df['Profit_Margin'].describe())

In [None]:
# Profit margin distribution: histogram with KDE on the left, boxplot on the right
margins = df['Profit_Margin']
average_margin = margins.mean()

plt.figure(figsize=(14, 6))

# Left panel: 20-bin histogram with the mean marked as a dashed red line
plt.subplot(1, 2, 1)
sns.histplot(margins, kde=True, bins=20)
plt.title('Profit Margin Distribution')
plt.axvline(average_margin, color='red', linestyle='--', label=f'Mean: {average_margin:.2%}')
plt.xlabel('Profit Margin')
plt.legend()

# Right panel: boxplot of the same values
plt.subplot(1, 2, 2)
sns.boxplot(y=margins)
plt.title('Profit Margin Boxplot')
plt.ylabel('Profit Margin')

plt.tight_layout()
plt.savefig('../results/plots/profit_margin_distribution.png')
plt.show()

In [None]:
# Average profit margin per segment, highest first
segment_margin = df.groupby('Segment')['Profit_Margin'].mean().sort_values(ascending=False)

plt.figure(figsize=(12, 6))
# Profit_Margin holds fractions (Profit / Sales); scale by 100 so the bar
# heights agree with the "(%)" unit on the axis label. segment_margin itself
# is left as fractions.
sns.barplot(x=segment_margin.index, y=segment_margin.values * 100)
plt.title('Average Profit Margin by Segment')
plt.xticks(rotation=45)
plt.ylabel('Profit Margin (%)')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/profit_margin_by_segment.png')
plt.show()

In [None]:
# Average profit margin per product, keeping the ten highest
product_margin = df.groupby('Product')['Profit_Margin'].mean().sort_values(ascending=False).head(10)

plt.figure(figsize=(12, 6))
# Profit_Margin holds fractions (Profit / Sales); scale by 100 so the bar
# heights agree with the "(%)" unit on the axis label. product_margin itself
# is left as fractions.
sns.barplot(x=product_margin.index, y=product_margin.values * 100)
plt.title('Average Profit Margin by Product (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Profit Margin (%)')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/profit_margin_by_product.png')
plt.show()

In [None]:
# Average profit margin per country, keeping the ten highest
country_margin = df.groupby('Country')['Profit_Margin'].mean().sort_values(ascending=False).head(10)

plt.figure(figsize=(12, 6))
# Profit_Margin holds fractions (Profit / Sales); scale by 100 so the bar
# heights agree with the "(%)" unit on the axis label. country_margin itself
# is left as fractions.
sns.barplot(x=country_margin.index, y=country_margin.values * 100)
plt.title('Average Profit Margin by Country (Top 10)')
plt.xticks(rotation=45)
plt.ylabel('Profit Margin (%)')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/profit_margin_by_country.png')
plt.show()

## 9. Revenue-Cost-Profit Relationship

Let's analyze the relationship between Sales, COGS, and Profit.

In [None]:
# Sales against COGS, one point per record, coloured by that record's profit
sales = df['Sales']
cogs = df['COGS']

plt.figure(figsize=(12, 8))
scatter = plt.scatter(sales, cogs, c=df['Profit'], cmap='RdYlGn', alpha=0.7)
plt.colorbar(scatter, label='Profit ($)')
plt.title('Sales vs. COGS (Colored by Profit)')
plt.xlabel('Sales ($)')
plt.ylabel('COGS ($)')
plt.grid(True)

# Dashed diagonal where Sales equals COGS, drawn from the origin to the
# largest value on either axis
max_val = max(sales.max(), cogs.max())
diagonal = [0, max_val]
plt.plot(diagonal, diagonal, 'k--', label='Break-Even Line (Profit=0)')
plt.legend()
plt.tight_layout()
plt.savefig('../results/plots/revenue_cost_profit_relationship.png')
plt.show()

In [None]:
# NOTE(review): Axes3D is not referenced by name below; the import is kept
# from the original — presumably to register the '3d' projection on older
# matplotlib versions. Confirm before removing.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

# Plot at most 1000 points; the fixed random_state keeps the sample (and
# therefore the figure) reproducible between runs
sample_size = min(1000, len(df))
sample_df = df.sample(n=sample_size, random_state=42)

# 3D scatter of Sales / COGS / Profit, coloured by profit margin.
# Profit_Margin holds fractions (Profit / Sales); scale by 100 so the colour
# scale agrees with the "(%)" unit on the colour bar label.
scatter = ax.scatter(
    sample_df['Sales'],
    sample_df['COGS'],
    sample_df['Profit'],
    c=sample_df['Profit_Margin'] * 100,
    cmap='RdYlGn',
    alpha=0.7
)

ax.set_xlabel('Sales ($)')
ax.set_ylabel('COGS ($)')
ax.set_zlabel('Profit ($)')
ax.set_title('3D Relationship: Sales, COGS, and Profit')
ax.view_init(elev=20, azim=45)  # Better angle

# Add color bar
cbar = fig.colorbar(scatter, ax=ax, pad=0.1)
cbar.set_label('Profit Margin (%)')

plt.tight_layout()
plt.savefig('../results/plots/revenue_cost_profit_3d.png')
plt.show()

## 11. Additional Analysis: Revenue and Profit Quartiles

Let's segment our data into quartiles based on Revenue (Sales) and Profit to better understand the distribution of financial performance.

In [None]:
# Both quartile columns share the same four labels, lowest to highest
quartile_labels = ['Q1 (Low)', 'Q2', 'Q3', 'Q4 (High)']

# Bucket every record into four equal-frequency bins by Sales and by Profit
df['Revenue_Quartile'] = pd.qcut(df['Sales'], q=4, labels=quartile_labels)
df['Profit_Quartile'] = pd.qcut(df['Profit'], q=4, labels=quartile_labels)

# Number of records that landed in each bucket
print("Revenue Quartile Distribution:")
print(df['Revenue_Quartile'].value_counts())

print("\nProfit Quartile Distribution:")
print(df['Profit_Quartile'].value_counts())

In [None]:
# Output column name -> (source column, aggregation) for the quartile table
quartile_metrics = {
    'Sales_Mean': ('Sales', 'mean'),
    'COGS_Mean': ('COGS', 'mean'),
    'Profit_Mean': ('Profit', 'mean'),
    'Profit_Margin_Mean': ('Profit_Margin', 'mean'),
    'ROA_Mean': ('ROA', 'mean'),
}

# Mean of each metric within every revenue quartile, rounded to two decimals
revenue_quartile_analysis = df.groupby('Revenue_Quartile').agg(**quartile_metrics).round(2)
print("Revenue Quartile Analysis:")
display(revenue_quartile_analysis)

In [None]:
# 2x2 grid of bar charts, one metric per panel, all grouped by revenue
# quartile: (column, title, y-label)
quartile_panels = [
    ('Profit', 'Average Profit by Revenue Quartile', 'Profit ($)'),
    ('Profit_Margin', 'Average Profit Margin by Revenue Quartile', 'Profit Margin'),
    ('ROA', 'Average ROA by Revenue Quartile', 'ROA'),
    ('COGS', 'Average Cost by Revenue Quartile', 'Cost ($)'),
]

plt.figure(figsize=(14, 10))
for panel_number, (metric, panel_title, y_label) in enumerate(quartile_panels, start=1):
    plt.subplot(2, 2, panel_number)
    sns.barplot(x='Revenue_Quartile', y=metric, data=df)
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.grid(axis='y')

plt.tight_layout()
plt.savefig('../results/plots/metrics_by_revenue_quartile.png')
plt.show()

In [None]:
# Share of each segment's records that falls into each revenue quartile.
# normalize='index' makes every row sum to 1; the shares are then converted
# to percentages with one decimal.
segment_quartile_share = pd.crosstab(
    index=df['Segment'],
    columns=df['Revenue_Quartile'],
    normalize='index',
)
segment_revenue_quartile = (segment_quartile_share * 100).round(1)

print("Segment Distribution by Revenue Quartile:")
display(segment_revenue_quartile)

In [None]:
# Stacked bar chart: one bar per segment, split by revenue quartile share.
# DataFrame.plot() opens its own figure unless it is handed an Axes, so a
# preceding plt.figure(figsize=...) would only leave a blank figure behind and
# its size would be ignored. Create the Axes explicitly and pass it in.
fig, ax = plt.subplots(figsize=(14, 8))
segment_revenue_quartile.plot(
    kind='bar',
    stacked=True,
    colormap='viridis',
    ax=ax
)
ax.set_title('Segment Distribution by Revenue Quartile')
ax.set_xlabel('Segment')
ax.set_ylabel('Percentage (%)')
ax.legend(title='Revenue Quartile')
ax.grid(axis='y')
fig.tight_layout()
fig.savefig('../results/plots/segment_revenue_quartile.png')
plt.show()

In [None]:
# Count of records for every (revenue quartile, profit quartile) combination:
# revenue quartiles as rows, profit quartiles as columns
revenue_profit_quartile = pd.crosstab(
    index=df['Revenue_Quartile'],
    columns=df['Profit_Quartile'],
)

# Heading, then the table itself as the cell output
print("Revenue-Profit Quartile Cross-tabulation:\n")
revenue_profit_quartile

In [None]:
# Annotated heatmap of the revenue-quartile x profit-quartile record counts
heatmap_style = dict(annot=True, cmap='viridis', fmt='d')

plt.figure(figsize=(10, 8))
sns.heatmap(revenue_profit_quartile, **heatmap_style)
plt.title('Revenue-Profit Quartile Relationship')
plt.tight_layout()
plt.savefig('../results/plots/revenue_profit_quartile_heatmap.png')
plt.show()

## 12. Export Key Findings for Reporting

Let's export some key findings and visualizations for use in reports.

In [None]:
# Metrics shared by the country and product summaries:
# output column name -> (source column, aggregation)
shared_metrics = {
    'Sales_Mean': ('Sales', 'mean'),
    'Profit_Mean': ('Profit', 'mean'),
    'Profit_Margin_Mean': ('Profit_Margin', 'mean'),
    'ROA_Mean': ('ROA', 'mean'),
}

# Segment summary: all segments, including the COGS average, ordered by mean profit
segment_summary = (
    df.groupby('Segment')
    .agg(
        Sales_Mean=('Sales', 'mean'),
        COGS_Mean=('COGS', 'mean'),
        Profit_Mean=('Profit', 'mean'),
        Profit_Margin_Mean=('Profit_Margin', 'mean'),
        ROA_Mean=('ROA', 'mean'),
        Count=('Segment', 'count'),
    )
    .sort_values('Profit_Mean', ascending=False)
    .round(2)
)
segment_summary.to_csv('../results/reports/segment_performance_summary.csv')

# Country summary: the 20 countries with the highest mean profit
country_summary = (
    df.groupby('Country')
    .agg(**shared_metrics, Count=('Country', 'count'))
    .sort_values('Profit_Mean', ascending=False)
    .head(20)
    .round(2)
)
country_summary.to_csv('../results/reports/country_performance_summary.csv')

# Product summary: the 20 products with the highest mean profit
product_summary = (
    df.groupby('Product')
    .agg(**shared_metrics, Count=('Product', 'count'))
    .sort_values('Profit_Mean', ascending=False)
    .head(20)
    .round(2)
)
product_summary.to_csv('../results/reports/product_performance_summary.csv')

print("Summary reports exported to the 'results/reports' directory.")

In [None]:
# Write the full correlation matrix (row and column labels included)
correlation_matrix.to_csv('../results/reports/correlation_matrix.csv')

# Turn the (variable, variable, correlation) tuples into a table and write it
# without the positional index
pair_columns = ['Variable 1', 'Variable 2', 'Correlation']
high_corr_df = pd.DataFrame(high_corr_pairs, columns=pair_columns)
high_corr_df.to_csv('../results/reports/highly_correlated_pairs.csv', index=False)

print("Correlation analysis exported to the 'results/reports' directory.")