# Regression: Predict Profit

This notebook builds regression models to predict profit based on various financial and categorical features.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import pickle

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Import custom modules
import sys
import os
sys.path.append(os.path.abspath('../src'))
from ml_utils import plot_feature_importance, evaluate_regression_model, plot_residuals

# Plot styling: apply the seaborn theme, then the matching matplotlib style sheet
sns.set(style='whitegrid')
plt.style.use('seaborn-v0_8-whitegrid')

# Silence all warnings to keep cell output compact
warnings.filterwarnings('ignore')

# Make sure the output folders for models and ML plots exist
for results_dir in ('../results/models', '../results/plots/ml'):
    Path(results_dir).mkdir(parents=True, exist_ok=True)

## 1. Load and Prepare Data

In [None]:
# Load the data, preferring the file that carries cluster labels
try:
    df = pd.read_csv('../data/processed/clustered_data.csv')
    print("Loaded data with cluster labels.")
except FileNotFoundError:
    # No clustered file: fall back to the cleaned data
    df = pd.read_csv('../data/processed/cleaned_data.csv')
    print("Loaded cleaned data without cluster labels.")

# Normalise column names: trim surrounding whitespace, then replace inner
# spaces with underscores
df.columns = df.columns.str.strip()
column_mapping = {col: col.replace(' ', '_') for col in df.columns if ' ' in col}
if column_mapping:
    df = df.rename(columns=column_mapping)


def _parse_currency(series):
    """Convert a column of currency-formatted values to numbers.

    Columns that are already numeric are returned unchanged, so values whose
    string form uses exponent notation (e.g. ``1e+16``) are not corrupted by
    the character stripping below.

    For text columns, every character other than digits, ``.`` and ``-`` is
    removed (``$``, thousands separators, spaces, ...). Values wrapped in
    parentheses, the accounting notation for negatives such as ``$(1,234.50)``,
    are returned as negative numbers. Anything that still cannot be parsed
    becomes NaN.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series

    text = series.astype(str).str.strip()
    # Remember which entries used accounting-style parentheses before the
    # parentheses are stripped away with the other symbols.
    is_parenthesised = text.str.contains(r'\(.*\)', regex=True)
    digits = text.str.replace(r'[^0-9.-]', '', regex=True).replace('', np.nan)
    values = pd.to_numeric(digits, errors='coerce')
    return values.where(~is_parenthesised, -values.abs())


# Clean financial columns (Sales, COGS, Profit)
financial_cols = ['Sales', 'COGS', 'Profit']
for col in financial_cols:
    df[col] = _parse_currency(df[col])

# Drop rows where any financial value is missing or could not be parsed
df = df.dropna(subset=financial_cols)

# Recalculate derived metrics. In this notebook "ROA" is Profit / COGS and
# "Profit_Margin" is Profit / Sales.
df['ROA'] = df['Profit'] / df['COGS']
df['Profit_Margin'] = df['Profit'] / df['Sales']

# Division by zero yields +/-inf (or NaN for 0/0); map all of those to 0
df['ROA'] = df['ROA'].replace([np.inf, -np.inf], np.nan).fillna(0)
df['Profit_Margin'] = df['Profit_Margin'].replace([np.inf, -np.inf], np.nan).fillna(0)

print(f"Dataset shape: {df.shape}")

## 2. Feature Selection for Regression

In [None]:
# Select features for regression

# Numeric features
# NOTE(review): ROA and Profit_Margin are computed from Profit (the target)
# in the data-preparation cell, so using them as predictors leaks the target
# into the features. Confirm this is intended before relying on the scores
# reported further down.
numeric_features_reg = ['Sales', 'COGS', 'ROA', 'Profit_Margin']

# Categorical features; the cluster label is included only when the loaded
# data provides it
categorical_features_reg = ['Segment', 'Country']
if 'Cluster' in df.columns:
    categorical_features_reg.append('Cluster')

# Target variable. `target_reg` is the name used throughout the notebook;
# `target` is kept as an alias so existing references keep working.
target_reg = 'Profit'
target = target_reg

# Create feature matrix and target vector
X_reg = df[numeric_features_reg + categorical_features_reg]
y_reg = df[target_reg]

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

print(f"Training set shape: {X_train_reg.shape}")
print(f"Testing set shape: {X_test_reg.shape}")

## 3. Explore Feature Relationships with Target

In [None]:
# Scatter each numeric feature against the target, one panel per feature.
# The grid is two panels wide; the number of rows is derived from the feature
# list so the cell keeps working if features are added or removed.
n_cols = 2
n_rows = (len(numeric_features_reg) + n_cols - 1) // n_cols  # ceiling division

plt.figure(figsize=(14, 10))

for position, feature in enumerate(numeric_features_reg, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.scatter(df[feature], df[target_reg], alpha=0.5)
    plt.title(f'{feature} vs {target_reg}')
    plt.xlabel(feature)
    plt.ylabel(target_reg)
    plt.grid(True)

plt.tight_layout()
plt.savefig('../results/plots/ml/numeric_features_vs_target_reg.png')
plt.show()

### Correlation with Target

Let's check the correlation between numeric features and the target variable.

In [None]:
# Correlation of every numeric feature with the target, strongest positive first
corr_columns = numeric_features_reg + [target_reg]
corr_matrix = df[corr_columns].corr()
corr_with_target = corr_matrix[target_reg].sort_values(ascending=False)

# Bar chart of the feature correlations (the target's self-correlation is left out)
plt.figure(figsize=(10, 6))
feature_corr = corr_with_target.drop(target_reg)
feature_corr.plot(kind='bar')
plt.title(f'Correlation with {target_reg}')
plt.xlabel('Feature')
plt.ylabel('Correlation Coefficient')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/ml/correlation_with_target_reg.png')
plt.show()

# Text summary, including the target's own entry
print("Correlation with target:")
print(corr_with_target)

In [None]:
# Box plot of the target's distribution within each Segment
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Segment', y=target_reg)
plt.title(f'Distribution of {target_reg} by Segment')

# Tilt the segment labels so longer names do not overlap
plt.xticks(rotation=45)
plt.grid(axis='y')

plt.tight_layout()
plt.savefig('../results/plots/ml/profit_by_segment.png')
plt.show()

## 4. Build Regression Pipeline

In [None]:
# Preprocessing: standardise numeric columns, one-hot encode categorical ones.
# Categories not seen during fitting are ignored at prediction time.
numeric_step = ('num', StandardScaler(), numeric_features_reg)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_reg)
preprocessor_reg = ColumnTransformer(transformers=[numeric_step, categorical_step])


def _regression_pipeline(regressor):
    """Return a pipeline that applies `preprocessor_reg` and then `regressor`."""
    return Pipeline([
        ('preprocessor', preprocessor_reg),
        ('regressor', regressor),
    ])


# One pipeline per candidate model, all seeded identically
ridge_pipeline = _regression_pipeline(Ridge(random_state=42))
lasso_pipeline = _regression_pipeline(Lasso(random_state=42))
rf_pipeline = _regression_pipeline(RandomForestRegressor(random_state=42))

In [None]:
# Hyperparameter grids. Keys use the `<step>__<parameter>` form so they address
# the 'regressor' step of each pipeline.

# Regularisation strengths tried for Ridge
ridge_param_grid = dict(
    regressor__alpha=[0.01, 0.1, 1.0, 10.0, 100.0],
)

# Regularisation strengths tried for Lasso
lasso_param_grid = dict(
    regressor__alpha=[0.001, 0.01, 0.1, 1.0, 10.0],
)

# Forest size, tree depth (None = unlimited) and split threshold for Random Forest
rf_param_grid = dict(
    regressor__n_estimators=[100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5],
)

In [None]:
def _tune_regressor(label, pipeline, param_grid):
    """Grid-search `pipeline` on the training split and report the best setting.

    Parameters
    ----------
    label : str
        Model name used in the printed messages (e.g. "Ridge").
    pipeline : sklearn Pipeline
        Pipeline whose hyperparameters are searched.
    param_grid : dict
        Grid passed to GridSearchCV.

    Returns
    -------
    GridSearchCV
        The fitted search object (5-fold CV, scored by negative MSE).
    """
    print(f"Tuning {label} Regression...")
    search = GridSearchCV(
        pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1
    )
    search.fit(X_train_reg, y_train_reg)
    print(f"Best {label} parameters: {search.best_params_}")
    # best_score_ is the mean negative MSE over the folds; flip the sign and
    # take the square root to express it on the scale of the target.
    print(f"Best {label} RMSE: {np.sqrt(-search.best_score_):.2f}")
    return search


# Perform grid search for Ridge regression
ridge_grid_search = _tune_regressor('Ridge', ridge_pipeline, ridge_param_grid)
best_ridge = ridge_grid_search.best_estimator_

# Perform grid search for Lasso regression
print()  # blank line between the per-model reports
lasso_grid_search = _tune_regressor('Lasso', lasso_pipeline, lasso_param_grid)
best_lasso = lasso_grid_search.best_estimator_

# Perform grid search for Random Forest regression
print()
rf_grid_search = _tune_regressor('Random Forest', rf_pipeline, rf_param_grid)
best_rf = rf_grid_search.best_estimator_

## 5. Compare Models

In [None]:
# Test-set predictions from each tuned model
ridge_pred = best_ridge.predict(X_test_reg)
lasso_pred = best_lasso.predict(X_test_reg)
rf_pred = best_rf.predict(X_test_reg)

models = ['Ridge', 'Lasso', 'Random Forest']
predictions = [ridge_pred, lasso_pred, rf_pred]


def _score_predictions(model_name, y_pred):
    """Return one metrics row (MSE, RMSE, MAE, R²) for `y_pred` on the test targets."""
    mse = mean_squared_error(y_test_reg, y_pred)
    return {
        'Model': model_name,
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'MAE': mean_absolute_error(y_test_reg, y_pred),
        'R²': r2_score(y_test_reg, y_pred),
    }


# One row of metrics per model, in the same order as `models`
metrics = [_score_predictions(name, pred) for name, pred in zip(models, predictions)]

# Tabulate the metrics with the model name as the index
metrics_df = pd.DataFrame(metrics).set_index('Model')
print("Model Comparison:")
metrics_df

In [None]:
# Side-by-side bar charts comparing the models: RMSE on the left, R² on the right
plt.figure(figsize=(14, 6))

for panel, metric in enumerate(['RMSE', 'R²'], start=1):
    plt.subplot(1, 2, panel)
    sns.barplot(x=metrics_df.index, y=metric, data=metrics_df)
    plt.title(f'{metric} by Model')
    plt.ylabel(metric)
    plt.grid(axis='y')

plt.tight_layout()
plt.savefig('../results/plots/ml/regression_model_comparison.png')
plt.show()

## 6. Analyze Best Model

In [None]:
# Look-up tables from model name to fitted pipeline and to its test predictions
fitted_models = {'Ridge': best_ridge, 'Lasso': best_lasso, 'Random Forest': best_rf}
test_predictions = {'Ridge': ridge_pred, 'Lasso': lasso_pred, 'Random Forest': rf_pred}

# The winner is the model with the highest R² in the comparison table.
# NOTE(review): that table is computed on the test split, so the test metrics
# reported for the selected model are optimistic. Consider selecting on the
# cross-validation scores instead.
best_model_name = metrics_df['R²'].idxmax()
best_model = fitted_models[best_model_name]
best_pred = test_predictions[best_model_name]

print(f"Best model: {best_model_name}")

# Where the evaluation helper is told to write its figure
output_path = '../results/plots/ml/regression_evaluation.png'

# Detailed evaluation of the winner.
# Presumably evaluate_regression_model returns a mapping of metric name to a
# numeric value (it is iterated with .items() and formatted as a float below);
# verify against ml_utils.
print(f"\nDetailed evaluation of {best_model_name} model:")
model_metrics = evaluate_regression_model(y_test_reg, best_pred, output_path)

for metric, value in model_metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Actual vs predicted profit for the selected model
profit_range = [y_test_reg.min(), y_test_reg.max()]

plt.figure(figsize=(10, 8))
plt.scatter(y_test_reg, best_pred, alpha=0.5)
# Dashed red diagonal marks perfect predictions (predicted == actual)
plt.plot(profit_range, profit_range, 'r--')
plt.title(f'{best_model_name}: Actual vs Predicted Profit')
plt.xlabel('Actual Profit')
plt.ylabel('Predicted Profit')
plt.grid(True)
plt.tight_layout()
plt.savefig('../results/plots/ml/actual_vs_predicted_profit.png')
plt.show()

In [None]:
# Residual plot for the selected model, written to the ML plots folder
residuals_plot_path = '../results/plots/ml/profit_residuals.png'
plot_residuals(y_test_reg, best_pred, residuals_plot_path)

## 7. Feature Importance Analysis

In [None]:
# Recover the feature names seen by the regressor. The ColumnTransformer
# emits the numeric columns first, followed by the one-hot encoded columns,
# so the names are assembled in that order.
preprocessor = best_model.named_steps['preprocessor']
ohe = preprocessor.named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(categorical_features_reg)
feature_names_reg = numeric_features_reg + list(cat_feature_names)

if best_model_name == 'Random Forest':
    # Tree ensemble: plot the importances via the project helper
    rf_reg = best_model.named_steps['regressor']
    feature_importance_df = plot_feature_importance(
        rf_reg, feature_names_reg, '../results/plots/ml/regression_feature_importance.png'
    )

    # Display top 10 most important features. A bare expression inside an
    # `if` block is not rendered by Jupyter, so the table is printed explicitly.
    print("Top 10 most important features:")
    print(feature_importance_df.head(10))
elif best_model_name in ('Ridge', 'Lasso'):
    # Linear model: inspect the fitted coefficients instead
    regressor = best_model.named_steps['regressor']
    coefficients = regressor.coef_

    # Sort by absolute value so the largest effects of either sign come first
    coef_df = pd.DataFrame({
        'Feature': feature_names_reg,
        'Coefficient': coefficients
    }).sort_values('Coefficient', key=abs, ascending=False)

    # Plot coefficients
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Coefficient', y='Feature', data=coef_df.head(20))
    plt.title(f'{best_model_name} Coefficients (Top 20 by Magnitude)')
    plt.grid(axis='x')
    plt.tight_layout()
    plt.savefig('../results/plots/ml/regression_coefficients.png')
    plt.show()

    # Display top 10 coefficients
    print("Top 10 coefficients by magnitude:")
    print(coef_df.head(10))

## 8. Save Regression Model

In [None]:
# Output locations. The reports folder is not created anywhere else in this
# notebook, so create both folders here; otherwise writing the CSV fails on a
# fresh checkout.
models_dir = Path('../results/models')
reports_dir = Path('../results/reports')
for output_dir in (models_dir, reports_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

# Save the best regression model (the full fitted pipeline)
with open(models_dir / 'regression_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Save the model comparison metrics
metrics_df.to_csv(reports_dir / 'regression_model_metrics.csv')

print("Regression model and metrics saved.")

## 9. Segment-Specific Analysis

In [None]:
# Per-segment error analysis on the test split.
# Attach actual values, predictions and errors to a copy of the test features.
test_data_reg = X_test_reg.assign(
    actual=y_test_reg.values,
    predicted=best_pred,
    error=lambda frame: frame['actual'] - frame['predicted'],
    abs_error=lambda frame: frame['error'].abs(),
)

# Mean absolute error within each segment, smallest first
segment_mae = test_data_reg.groupby('Segment')['abs_error'].mean().sort_values()

# Overall MAE, drawn as a reference line on the chart
overall_mae = test_data_reg['abs_error'].mean()

plt.figure(figsize=(12, 6))
segment_mae.plot(kind='bar')
plt.title('Mean Absolute Error by Segment')
plt.xlabel('Segment')
plt.ylabel('MAE')
plt.axhline(y=overall_mae, color='r', linestyle='--', label=f'Overall MAE: {overall_mae:.2f}')
plt.legend()
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('../results/plots/ml/regression_mae_by_segment.png')
plt.show()

## 10. Regression Insights and Recommendations

The regression models provided insights into the relationships between financial metrics and target variables (e.g., Profit). Key findings include:

- **Feature Importance**: Important predictors of Profit included Sales, COGS, and ROA, reflecting their significant influence on financial outcomes.
- **Model Performance**: The models demonstrated strong predictive power, with metrics like R² scores and RMSE indicating good fit and low error.

**Recommendations Based on Regression:**

- **Financial Forecasting**: Use the regression models to forecast future financial performance based on historical trends and key drivers.
- **Resource Allocation**: Allocate resources strategically by prioritizing investments in factors that drive profitability, as identified by the regression analysis.
- **Scenario Analysis**: Conduct scenario analysis to understand how changes in key predictors (e.g., Sales, COGS) impact profitability.
- **Continuous Monitoring**: Continuously monitor key financial metrics to ensure early detection of deviations from expected performance.

## 11. Conclusion

### Résumé des Principaux Résultats

- **Analyse Exploratoire des Données (EDA)** : Nous avons identifié les principales caractéristiques financières et leurs distributions, révélant des insights sur les performances sectorielles et les corrélations entre variables.
- **Clustering** : Le clustering a segmenté les entreprises en groupes distincts selon leur performance financière, mettant en évidence des clusters à haute rentabilité et à faible rentabilité.
- **Classification** : Un modèle de classification robuste a été développé pour prédire les revenus élevés, en se basant sur des indicateurs financiers critiques tels que le chiffre d'affaires, les coûts et le ROA (Return on Assets).
- **Régression** : Les modèles de régression ont permis de comprendre les relations entre les métriques financières et les variables cibles, fournissant des insights actionnables pour la prévision et l'optimisation.

### Recommandations

- **Prise de Décisions Stratégiques** : Utiliser les insights issus du clustering et de la classification pour informer les décisions stratégiques concernant l'allocation des ressources et la gestion des risques.
- **Amélioration Continue** : Mettre régulièrement à jour les modèles avec de nouvelles données pour maintenir leur précision et leur pertinence.
- **Analyses Avancées** : Explorer des techniques avancées telles que l'analyse des séries temporelles et l'apprentissage profond pour améliorer les capacités prédictives.
- **Qualité des Données** : Améliorer la qualité des données en traitant les valeurs manquantes, les valeurs aberrantes et les incohérences pour affiner les performances des modèles.

### Travaux Futurs

- **Analyse Temporelle** : Intégrer une analyse des séries temporelles pour comprendre les tendances et la saisonnalité dans les performances financières.
- **Intégration de Données Externes** : Incorporer des indicateurs économiques externes (par exemple, croissance du PIB, taux d'intérêt) pour enrichir l'ensemble de données et améliorer la précision des modèles.
- **Rapports Automatisés** : Développer des outils de rapports automatisés pour visualiser les principaux insights et faciliter la prise de décision.
- **Surveillance en Temps Réel** : Implémenter des systèmes de surveillance en temps réel pour suivre les métriques financières clés et détecter rapidement les anomalies.