# Classification: Predict High Net Income

This notebook builds a classification model to predict whether a company will have high net income based on various financial and categorical features.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import pickle

# Machine learning libraries
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Import custom modules
import sys
import os
sys.path.append(os.path.abspath('../src'))
from ml_utils import plot_feature_importance, evaluate_classification_model

# Set plotting style
sns.set(style='whitegrid')
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and newer releases no longer accept the old names, so use whichever name
# this installation actually provides instead of hard-coding one of them.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Ignore warnings (note: this silences every warning category for the session)
warnings.filterwarnings('ignore')

# Create every directory this notebook writes to if it doesn't exist yet:
# pickled models, ML plots, and the CSV reports.
for output_dir in ('../results/models', '../results/plots/ml', '../results/reports'):
    Path(output_dir).mkdir(parents=True, exist_ok=True)

## 1. Load and Prepare Data

In [None]:
# Prefer the dataset that carries cluster labels (written by the clustering
# notebook); fall back to the plain cleaned dataset when that file is missing.
try:
    df = pd.read_csv('../data/processed/clustered_data.csv')
except FileNotFoundError:
    df = pd.read_csv('../data/processed/cleaned_data.csv')
    print("Loaded cleaned data without cluster labels.")
else:
    print("Loaded data with cluster labels.")

# Quick look at what was loaded: dimensions, then the first rows
print(f"Dataset shape: {df.shape}")
df.head()

## 2. Create Target Variable

In [None]:
# Binary target: rows whose Profit is at or above the 80th percentile are
# labelled 1 ("High"), all others 0.
profit_threshold = df['Profit'].quantile(0.8)
is_high_profit = df['Profit'].ge(profit_threshold)
df['high_net_income'] = is_high_profit.astype(int)

print(f"Profit threshold for high net income: {profit_threshold:.2f}")
print("Target variable distribution:")
# Class shares expressed as percentages rather than fractions
print(df['high_net_income'].value_counts(normalize=True).mul(100))

In [None]:
# Bar chart of the number of rows in each target class, saved to the ML plots folder
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='high_net_income', data=df, ax=ax)
ax.set_title('Distribution of High Net Income Target Variable')
ax.set_xlabel('High Net Income (1 = Yes, 0 = No)')
ax.set_ylabel('Count')
ax.grid(axis='y')
fig.savefig('../results/plots/ml/high_net_income_distribution.png')
plt.show()

## 3. Feature Selection for Classification

In [None]:
# Model inputs, grouped by how they are preprocessed later on
numeric_features_clf = ['Revenue', 'Cost', 'ROA', 'Profit_Margin']
categorical_features_clf = ['Segment', 'Country']

# Treat the cluster label as an extra categorical input when the column is present
if 'Cluster' in df.columns:
    categorical_features_clf = categorical_features_clf + ['Cluster']

# Name of the column the classifier should predict
target = 'high_net_income'

# Feature matrix (numeric columns first, then categorical) and target vector
feature_columns = [*numeric_features_clf, *categorical_features_clf]
X_clf = df[feature_columns]
y_clf = df[target]

# 80/20 train/test split, stratified on the target so both parts keep the
# same class balance; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_clf,
    y_clf,
    test_size=0.2,
    random_state=42,
    stratify=y_clf,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

## 4. Explore Feature Relationships with Target

In [None]:
# One box plot per numeric feature, split by target class, laid out on a 2x2 grid
fig = plt.figure(figsize=(14, 10))

for position, feature in enumerate(numeric_features_clf, start=1):
    ax = fig.add_subplot(2, 2, position)
    sns.boxplot(x='high_net_income', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} by High Net Income')
    ax.set_xlabel('High Net Income (1 = Yes, 0 = No)')
    ax.grid(axis='y')

fig.tight_layout()
fig.savefig('../results/plots/ml/numeric_features_by_target.png')
plt.show()

In [None]:
# Explore relationship between Segment and target:
# share of low/high net income rows within each Segment (normalize='index'
# makes every row sum to 1, scaled here to percentages).
segment_target = pd.crosstab(df['Segment'], df['high_net_income'], normalize='index') * 100
segment_target.columns = ['Low Net Income', 'High Net Income']

# DataFrame.plot opens a brand-new figure unless it is handed an Axes, which
# would leave an empty 12x8 figure behind and draw the chart at the default
# size. Creating the Axes first and passing it in keeps everything on one figure.
fig, ax = plt.subplots(figsize=(12, 8))
segment_target.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Segment Distribution by High Net Income')
ax.set_xlabel('Segment')
ax.set_ylabel('Percentage (%)')
ax.grid(axis='y')
fig.tight_layout()
fig.savefig('../results/plots/ml/segment_by_target.png')
plt.show()

## 5. Build Classification Pipeline

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted).
numeric_step = ('num', StandardScaler(), numeric_features_clf)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features_clf)
preprocessor_clf = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Full model: preprocessing followed by a seeded Random Forest
clf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor_clf),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Candidate hyperparameters; the 'classifier__' prefix routes each value to
# the Random Forest step of the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
# and run on all available cores.
grid_search = GridSearchCV(
    estimator=clf_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pipeline refitted on the whole training set with the winning parameters
best_clf = grid_search.best_estimator_

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

## 6. Evaluate Classification Model

In [None]:
# Predict the held-out test rows with the tuned pipeline
y_pred = best_clf.predict(X_test)

# Per-class precision / recall / F1 summary
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")

# Project helper from ml_utils; per the notebook's own note it plots the
# confusion matrix (presumably saving it to the given path; confirm in ml_utils).
evaluate_classification_model(
    y_test,
    y_pred,
    '../results/plots/ml/classification_confusion_matrix.png',
)

In [None]:
from sklearn.metrics import roc_curve, auc

# Predicted probability of the positive class (column 1 of predict_proba)
y_proba = best_clf.predict_proba(X_test)[:, 1]

# ROC points across all decision thresholds, and the area under that curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve against the diagonal chance line and save it
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid(True)
fig.savefig('../results/plots/ml/classification_roc_curve.png')
plt.show()

## 7. Feature Importance Analysis

In [None]:
# Pull the two fitted steps out of the tuned pipeline
preprocessor = best_clf.named_steps['preprocessor']
rf_clf = best_clf.named_steps['classifier']

# Rebuild the column names the forest was trained on: the numeric columns
# come first, followed by the expanded one-hot columns, matching the order of
# the 'num' and 'cat' transformers in the preprocessor.
ohe = preprocessor.named_transformers_['cat']
cat_feature_names = ohe.get_feature_names_out(categorical_features_clf)
feature_names_clf = [*numeric_features_clf, *cat_feature_names]

# Project helper from ml_utils; the value it returns is used below as a
# DataFrame of feature importances.
feature_importance_df = plot_feature_importance(
    rf_clf,
    feature_names_clf,
    '../results/plots/ml/classification_feature_importance.png',
)

# Show the first ten rows of the importance table
print("Top 10 most important features:")
feature_importance_df.head(10)

## 8. Save Classification Model

In [None]:
# Save the best classification model (the whole fitted pipeline, so the
# preprocessing travels with the classifier).
# NOTE: pickle files execute code on load; only unpickle this from a trusted source.
model_path = Path('../results/models/classification_model.pkl')
# Make sure the target folder exists even if the setup cell was skipped
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(best_clf, f)

# Save feature importance data.
# The reports folder is not created anywhere else in this notebook, so create
# it here; otherwise to_csv fails because the directory does not exist.
report_path = Path('../results/reports/classification_feature_importance.csv')
report_path.parent.mkdir(parents=True, exist_ok=True)
feature_importance_df.to_csv(report_path, index=False)

print("Classification model and feature importance saved.")

## 9. Segment-Specific Analysis

In [None]:
# Per-segment model performance on the test set.
# Work on a copy of the test features with the true label, the prediction,
# and a 0/1 flag marking whether the two agree.
test_data = X_test.assign(actual=y_test.values, predicted=y_pred)
test_data['correct'] = test_data['actual'].eq(test_data['predicted']).astype(int)

# The mean of the 0/1 flag is the accuracy: overall, and within each Segment
overall_accuracy = test_data['correct'].mean()
segment_accuracy = (
    test_data.groupby('Segment')['correct']
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of segment accuracies with the overall accuracy as a reference line
fig, ax = plt.subplots(figsize=(12, 6))
segment_accuracy.plot(kind='bar', ax=ax)
ax.set_title('Classification Accuracy by Segment')
ax.set_xlabel('Segment')
ax.set_ylabel('Accuracy')
ax.axhline(y=overall_accuracy, color='r', linestyle='--', label=f'Overall Accuracy: {overall_accuracy:.4f}')
ax.legend()
ax.grid(axis='y')
fig.tight_layout()
fig.savefig('../results/plots/ml/classification_accuracy_by_segment.png')
plt.show()

## 10. Classification Insights and Recommendations

### Classification Insights:

1. Our Random Forest classifier can predict high net income with good accuracy, providing a valuable tool for financial forecasting.
2. The most important features for predicting high net income are Revenue, Cost, and ROA, highlighting the critical financial indicators.
3. Certain segments and countries have a significantly higher probability of achieving high net income.
4. The model identifies patterns that could be used to develop strategies for improving financial performance across different business units.
5. The ROC curve shows strong discriminative power, with the AUC (the `roc_auc` value computed in Section 6 and shown in the ROC plot legend) indicating the model's ability to distinguish between high and low net income cases.

### Recommendations based on Classification:

1. **Predictive Financial Planning**: Implement the classification model in financial planning processes to identify entities likely to achieve high net income.
2. **Risk Management**: Use the model to identify entities at risk of falling below profit expectations and take preemptive action.
3. **Investment Prioritization**: Direct investment toward entities that the model predicts have high potential for reaching top-tier profitability.
4. **Focus on Key Drivers**: Develop strategies that focus on improving the top predictive features identified by the model.
5. **Segment-Specific Strategies**: Tailor approaches based on segment-specific performance patterns identified in the model.