# Opportunity Win Model

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve)
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

### 1. Load Data

In [7]:
print("STEP 1: Loading Data...")

# For local file
df = pd.read_csv("../data/processed/cleaned_data.csv")

print(f"Dataset loaded: {df.shape}")
print(f"\nFirst 3 rows:")
print(df.head(3))
print("\nColumn names:")
print(df.columns.tolist())

STEP 1: Loading Data...
Dataset loaded: (8800, 70)

First 3 rows:
   sales_agent  account engage_date  close_date  close_value  sales_price  \
0           20        8  2016-10-20  2017-03-01       1054.0         1096   
1            6       38  2016-10-25  2017-03-11       4514.0         4821   
2            6        8  2016-10-25  2017-03-07         50.0           55   

   year_established  revenue  employees  office_location  has_close_date  \
0            2001.0   718.62     2448.0               14               1   
1            2002.0  3178.24     4540.0               14               1   
2            2001.0   718.62     2448.0               14               1   

   close_value_log  revenue_log  employees_log  product_GTK 500  \
0         6.961296     6.578723       7.803435                0   
1         8.415160     8.064397       8.420903                0   
2         3.931826     6.578723       7.803435                0   

   product_GTX BASIC  product_GTX PLUS BASIC  produ

### 2. Data Exploration

In [9]:
print("\nData types:")
print(df.dtypes)

print("\nMissing values:")
missing = df.isnull().sum()
print(missing[missing > 0])

# Check if we have a target variable
if 'won_deal' in df.columns:
    print("\n✅ Target variable 'won_deal' found")
    print(f"Win rate: {df['won_deal'].mean():.2%}")
elif 'deal_stage' in df.columns:
    print("\n✅ Creating target from 'deal_stage'")
    df['won_deal'] = (df['deal_stage'] == 'WON').astype(int)
    print(f"Win rate: {df['won_deal'].mean():.2%}")


Data types:
sales_agent             int64
account                 int64
engage_date            object
close_date             object
close_value           float64
                       ...   
account_age           float64
rev_per_employee      float64
agent_closed_deals      int64
won_deal                int64
account_win_rate      float64
Length: 70, dtype: object

Missing values:
engage_date          500
close_date          2089
close_value         2089
close_value_log     2089
engage_year          500
engage_month         500
engage_dayofweek     500
days_to_close       2089
dtype: int64

✅ Target variable 'won_deal' found
Win rate: 76.26%


### 3. Remove Leakage

In [11]:
# CRITICAL: These features cause data leakage
leakage_features = [
    # Direct leakage - only known after outcome
    'close_date',           # Only exists if deal closed
    'close_value',          # Only known after close
    'has_close_date',       # Reveals if deal closed
    'close_value_log',      # Derived from close_value
    'days_to_close',        # Can only calculate after close
    'closed_within_30d',    # Only known after close
    
    # Target leakage - these ARE the answer
    'deal_stage_WON',       # This is what we're predicting!
    'deal_stage_LOST',      # This is what we're predicting!
    'deal_stage_ENGAGING',  # Current stage reveals outcome
    'deal_stage_PROSPECTING',
    
    # Aggregation leakage - includes future information
    'agent_closed_deals',   # May include current deal
    'account_win_rate',     # CRITICAL: Includes future deals and depenpency on if a deal is closed
    
    # Identifiers (not useful for prediction)
    'opportunity_id',
    'deal_stage',
    'account',              # Too many unique values, use features derived from it
    'sales_agent',          # Will encode this properly
]

# Remove leakage columns that exist
leakage_removed = [col for col in leakage_features if col in df.columns]
df_clean = df.drop(columns=leakage_removed, errors='ignore')

print(f"❌ Removed {len(leakage_removed)} leakage features:")
for col in leakage_removed:
    print(f"   - {col}")

print(f"\n✅ Clean dataset shape: {df_clean.shape}")

❌ Removed 14 leakage features:
   - close_date
   - close_value
   - has_close_date
   - close_value_log
   - days_to_close
   - closed_within_30d
   - deal_stage_WON
   - deal_stage_LOST
   - deal_stage_ENGAGING
   - deal_stage_PROSPECTING
   - agent_closed_deals
   - account_win_rate
   - account
   - sales_agent

✅ Clean dataset shape: (8800, 56)


### 4. Feature Engineering

In [12]:
# Convert dates
if 'engage_date' in df_clean.columns:
    df_clean['engage_date'] = pd.to_datetime(df_clean['engage_date'], errors='coerce')
    
    # Extract date features
    df_clean['engage_year'] = df_clean['engage_date'].dt.year
    df_clean['engage_month'] = df_clean['engage_date'].dt.month
    df_clean['engage_day_of_week'] = df_clean['engage_date'].dt.dayofweek
    df_clean['engage_quarter'] = df_clean['engage_date'].dt.quarter
    df_clean['is_month_end'] = (df_clean['engage_date'].dt.day > 25).astype(int)
    df_clean['is_quarter_end'] = (df_clean['engage_month'] % 3 == 0).astype(int)
    
    print("✅ Created temporal features from engage_date")

# Company age
if 'year_established' in df_clean.columns:
    df_clean['company_age'] = 2017 - df_clean['year_established']
    print("✅ Created company_age feature")

# Revenue per employee (efficiency metric)
if 'revenue' in df_clean.columns and 'employees' in df_clean.columns:
    df_clean['revenue_per_employee'] = df_clean['revenue'] / (df_clean['employees'] + 1)
    print("✅ Created revenue_per_employee feature")

# Price to revenue ratio (deal size relative to company)
if 'sales_price' in df_clean.columns and 'revenue' in df_clean.columns:
    df_clean['price_to_revenue_ratio'] = df_clean['sales_price'] / (df_clean['revenue'] + 1)
    print("✅ Created price_to_revenue_ratio feature")

# Log transforms for skewed features
if 'revenue' in df_clean.columns:
    df_clean['revenue_log'] = np.log1p(df_clean['revenue'])
    
if 'employees' in df_clean.columns:
    df_clean['employees_log'] = np.log1p(df_clean['employees'])
    
if 'sales_price' in df_clean.columns:
    df_clean['sales_price_log'] = np.log1p(df_clean['sales_price'])
    
print("✅ Created log-transformed features")

✅ Created temporal features from engage_date
✅ Created company_age feature
✅ Created revenue_per_employee feature
✅ Created price_to_revenue_ratio feature
✅ Created log-transformed features


### 5. Feature Prep and Target

In [14]:
# Define target
target = 'won_deal'
y = df_clean[target].astype(int)

# Remove non-feature columns
cols_to_drop = [
    target,
    'engage_date',  # Already extracted features
    'office_location',  # Keep if you want, but high cardinality
]

X = df_clean.drop(columns=[col for col in cols_to_drop if col in df_clean.columns])

# Select only numeric features (encoded categorical should already be numeric)
numeric_features = X.select_dtypes(include=['number']).columns.tolist()
X = X[numeric_features]

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nUsing {len(numeric_features)} features:")
for i, col in enumerate(numeric_features[:20], 1):  # Show first 20
    print(f"   {i}. {col}")
if len(numeric_features) > 20:
    print(f"   ... and {len(numeric_features) - 20} more")

Feature matrix shape: (8800, 61)
Target shape: (8800,)

Using 61 features:
   1. sales_price
   2. year_established
   3. revenue
   4. employees
   5. revenue_log
   6. employees_log
   7. product_GTK 500
   8. product_GTX BASIC
   9. product_GTX PLUS BASIC
   10. product_GTX PLUS PRO
   11. product_GTX PRO
   12. product_MG ADVANCED
   13. product_MG SPECIAL
   14. manager_CARA LOSCH
   15. manager_CELIA ROUCHE
   16. manager_DUSTIN BRINKMANN
   17. manager_MELVIN MARXEN
   18. manager_ROCCO NEUBERT
   19. manager_SUMMER SEWALD
   20. regional_office_CENTRAL
   ... and 41 more


### 6. Missing Values

In [15]:
print(f"\nMissing values before imputation:")
missing_before = X.isnull().sum()
print(missing_before[missing_before > 0])

# Use median imputation for numeric features
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

print(f"\nImputed missing values using median strategy")
print(f"Missing values after: {X.isnull().sum().sum()}")


Missing values before imputation:
engage_year           500
engage_month          500
engage_dayofweek      500
engage_day_of_week    500
engage_quarter        500
dtype: int64

Imputed missing values using median strategy
Missing values after: 0


### 7. Train Test Split

In [18]:
# IMPORTANT: Split BEFORE any scaling to prevent train-test contamination
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")
print(f"\nTraining win rate: {y_train.mean():.2%}")
print(f"Test win rate: {y_test.mean():.2%}")

Training set: 7040 samples (80.0%)
Test set: 1760 samples (20.0%)

Training win rate: 76.26%
Test win rate: 76.25%


### 8. Feature Scaling

In [20]:
# Scale features - fit only on training data!
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)  # Use training statistics

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

print("Features scaled using StandardScaler")
print("Scaler fit only on training data (no test contamination)")

Features scaled using StandardScaler
Scaler fit only on training data (no test contamination)


### 9. Class Imablance

In [22]:
print(f"\nClass distribution:")
print(y_train.value_counts())
print(f"\nWin rate: {y_train.mean():.2%}")

# Calculate class weights
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    'balanced', 
    classes=np.unique(y_train), 
    y=y_train
)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
print(f"\nClass weights: {class_weight_dict}")


Class distribution:
won_deal
1    5369
0    1671
Name: count, dtype: int64

Win rate: 76.26%

Class weights: {0: np.float64(2.106523040095751), 1: np.float64(0.6556155708698082)}


### 10. Train Models

In [25]:
# Define models with class balancing
models = {
    'Logistic Regression': LogisticRegression(
        max_iter=2000, 
        random_state=42,
        class_weight='balanced',
        C=0.5
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        random_state=42
    ),
    'XGBoost': XGBClassifier(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
        random_state=42,
        eval_metric='logloss'
    )
}

results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train
    model.fit(X_train_scaled, y_train)
    
    # Predict
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    
    results[name] = {
        'model': model,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }
    
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print(f"  ROC AUC:   {roc_auc:.4f}")


Training Logistic Regression...
  Accuracy:  0.8284
  Precision: 0.9241
  Recall:    0.8443
  F1 Score:  0.8824
  ROC AUC:   0.8613

Training Random Forest...
  Accuracy:  0.9449
  Precision: 0.9356
  Recall:    0.9963
  F1 Score:  0.9650
  ROC AUC:   0.9664

Training Gradient Boosting...
  Accuracy:  0.9585
  Precision: 0.9510
  Recall:    0.9970
  F1 Score:  0.9734
  ROC AUC:   0.9714

Training XGBoost...
  Accuracy:  0.9466
  Precision: 0.9535
  Recall:    0.9776
  F1 Score:  0.9654
  ROC AUC:   0.9625


### 11. Model Comparison and Best Model

In [28]:
comparison_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': [results[m]['accuracy'] for m in results.keys()],
    'Precision': [results[m]['precision'] for m in results.keys()],
    'Recall': [results[m]['recall'] for m in results.keys()],
    'F1 Score': [results[m]['f1'] for m in results.keys()],
    'ROC AUC': [results[m]['roc_auc'] for m in results.keys()]
})

print("\n" + comparison_df.to_string(index=False))

# Find best model by F1 score (balance of precision and recall)
best_model_name = comparison_df.loc[comparison_df['F1 Score'].idxmax(), 'Model']
best_model = results[best_model_name]['model']

print(f"\nBest model: {best_model_name}")
print(f"   F1 Score: {results[best_model_name]['f1']:.4f}")
print(f"   ROC AUC: {results[best_model_name]['roc_auc']:.4f}")

print("\n" + "="*70)
print(f"Detailed Evaluation - {best_model_name}")
print("="*70)

y_pred_best = results[best_model_name]['y_pred']
y_pred_proba_best = results[best_model_name]['y_pred_proba']

# Classification Report
print("\n Classification Report:")
print(classification_report(y_test, y_pred_best, 
                          target_names=['Lost', 'Won'],
                          digits=3))

# Confusion Matrix
print("\n Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred_best)
print(cm)
print("\nInterpretation:")
print(f"  True Negatives (Correctly predicted Lost):  {cm[0,0]}")
print(f"  False Positives (Predicted Won, Actually Lost): {cm[0,1]}")
print(f"  False Negatives (Predicted Lost, Actually Won): {cm[1,0]}")
print(f"  True Positives (Correctly predicted Won):   {cm[1,1]}")

# Calculate metrics
total_won = cm[1,0] + cm[1,1]
total_lost = cm[0,0] + cm[0,1]
print(f"  Total actual wins: {total_won}")
print(f"  Wins we caught: {cm[1,1]} ({cm[1,1]/total_won*100:.1f}%)")
print(f"  Wins we missed: {cm[1,0]} ({cm[1,0]/total_won*100:.1f}% - opportunity cost)")
print(f"  False alarms: {cm[0,1]} ({cm[0,1]/total_lost*100:.1f}% of losses)")


              Model  Accuracy  Precision   Recall  F1 Score  ROC AUC
Logistic Regression  0.828409   0.924144 0.844262  0.882399 0.861299
      Random Forest  0.944886   0.935619 0.996274  0.964995 0.966416
  Gradient Boosting  0.958523   0.950959 0.997019  0.973445 0.971439
            XGBoost  0.946591   0.953488 0.977645  0.965416 0.962549

Best model: Gradient Boosting
   F1 Score: 0.9734
   ROC AUC: 0.9714

Detailed Evaluation - Gradient Boosting

 Classification Report:
              precision    recall  f1-score   support

        Lost      0.989     0.835     0.905       418
         Won      0.951     0.997     0.973      1342

    accuracy                          0.959      1760
   macro avg      0.970     0.916     0.939      1760
weighted avg      0.960     0.959     0.957      1760


 Confusion Matrix:
[[ 349   69]
 [   4 1338]]

Interpretation:
  True Negatives (Correctly predicted Lost):  349
  False Positives (Predicted Won, Actually Lost): 69
  False Negatives (Predi

### 12. Cross Validation

In [29]:
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train_scaled, y_train, 
                                cv=5, scoring='f1', n_jobs=-1)
    print(f"\n{name}:")
    print(f"  Mean F1: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"  Individual folds: {[f'{s:.3f}' for s in cv_scores]}")


Logistic Regression:
  Mean F1: 0.8827 (+/- 0.0167)
  Individual folds: ['0.879', '0.873', '0.878', '0.896', '0.888']

Random Forest:
  Mean F1: 0.9621 (+/- 0.0048)
  Individual folds: ['0.958', '0.961', '0.964', '0.965', '0.961']

Gradient Boosting:
  Mean F1: 0.9666 (+/- 0.0027)
  Individual folds: ['0.965', '0.967', '0.967', '0.969', '0.966']

XGBoost:
  Mean F1: 0.9550 (+/- 0.0062)
  Individual folds: ['0.952', '0.956', '0.958', '0.950', '0.958']


### 13. Save Model

In [30]:
import pickle

# Save best model
model_filename = f'best_model_{best_model_name.replace(" ", "_").lower()}.pkl'
with open(model_filename, 'wb') as f:
    pickle.dump(best_model, f)
print(f"Best model saved as: {model_filename}")

Best model saved as: best_model_gradient_boosting.pkl
