# Modeling

In [None]:
# =========================================
# 1) IMPORT ALL REQUIRED LIBRARIES
# =========================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# scikit-learn / imblearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# xgboost, ensure xgboost is installed
from xgboost import XGBClassifier


# Notebook-wide settings: hold-out fraction, shared RNG seed, and the
# location of the input CSV file.
test_size = 0.3
random_state = 42
url = "/Users/Roby/MLFieldWork/DF_Model.csv"

# =========================================
# 2) INTRODUCTION & MOTIVATION
# =========================================

GOAL:
 - We have transaction-level data (DF_Model.csv) describing clients.
 - Our objective: Identify which clients have the highest propensity to be 
   reactivated after 2 years without purchases.

PLAN:
 1) Read the CSV.
 2) Explore data & note shape, columns, types.
 3) Convert from transaction-level to client-level (one row per client).
 4) Split into train/test sets.
 5) Build multiple models: Logistic, Random Forest, XGBoost.
    - We'll tune each model specifically for precision or recall via GridSearch.
 6) Compare results on test set for precision, recall, and F1 via bar charts.
 7) Discuss which approach is 'best' based on business needs.


In [None]:
# =========================================
# 3) DATA LOADING
# =========================================
# Read the CSV found at `url` into a DataFrame, then preview its first rows
# (as the last expression of the cell, the preview is rendered by the notebook).
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

In [None]:
# =========================================
# 4) AGGREGATING TO CLIENT-LEVEL
# =========================================
# Collapse the transaction table to one row per 'client_id' (assumed to be
# the unique client key). Descriptive attributes keep the first value seen
# for the client, numeric measures are averaged over the client's rows, and
# the label is the per-client maximum (1 as soon as any row has target=1,
# assuming a 0/1 label).

df_agg = (
    df.groupby('client_id', as_index=False)
    .agg(
        region=('region', 'first'),
        trade_sector=('trade_sector', 'first'),
        n_employees=('n_employees', 'mean'),
        economic_pot=('economic_pot', 'mean'),
        eco_pot_class=('eco_pot_class', 'first'),
        risk_cat=('risk_cat', 'first'),
        net=('net', 'mean'),
        target=('target', 'max'),
    )
)


print("Aggregated Data Shape:", df_agg.shape)
print("Sample Aggregated Rows:")
display(df_agg.head())


In [None]:
# =========================================
# 5) TRAIN/TEST SPLIT & BASIC PREPROCESSING
# =========================================

# Label vector, and a feature frame holding every aggregated column except
# the identifier and the label itself.
y = df_agg['target']
X = df_agg.drop(columns=['client_id', 'target'])

# Hold-out split, stratified on y so both partitions keep its class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=random_state,
    test_size=test_size,
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Column groups routed to each preprocessing branch (adjust as needed).
# 'trade_sector' is present in X but listed in neither group, so the
# ColumnTransformer below (default remainder='drop') does not forward it.
numeric_cols = ['n_employees', 'economic_pot', 'net']
cat_cols = ['region', 'eco_pot_class', 'risk_cat']

# Numeric branch: mean-impute missing values, then standardize.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])
# Categorical branch: fill gaps with the literal "missing", then one-hot
# encode; categories unseen at fit time are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, cat_cols),
])

print("Preprocessing set up. We can now feed it into a pipeline.")


In [None]:
# =========================================
# 6) CLASS IMBALANCE & SCORING FOR PRECISION vs. RECALL
# =========================================
# Every model is grid-searched twice: once selecting on precision and once
# selecting on recall. Imbalance is addressed through the estimators' own
# weighting parameters (class_weight / scale_pos_weight) rather than by
# resampling, so the data distribution itself is left untouched.

# Both scorers treat label 1 as the positive class.
precision_scorer, recall_scorer = (
    make_scorer(metric_fn, pos_label=1)
    for metric_fn in (precision_score, recall_score)
)

# Accumulates one {"model", "precision", "recall", "f1"} dict per evaluated model.
results = []

In [None]:
# =========================================
# 7) MODEL 1: LOGISTIC REGRESSION
# =========================================
# Two grid searches share one pipeline and one parameter grid; they differ
# only in the metric used to pick the winner (precision vs. recall). Each
# winner is then scored on the held-out test set and recorded in `results`.

pipe_log = Pipeline([
    ("prep", preprocessor),
    # Seed comes from the notebook-level `random_state` (also used for the
    # train/test split) instead of a separately hard-coded 42, so changing
    # that one setting reseeds the whole experiment consistently.
    ("clf", LogisticRegression(solver='lbfgs', max_iter=1000, random_state=random_state))
])

param_log = {
    "clf__C": [0.1, 1.0, 10],
    "clf__class_weight": [None, "balanced"]  # Could also try dict like {0:1,1:5} etc.
}

# --- (A) Optimize for PRECISION ---
grid_log_prec = GridSearchCV(
    estimator=pipe_log,
    param_grid=param_log,
    scoring=precision_scorer,
    cv=3
)
grid_log_prec.fit(X_train, y_train)

# best_estimator_ is the winning configuration as exposed by the fitted search.
best_log_prec = grid_log_prec.best_estimator_
y_pred_log_prec = best_log_prec.predict(X_test)

res_log_prec = {
    "model": "Logistic(Precision)",
    "precision": precision_score(y_test, y_pred_log_prec),
    "recall":    recall_score(y_test, y_pred_log_prec),
    "f1":        f1_score(y_test, y_pred_log_prec)
}
results.append(res_log_prec)

# --- (B) Optimize for RECALL ---
grid_log_rec = GridSearchCV(
    estimator=pipe_log,
    param_grid=param_log,
    scoring=recall_scorer,
    cv=3
)
grid_log_rec.fit(X_train, y_train)

best_log_rec = grid_log_rec.best_estimator_
y_pred_log_rec = best_log_rec.predict(X_test)

res_log_rec = {
    "model": "Logistic(Recall)",
    "precision": precision_score(y_test, y_pred_log_rec),
    "recall":    recall_score(y_test, y_pred_log_rec),
    "f1":        f1_score(y_test, y_pred_log_rec)
}
results.append(res_log_rec)

print("Done training Logistic for precision & recall.")


In [None]:
# =========================================
# 8) MODEL 2: RANDOM FOREST
# =========================================
# Same protocol as the other models: one pipeline, one grid, two searches
# (selecting on precision, then on recall), each winner scored on the test set.

pipe_rf = Pipeline([
    ("prep", preprocessor),
    # Use the notebook-level `random_state` rather than a hard-coded 42 so the
    # forest follows the same seed setting as the train/test split.
    ("clf", RandomForestClassifier(random_state=random_state))
])

param_rf = {
    "clf__n_estimators": [50, 100],
    "clf__max_depth": [3, 5, 10],
    "clf__class_weight": [None, "balanced"]
}

# --- (A) Optimize for PRECISION ---
grid_rf_prec = GridSearchCV(
    pipe_rf,
    param_rf,
    scoring=precision_scorer,
    cv=3
)
grid_rf_prec.fit(X_train, y_train)

best_rf_prec = grid_rf_prec.best_estimator_
y_pred_rf_prec = best_rf_prec.predict(X_test)

res_rf_prec = {
    "model": "RF(Precision)",
    "precision": precision_score(y_test, y_pred_rf_prec),
    "recall":    recall_score(y_test, y_pred_rf_prec),
    "f1":        f1_score(y_test, y_pred_rf_prec)
}
results.append(res_rf_prec)

# --- (B) Optimize for RECALL ---
grid_rf_rec = GridSearchCV(
    pipe_rf,
    param_rf,
    scoring=recall_scorer,
    cv=3
)
grid_rf_rec.fit(X_train, y_train)

best_rf_rec = grid_rf_rec.best_estimator_
y_pred_rf_rec = best_rf_rec.predict(X_test)

res_rf_rec = {
    "model": "RF(Recall)",
    "precision": precision_score(y_test, y_pred_rf_rec),
    "recall":    recall_score(y_test, y_pred_rf_rec),
    "f1":        f1_score(y_test, y_pred_rf_rec)
}
results.append(res_rf_rec)

print("Done training Random Forest for precision & recall.")


In [None]:
# =========================================
# 9) MODEL 3: XGBOOST
# =========================================
# Imbalance is handled through scale_pos_weight, which is part of the grid.
# As with the other models, the same pipeline is searched twice: selecting
# on precision, then on recall; each winner is scored on the test set.

pipe_xgb = Pipeline([
    ("prep", preprocessor),
    # Seed comes from the notebook-level `random_state` instead of a
    # hard-coded 42, keeping it in sync with the train/test split.
    # NOTE(review): `use_label_encoder` is reportedly deprecated/ignored in
    # recent XGBoost releases - confirm against the installed version.
    ("clf", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=random_state))
])

param_xgb = {
    "clf__n_estimators": [50, 100],
    "clf__max_depth": [3, 5],
    "clf__scale_pos_weight": [1, 5, 10]  # pick some approximate ratio or small range
}

# --- (A) Optimize for PRECISION ---
grid_xgb_prec = GridSearchCV(
    pipe_xgb,
    param_xgb,
    scoring=precision_scorer,
    cv=3
)
grid_xgb_prec.fit(X_train, y_train)

best_xgb_prec = grid_xgb_prec.best_estimator_
y_pred_xgb_prec = best_xgb_prec.predict(X_test)

res_xgb_prec = {
    "model": "XGB(Precision)",
    "precision": precision_score(y_test, y_pred_xgb_prec),
    "recall":    recall_score(y_test, y_pred_xgb_prec),
    "f1":        f1_score(y_test, y_pred_xgb_prec)
}
results.append(res_xgb_prec)

# --- (B) Optimize for RECALL ---
grid_xgb_rec = GridSearchCV(
    pipe_xgb,
    param_xgb,
    scoring=recall_scorer,
    cv=3
)
grid_xgb_rec.fit(X_train, y_train)

# Kept under this name because the later threshold-tuning cell reads
# probabilities from `best_xgb_rec`.
best_xgb_rec = grid_xgb_rec.best_estimator_
y_pred_xgb_rec = best_xgb_rec.predict(X_test)

res_xgb_rec = {
    "model": "XGB(Recall)",
    "precision": precision_score(y_test, y_pred_xgb_rec),
    "recall":    recall_score(y_test, y_pred_xgb_rec),
    "f1":        f1_score(y_test, y_pred_xgb_rec)
}
results.append(res_xgb_rec)

print("Done training XGBoost for precision & recall.")


In [None]:
# =========================================
# 10) COLLECT & VIEW RESULTS
# =========================================
# One row per evaluated model, ordered alphabetically by model label and
# re-indexed from 0.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="model")
    .reset_index(drop=True)
)
print("FINAL COMPARISON (Test Set):")
display(results_df)


In [None]:
# =========================================
# 11) PLOT SIMPLE BAR CHARTS
# =========================================
# Pull the label column and the three metric columns out of the results table.
models = results_df['model'].values
precisions = results_df['precision'].values
recalls = results_df['recall'].values
f1s = results_df['f1'].values

# One figure per metric, drawn in the order precision, recall, F1; only the
# title, the y-axis label and the bar heights change between figures.
for chart_title, y_axis_label, bar_heights in (
    ('Precision by Model', 'Precision', precisions),
    ('Recall by Model', 'Recall', recalls),
    ('F1 Score by Model', 'F1', f1s),
):
    plt.figure()
    plt.bar(models, bar_heights)
    plt.title(chart_title)
    plt.xlabel('Model')
    plt.ylabel(y_axis_label)
    plt.xticks(rotation=45, ha='right')  # slanted so the model labels stay readable
    plt.tight_layout()
    plt.show()


# =========================================
# 12) EXPLANATION & CONCLUSION
# =========================================
 
INTERPRETING THE RESULTS:
 - We have 6 total models: 
    1) Logistic(Precision),  2) Logistic(Recall),
    3) RF(Precision),        4) RF(Recall),
    5) XGB(Precision),       6) XGB(Recall).
 
 - Models tuned for Precision generally yield higher precision (fewer false positives)
   but lower recall (more false negatives).
 - Models tuned for Recall generally yield higher recall but often lower precision.
 
 - Which approach is 'best' depends on business goals. If it's critical not to miss
   potential reactivations, choose a high-recall model. If it's expensive to contact
   false-positives, choose a high-precision model. 
 - F1 or PR-AUC can provide a balanced perspective if you value both equally.
 
 - Random Forest / XGBoost often perform better than Logistic Regression on complex data,
   but Logistic is simpler and can be more interpretable.


### ✅ Evaluation Summary

We trained and evaluated six models — two for each algorithm (Logistic Regression, Random Forest, XGBoost), optimizing one version for **precision**, and the other for **recall**. Here's a breakdown of how each model performed on the **test set**:

| Model               | Precision | Recall   | F1       |
|---------------------|-----------|----------|----------|
| Logistic(Precision) | 0.191     | 0.577    | 0.287    |
| Logistic(Recall)    | 0.189     | 0.578    | 0.285    |
| RF(Precision)       | 0.216     | 0.522    | **0.306**|
| RF(Recall)          | 0.215     | 0.521    | 0.304    |
| XGB(Precision)      | 0.000     | 0.000    | 0.000    |
| XGB(Recall)         | 0.157     | **0.840**| 0.265    |

---

### 🧠 Interpretation & Comments

#### 🔍 General Observations

- The **best overall F1 score** was achieved by **Random Forest (Precision)**: `F1 = 0.306`.
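- **XGBoost (Precision)** completely failed (`precision = recall = f1 = 0.000`): the selected model produced no true positives on the test set, most likely because it predicted (almost) no positives at the default 0.5 threshold. Likely causes:
  - A poor parameter choice by the precision-driven grid search (e.g. an unsuitable `scale_pos_weight`), or features that carry too little signal.
  - An overfitting or underfitting issue during training.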
- **XGBoost (Recall)** got an extremely **high recall** (`0.84`), meaning it successfully captured most positive cases but sacrificed precision.

---

#### 🏆 Best Models by Objective

| Goal                | Best Model        | Why                                           |
|---------------------|-------------------|-----------------------------------------------|
| **High Precision**  | RF(Precision)     | Highest precision `0.216` + best F1 `0.306`   |
| **High Recall**     | XGB(Recall)       | Highest recall `0.84`                         |
| **Balanced (F1)**   | RF(Precision)     | Best balance of precision and recall          |

---

### 🧪 What to Do Next

#### 1. **Fix XGBoost(Precision)**
- XGB’s failure in precision-optimization suggests it may not have learned anything useful.
- Try tuning `scale_pos_weight`, or even test XGBoost with `eval_metric="aucpr"` and early stopping.

#### 2. **Consider Threshold Tuning**
- Especially for **XGB(Recall)**: you can reduce false positives by increasing the decision threshold (default = 0.5).
- This might improve its **precision**, and increase the F1 to match or beat Random Forest.

#### 3. **Feature Engineering**
- All models show precision < 0.22 → false positives are a concern.
- Consider adding:
  - Time-based features (e.g. last purchase gap)
  - Interaction terms (region × sector, risk × economic potential)
  - Frequency/monetary variables per client (RFM features)

#### 4. **Calibration**
- Try using **calibrated probabilities** (e.g. with `CalibratedClassifierCV`) to make better threshold decisions.

---

### 🧠 Final Thoughts

- Random Forest is your **most stable and robust performer**.
- XGBoost can give exceptional **recall**, but requires care (and currently needs fixing for precision).
- Logistic Regression performs reasonably, but struggles with non-linearity — consistent with expectations.

In [None]:
# Wider search space for the precision-selected XGBoost retry: more trees,
# an extra depth option, a learning-rate axis and a finer scale_pos_weight range.
param_xgb_fixed = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[3, 5, 7],
    clf__learning_rate=[0.01, 0.1],
    clf__scale_pos_weight=[1, 3, 5, 10],
)

# Same pipeline and precision scorer as before; this search runs on all
# available cores (n_jobs=-1) and reports progress (verbose=1).
grid_xgb_prec_fixed = GridSearchCV(
    pipe_xgb,
    param_xgb_fixed,
    scoring=precision_scorer,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_xgb_prec_fixed.fit(X_train, y_train)

# Score the refitted winner on the held-out test set.
best_xgb_prec_fixed = grid_xgb_prec_fixed.best_estimator_
y_pred_xgb_prec_fixed = best_xgb_prec_fixed.predict(X_test)

# The trailing '*' in the label distinguishes this retry from the first attempt.
res_xgb_prec_fixed = dict(
    model="XGB(Precision*)",
    precision=precision_score(y_test, y_pred_xgb_prec_fixed),
    recall=recall_score(y_test, y_pred_xgb_prec_fixed),
    f1=f1_score(y_test, y_pred_xgb_prec_fixed),
)
results.append(res_xgb_prec_fixed)


In [None]:
# Positive-class probabilities (column 1 of predict_proba) from the
# recall-selected XGBoost pipeline, computed on the test features.
y_probs = best_xgb_rec.predict_proba(X_test)[:, 1]

# Candidate cut-offs 0.10, 0.15, ..., 0.90 (the 0.91 stop keeps 0.90 in range).
thresholds = np.arange(0.1, 0.91, 0.05)

# For each cut-off, binarize the probabilities and score the resulting labels.
# NOTE(review): these metrics are computed against y_test, so a threshold
# chosen from this sweep has been tuned on the test set.
per_threshold = [
    (
        precision_score(y_test, labels),
        recall_score(y_test, labels),
        f1_score(y_test, labels),
    )
    for labels in ((y_probs >= cutoff).astype(int) for cutoff in thresholds)
]
precision_scores, recall_scores, f1_scores = (
    list(column) for column in zip(*per_threshold)
)

# One line per metric, all on a shared threshold axis.
plt.figure(figsize=(10, 5))
for curve_label, curve_values in (
    ("Precision", precision_scores),
    ("Recall", recall_scores),
    ("F1 Score", f1_scores),
):
    plt.plot(thresholds, curve_values, label=curve_label)
plt.xlabel("Threshold")
plt.ylabel("Score")
plt.title("XGB(Recall) - Threshold Tuning")
plt.legend()
plt.grid(True)
plt.show()


This will show three curves:
- **Precision**: how "clean" the predictions are (fewer false positives)
- **Recall**: how many true positives you captured
- **F1 Score**: the harmonic mean (balance) of precision and recall

---

## 📊 How to Read the Graph

- **X-axis**: classification **threshold** (from 0.1 to 0.9)
- **Y-axis**: score for each metric (0–1)
- You’ll see:
  - As threshold increases:
    - **Precision goes up** (you only take high-confidence clients)
    - **Recall goes down** (you miss many true reactivations)
    - **F1 score** peaks somewhere in the middle

---

### 🔍 Example Interpretation:

| Threshold | Precision | Recall | F1      | Meaning                                             |
|-----------|-----------|--------|---------|-----------------------------------------------------|
| 0.25      | 0.11      | 0.92   | 0.20    | Very aggressive: catches many, but lots of false pos |
| 0.50      | 0.16      | 0.84   | 0.27    | Default value; moderate recall/precision             |
| 0.75      | 0.30      | 0.40   | 0.34    | More cautious; better quality, fewer reactivations   |
| 0.90      | 0.60      | 0.10   | 0.17    | Very conservative; only the most confident clients   |

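You pick a **threshold that fits your strategy**:
- Want more leads? Pick a **low threshold** → high recall.
- Want only high-quality leads? Pick a **high threshold** → high precision.
- Want a balanced trade-off? Pick the **threshold with highest F1**.

> **Caveat:** the curves above are computed on the **test set**. A threshold picked from them has effectively been tuned on test data, so the test metrics reported for it will be optimistic. For an unbiased estimate, choose the threshold on a separate validation split (or on cross-validated training predictions) and only then evaluate on the test set.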

In [None]:
# Cut-off selected from the trade-off plot (0.45 is an illustrative choice).
best_thresh = 0.45
# Label a client positive when its predicted probability reaches the cut-off.
y_pred_adjusted = (y_probs >= best_thresh).astype(int)

# Record the re-thresholded predictions as their own entry in the comparison.
adjusted_result = dict(
    model=f"XGB(Recall_Tuned@{best_thresh})",
    precision=precision_score(y_test, y_pred_adjusted),
    recall=recall_score(y_test, y_pred_adjusted),
    f1=f1_score(y_test, y_pred_adjusted),
)
results.append(adjusted_result)


In [None]:
# Rebuild the comparison table from everything collected so far, best F1 first.
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="f1", ascending=False)

# One bar chart per metric, bars in the table's (F1-descending) order.
for metric in ("precision", "recall", "f1"):
    plt.figure()
    plt.bar(results_df["model"], results_df[metric])
    plt.ylabel(metric.capitalize())
    plt.title(f"{metric.capitalize()} by Model")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()


## 📊 How to Read Each Graph

### 🔷 Precision by Model

- **X-axis**: Each bar represents a model.
- **Y-axis**: How many of the predicted “reactivated” clients were actually reactivated.
- **Higher is always better**, but it should drive your model choice *only* if false positives are expensive for you (e.g. it is costly to contact clients who won’t convert).

✅ Use this graph if your business wants to:
> Focus effort only where you're highly confident the client will reactivate.

---

### 🔷 Recall by Model

- **X-axis**: Each model again.
- **Y-axis**: Out of all true reactivatable clients, how many did the model catch?
- **Higher is better** if **missing opportunities is costly**, even if it means you get some wrong ones too.

✅ Use this graph if your business wants to:
> Maximize how many real reactivations you catch (you can tolerate contacting some wrong clients).

---

### 🔷 F1 Score by Model

- **X-axis**: Models.
- **Y-axis**: F1 score = harmonic mean of precision and recall.
- This is a **balanced metric**, useful when you want a **trade-off** between catching many real cases and not wasting resources.

✅ Use this if:
> You want a well-rounded model that performs “solidly” across both goals.

---

## 🧠 Example Use-Case Summary

| Graph               | Use it when...                                                 | What to look for                          |
|---------------------|----------------------------------------------------------------|--------------------------------------------|
| Precision by Model  | You want **quality > quantity** of predictions                | Look for highest bars on this chart        |
| Recall by Model     | You want to catch **as many reactivatable clients as possible** | Focus on models with highest recall bars   |
| F1 Score by Model   | You want a **balanced decision-making tool**                   | Pick the best overall-performing model     |

In [None]:
# Final comparison table: every recorded result, ordered alphabetically by
# model label and re-indexed from 0.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="model")
    .reset_index(drop=True)
)
print("FINAL COMPARISON (Test Set):")
display(results_df)

### 🔍 Interpretation of New Results

We now have **8 entries** (7 fitted models plus one re-thresholded variant of an existing model), including:

- The original 6 (Logistic, RF, XGB × Precision/Recall)
- Plus:
  - ✅ `XGB(Precision*)`: retrained for precision
  - ✅ `XGB(Recall_Tuned@0.45)`: recall-optimized + threshold tuning at 0.45

---

### 📊 Updated Results Summary

| Model                  | Precision | Recall   | F1       |
|------------------------|-----------|----------|----------|
| **RF(Precision)**      | **0.216** | 0.522    | **0.306**|
| RF(Recall)             | 0.215     | 0.521    | 0.304    |
| Logistic(Precision)    | 0.191     | 0.577    | 0.287    |
| Logistic(Recall)       | 0.189     | 0.578    | 0.285    |
| **XGB(Recall)**        | 0.157     | 0.840    | 0.265    |
| **XGB(Recall_Tuned@0.45)** | 0.151 | **0.889**| 0.257    |
| XGB(Precision)         | 0.000     | 0.000    | 0.000    |
| XGB(Precision*)        | 0.000     | 0.000    | 0.000    |

---

### 🧠 Key Takeaways

#### ✅ 1. **Best All-Around Model**
- **Random Forest (Precision)** still leads overall in **F1 (0.306)** and has the best **precision** (0.216).
- This is a balanced choice if you care about catching good clients *without too many false positives*.

#### 📈 2. **High Recall Performer**
- **XGB(Recall_Tuned@0.45)** achieved **excellent recall** at `0.889`, which means it's catching almost 9 out of 10 real reactivation cases.
- Precision is **low** at `0.151`, so **many false positives**.
- Use this **if missing a reactivatable client is very costly**.

#### ⚠️ 3. **XGB Precision Failure**
- Both `XGB(Precision)` and `XGB(Precision*)` failed completely.
- Likely causes:
  - Poor feature interaction for XGB
  - Model collapse due to misfit or misconfiguration (e.g. bad `scale_pos_weight`, tree depth, or constant labels)
- Consider dropping XGB for precision optimization or debugging with a simplified dataset.

---

### ✅ Business-Driven Decision

| Business Goal                            | Recommended Model           | Why                                               |
|------------------------------------------|-----------------------------|----------------------------------------------------|
| Maximize reactivation recall (catch all) | `XGB(Recall_Tuned@0.45)`    | Best recall (0.889), very few missed opportunities |
| Best balance of precision & recall       | `RF(Precision)`             | Best F1 (0.306), good precision (0.216)            |
| Precision-focused strategy               | `RF(Precision)`             | Best available precision + overall performance     |
| Model to avoid for now                   | ✖ `XGB(Precision*)`         | Scores 0 on every metric under current settings    |

---

### 📌 Recommendations

1. **Use `RF(Precision)`** as the default model for now.
2. Deploy **`XGB(Recall_Tuned@0.45)`** only for high-risk segments, or as part of a second-tier filter.
3. Skip or re-tune `XGB(Precision)` from scratch (check data imbalance, labels, params).
4. Consider combining both approaches in a **two-stage model**:
   - Stage 1: High-recall model for broad filtering
   - Stage 2: High-precision model for final decision