In [1]:
import pandas as pd

# Load the cleaned activity log (one row per activity).
df = pd.read_csv("../data/processed/cleaned_activities.csv")

# Keep only activities that are attached to a real opportunity whose outcome
# is already known (Won or Lost); everything else cannot be used as a label.
has_opportunity = df["opportunity_id"].ne("no_opp")
is_closed = df["opportunity_stage"].isin(["Won", "Lost"])

# Copy so later edits never touch `df`, then order the activities inside each
# opportunity by `activity_order` so downstream sequences are chronological.
df_train = (
    df.loc[has_opportunity & is_closed]
    .copy()
    .sort_values(["opportunity_id", "activity_order"])
)

# Class balance, counted per activity row (not per opportunity).
print(df_train["opportunity_stage"].value_counts())


opportunity_stage
Won     14997
Lost    12579
Name: count, dtype: int64


In [2]:
# Number of leading activities kept per opportunity journey.
N = 6

# Collect the activity `types` of every opportunity into one list ("seq").
# The sequence keeps the row order of `df_train` inside each group.
# NOTE(review): groupby drops rows whose key columns contain NaN by default -
# confirm that account_id / Country / solution are never missing.
# NOTE(review): the key has five columns, so an opportunity_id that appears
# with more than one Country/solution/stage would be split into several rows.
journey_keys = ["opportunity_id", "account_id", "Country", "solution", "opportunity_stage"]
types_per_journey = df_train.groupby(journey_keys)["types"].apply(list)
journeys = types_per_journey.reset_index(name="seq")

def to_fixed(seq, n=None, pad="<PAD>"):
    """Truncate or right-pad a sequence so that it has exactly ``n`` items.

    Parameters
    ----------
    seq : sliceable sequence (list, tuple, ...)
        The activity types of one journey, in order.
    n : int, optional
        Target length. When omitted, the notebook-level ``N`` is read at call
        time, so the result always agrees with the ``step_*`` column names
        that are also built from ``N``.
    pad : object, default "<PAD>"
        Filler appended when ``seq`` has fewer than ``n`` items.

    Returns
    -------
    list
        A new list of length ``n``; ``seq`` itself is not modified.

    Raises
    ------
    ValueError
        If ``n`` is negative (a negative slice bound would silently drop
        items from the end and return a list of the wrong length).
    """
    if n is None:
        n = N  # looked up now rather than frozen when the function was defined
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")
    # list() makes the concatenation below valid for tuples and other sequences.
    head = list(seq[:n])
    return head + [pad] * (n - len(head))

# Bring every journey to exactly N steps (truncate long ones, pad short ones).
fixed = journeys["seq"].apply(to_fixed)

# One column per step position: step_1 ... step_N.
step_names = [f"step_{pos}" for pos in range(1, N + 1)]
X = pd.DataFrame(fixed.tolist(), columns=step_names)

# Add Country and solution as extra categorical features. `.values` is used
# so the assignment is positional and does not depend on index alignment.
for extra_feature in ("Country", "solution"):
    X[extra_feature] = journeys[extra_feature].values

# Binary target: 1 = Won, 0 = Lost.
stage_to_label = {"Lost": 0, "Won": 1}
y = journeys["opportunity_stage"].map(stage_to_label).values

print(X.head())
print("rows:", len(X))


    step_1   step_2   step_3   step_4   step_5   step_6 Country solution
0    Email    <PAD>    <PAD>    <PAD>    <PAD>    <PAD>      US      MRS
1    Email    Email    <PAD>    <PAD>    <PAD>    <PAD>      US      MRS
2  Meeting    <PAD>    <PAD>    <PAD>    <PAD>    <PAD>      US      MRS
3  Meeting    <PAD>    <PAD>    <PAD>    <PAD>    <PAD>      US      MRS
4  Meeting  Meeting  Meeting  Meeting  Meeting  Meeting      US      MRS
rows: 5977


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Every column of X (the step_* columns plus Country and solution) is
# categorical, so all of them go through the one-hot encoder.
cat_cols = list(X.columns)

# Stratified 80/20 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the split is per opportunity row; opportunities of the same
# account can land on both sides - confirm whether a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# handle_unknown="ignore": categories not seen during fit are encoded as
# all-zero columns instead of raising at predict time.
encoder = OneHotEncoder(handle_unknown="ignore")
pre = ColumnTransformer(transformers=[("cat", encoder, cat_cols)])

# Shallow tree with large leaves, so each leaf covers many journeys.
tree_params = dict(
    max_depth=5,
    min_samples_leaf=100,
    min_samples_split=200,
    random_state=42,
)
clf = DecisionTreeClassifier(**tree_params)

pipe = Pipeline(steps=[("pre", pre), ("clf", clf)])
pipe.fit(X_train, y_train)

# Evaluate on the held-out split.
pred = pipe.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred, target_names=["Lost","Won"]))


[[379 228]
 [314 275]]
              precision    recall  f1-score   support

        Lost       0.55      0.62      0.58       607
         Won       0.55      0.47      0.50       589

    accuracy                           0.55      1196
   macro avg       0.55      0.55      0.54      1196
weighted avg       0.55      0.55      0.54      1196



In [4]:
import numpy as np
import pandas as pd

# 1) Encode X_train with the pipeline's already-fitted preprocessor.
Xtr_enc = pipe.named_steps["pre"].transform(X_train)

# 2) Index of the tree leaf reached by every training row.
leaf_id_train = pipe.named_steps["clf"].apply(Xtr_enc)

tmp = pd.DataFrame({
    "leaf_id": leaf_id_train,
    "y": y_train
})

# 3) Per-leaf statistics: support (number of training rows) and win rate
#    (mean of y, where 1 = Won). These are training-set rates.
leaf_summary = (
    tmp.groupby("leaf_id")
       .agg(
           support=("y","size"),
           win_rate=("y","mean")
       )
       .reset_index()
)

leaf_summary["loss_rate"] = 1 - leaf_summary["win_rate"]

if leaf_summary.empty:
    # Without any leaf there is nothing to rank; fail with a clear message
    # instead of an IndexError further down.
    raise ValueError("leaf_summary is empty: no training rows were assigned to a leaf")

# 4) Dynamic support threshold: 2% of the training rows, but at least 30.
MIN_SUPPORT = max(30, int(0.02 * len(y_train)))
print("MIN_SUPPORT =", MIN_SUPPORT)

cand = leaf_summary[leaf_summary["support"] >= MIN_SUPPORT].copy()
print("عدد الأوراق المؤهلة =", cand.shape[0])

# Fallback when no leaf qualifies: relax the threshold once.
if cand.empty:
    print("تنبيه: لا توجد أوراق تحقق MIN_SUPPORT. سيتم تقليل الحد للنصف.")
    MIN_SUPPORT = max(10, int(0.01 * len(y_train)))
    cand = leaf_summary[leaf_summary["support"] >= MIN_SUPPORT].copy()

# Last resort: the relaxed threshold can still exclude every leaf, which
# previously crashed on `.iloc[0]`. Rank all leaves in that case.
if cand.empty:
    print("Warning: no leaf meets the reduced MIN_SUPPORT either; using all leaves.")
    MIN_SUPPORT = 1
    cand = leaf_summary.copy()

# Rank once per criterion; ties on the rate are broken by larger support.
win_ranked = cand.sort_values(["win_rate","support"], ascending=False)
loss_ranked = cand.sort_values(["loss_rate","support"], ascending=False)

# Read the id from the integer column directly (selecting the whole row first
# would upcast it to float because the rate columns are floats).
best_win_leaf_id  = int(win_ranked["leaf_id"].iloc[0])
best_loss_leaf_id = int(loss_ranked["leaf_id"].iloc[0])

if best_win_leaf_id == best_loss_leaf_id:
    # Happens when only one leaf is eligible; the win/loss contrast is then meaningless.
    print("Warning: the best WIN leaf and the best LOSS leaf are the same leaf.")

print("Best WIN leaf_id:", best_win_leaf_id)
print("Best LOSS leaf_id:", best_loss_leaf_id)

# Show the top 10 leaves for winning and for losing.
display(win_ranked.head(10))
display(loss_ranked.head(10))



MIN_SUPPORT = 95
عدد الأوراق المؤهلة = 15
Best WIN leaf_id: 17
Best LOSS leaf_id: 26


Unnamed: 0,leaf_id,support,win_rate,loss_rate
8,17,102,0.754902,0.245098
9,19,157,0.592357,0.407643
0,5,677,0.586411,0.413589
5,13,179,0.575419,0.424581
2,8,115,0.547826,0.452174
14,28,202,0.509901,0.490099
4,12,566,0.508834,0.491166
11,24,256,0.5,0.5
1,6,106,0.490566,0.509434
7,16,210,0.480952,0.519048


Unnamed: 0,leaf_id,support,win_rate,loss_rate
12,26,188,0.324468,0.675532
3,9,113,0.433628,0.566372
10,23,1568,0.433673,0.566327
6,15,104,0.451923,0.548077
13,27,238,0.478992,0.521008
7,16,210,0.480952,0.519048
1,6,106,0.490566,0.509434
11,24,256,0.5,0.5
4,12,566,0.508834,0.491166
14,28,202,0.509901,0.490099


In [5]:
import numpy as np
import pandas as pd


# Names of the step feature columns: step_1 ... step_N.
step_cols = [f"step_{k + 1}" for k in range(N)]

# Leaf index for every training row: encode with the fitted preprocessor,
# then ask the fitted tree which leaf each encoded row ends in.
fitted_pre = pipe.named_steps["pre"]
fitted_tree = pipe.named_steps["clf"]
Xtr_enc = fitted_pre.transform(X_train)
leaf_id_train = fitted_tree.apply(Xtr_enc)

# Work on a copy so X_train itself never gains the extra column.
X_train_leaf = X_train.assign(leaf_id=leaf_id_train)

def best_trip_from_leaf(df_leaf, step_cols, pad="<PAD>"):
    """Return the most frequent real action at each step position of a leaf.

    Each step column is treated independently: padding and missing values
    are ignored and the mode of what remains is taken. The result is therefore
    a per-step summary, not necessarily a journey that occurs in the data, and
    later steps are computed only from the rows that actually reach them.

    Parameters
    ----------
    df_leaf : pandas.DataFrame
        Rows that fall into one leaf; must contain every column in ``step_cols``.
    step_cols : iterable of str
        Step column names, in journey order.
    pad : object, default "<PAD>"
        Padding token to ignore.

    Returns
    -------
    list
        One entry per step column: the most frequent value, or ``None`` when
        the column holds only padding / missing values (or ``df_leaf`` is
        empty). Ties are resolved by taking the first value ``mode()`` returns.
    """
    trip = []
    for c in step_cols:
        s = df_leaf[c]
        # Ignore padding, and also missing values: mode() skips NaN, so a
        # column of only NaN used to pass the emptiness check and then crash
        # on `.iloc[0]` of an empty mode.
        s = s[s != pad].dropna()
        if s.empty:
            trip.append(None)
        else:
            trip.append(s.mode().iloc[0])  # most frequent action at this step
    return trip

# Training rows that fall into the selected best-win / best-loss leaves.
in_win_leaf = X_train_leaf["leaf_id"] == best_win_leaf_id
in_loss_leaf = X_train_leaf["leaf_id"] == best_loss_leaf_id
win_leaf_rows = X_train_leaf.loc[in_win_leaf].copy()
loss_leaf_rows = X_train_leaf.loc[in_loss_leaf].copy()

print("WIN leaf support:", len(win_leaf_rows))
print("LOSS leaf support:", len(loss_leaf_rows))

# Most frequent action per step position inside each leaf.
win_trip = best_trip_from_leaf(win_leaf_rows, step_cols)
loss_trip = best_trip_from_leaf(loss_leaf_rows, step_cols)

print("Best WIN trip:", win_trip)
print("Best LOSS trip:", loss_trip)

# Show the most frequent actions at every step inside a leaf.
def top_actions_per_step(df_leaf, step_cols, topk=3, pad="<PAD>"):
    """Count the ``topk`` most frequent non-padding actions for each step.

    Parameters
    ----------
    df_leaf : pandas.DataFrame
        Rows of one leaf; must contain every column in ``step_cols``.
    step_cols : iterable of str
        Step column names, in journey order.
    topk : int, default 3
        Maximum number of actions reported per step.
    pad : object, default "<PAD>"
        Padding token excluded from the counts.

    Returns
    -------
    dict
        Maps each step column name to a Series of counts (action -> count),
        most frequent first; the Series is empty when the step holds only
        padding.
    """
    counts_by_step = {}
    for col in step_cols:
        column = df_leaf[col]
        real_actions = column[column != pad]
        counts_by_step[col] = real_actions.value_counts().head(topk)
    return counts_by_step

# Print the per-step action counts for the best-win leaf, then for the
# best-loss leaf, using the same report layout for both.
leaf_reports = [
    ("\nTop actions per step (WIN leaf):", win_leaf_rows),
    ("\nTop actions per step (LOSS leaf):", loss_leaf_rows),
]
for header, leaf_rows in leaf_reports:
    print(header)
    step_counts = top_actions_per_step(leaf_rows, step_cols)
    for step_name, counts in step_counts.items():
        print(step_name, "=>", counts.to_dict())




WIN leaf support: 102
LOSS leaf support: 188
Best WIN trip: ['Email', 'Email', 'Email', 'Email', 'Email', 'Email']
Best LOSS trip: ['Follow Up', 'On-Site', None, None, None, None]

Top actions per step (WIN leaf):
step_1 => {'Email': 57, 'Meeting': 23, 'Follow Up': 14}
step_2 => {'Email': 57, 'Follow Up': 16, 'Meeting': 16}
step_3 => {'Email': 48, 'Meeting': 24, 'Follow Up': 19}
step_4 => {'Email': 43, 'Meeting': 20, 'Follow Up': 7}
step_5 => {'Email': 43, 'Meeting': 10, 'Follow Up': 10}
step_6 => {'Email': 29, 'Follow Up': 18, 'Meeting': 7}

Top actions per step (LOSS leaf):
step_1 => {'Follow Up': 79, 'Meeting': 48, 'Email': 33}
step_2 => {'On-Site': 13, 'Follow Up': 8, 'Email': 6}
step_3 => {}
step_4 => {}
step_5 => {}
step_6 => {}
