In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler

In [17]:
df = pd.read_csv('data/train_storming_round.csv')
df['target'] = (df['new_policy_count'] > 0).astype(int)

In [18]:

missing = df.isnull().sum().to_frame('missing_count')
missing['missing_pct'] = (missing['missing_count'] / len(df)) * 100
display('Missing Values Summary', missing)


'Missing Values Summary'

Unnamed: 0,missing_count,missing_pct
row_id,0,0.0
agent_code,0,0.0
agent_age,0,0.0
agent_join_month,0,0.0
first_policy_sold_month,0,0.0
year_month,0,0.0
unique_proposals_last_7_days,0,0.0
unique_proposals_last_15_days,0,0.0
unique_proposals_last_21_days,0,0.0
unique_proposal,0,0.0


In [19]:

numeric = df.select_dtypes(include='number').drop(columns=['row_id', ], errors='ignore')
quantiles = numeric.quantile([0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1]).T
display('Numeric Feature Quantiles', quantiles)


'Numeric Feature Quantiles'

Unnamed: 0,0.00,0.01,0.05,0.25,0.50,0.75,0.95,0.99,1.00
agent_age,20.0,20.0,21.0,31.0,41.0,51.0,58.0,60.0,60.0
unique_proposals_last_7_days,0.0,0.0,0.0,0.0,1.0,2.0,3.0,3.0,3.0
unique_proposals_last_15_days,0.0,0.0,0.0,1.0,3.0,5.0,6.0,6.0,6.0
unique_proposals_last_21_days,0.0,0.0,1.0,5.0,10.0,15.0,19.0,20.0,20.0
unique_proposal,1.0,4.0,7.0,12.0,17.0,23.0,28.0,31.0,34.0
unique_quotations_last_7_days,0.0,0.0,0.0,1.0,2.0,3.0,4.0,4.0,4.0
unique_quotations_last_15_days,0.0,0.0,0.0,1.0,3.0,5.0,6.0,6.0,6.0
unique_quotations_last_21_days,0.0,0.0,0.0,2.0,4.0,7.0,9.0,9.0,9.0
unique_quotations,1.0,5.0,7.0,11.0,14.0,17.0,22.0,26.0,32.0
unique_customers_last_7_days,0.0,0.0,0.0,2.0,3.0,4.0,6.0,6.0,6.0


In [20]:
numeric_cols = [
    'agent_age',
    'unique_proposals_last_7_days', 'unique_proposals_last_15_days', 'unique_proposals_last_21_days', 'unique_proposal',
    'unique_quotations_last_7_days', 'unique_quotations_last_15_days', 'unique_quotations_last_21_days', 'unique_quotations',
    'unique_customers_last_7_days', 'unique_customers_last_15_days', 'unique_customers_last_21_days', 'unique_customers',
    'ANBP_value', 'net_income', 'number_of_policy_holders', 'number_of_cash_payment_policies'
]

print("Missing values before imputation:")
print(df[numeric_cols].isnull().sum())

lower = df[numeric_cols].quantile(0.01)
upper = df[numeric_cols].quantile(0.99)
df[numeric_cols] = df[numeric_cols].clip(lower=lower, upper=upper, axis=1)

# print("\Before scaling:")
# print(df[numeric_cols].describe().T[['min','25%','50%','75%','max']])

# scaler = RobustScaler()
# df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# print("\nAfter cleaning and scaling:")
# print(df[numeric_cols].describe().T[['min','25%','50%','75%','max']])

Missing values before imputation:
agent_age                          0
unique_proposals_last_7_days       0
unique_proposals_last_15_days      0
unique_proposals_last_21_days      0
unique_proposal                    0
unique_quotations_last_7_days      0
unique_quotations_last_15_days     0
unique_quotations_last_21_days     0
unique_quotations                  0
unique_customers_last_7_days       0
unique_customers_last_15_days      0
unique_customers_last_21_days      0
unique_customers                   0
ANBP_value                         0
net_income                         0
number_of_policy_holders           0
number_of_cash_payment_policies    0
dtype: int64


In [21]:
# scaler = RobustScaler()
# type(numeric_cols)
# numeric_cols = scaler.fit_transform(numeric_cols)
# # df.describe().T[['min','25%','50%','75%','max']]
# type(numeric_cols)


In [39]:
for c in ['year_month','agent_join_month','first_policy_sold_month']:
    df[c + '_period'] = (
        pd.to_datetime(df[c],
                       infer_datetime_format=True,
                       errors='coerce')
          .dt.to_period('M')
    )

df['tenure_months'] = (
    df['year_month_period'] - df['agent_join_month_period']
).apply(lambda x: x.n)

df['months_to_first_sale'] = (
    df['first_policy_sold_month_period'] - df['agent_join_month_period']
).apply(lambda x: max(x.n, 0))

# 3) Recency ratios with zero-denominator guard
for w in (7, 15, 21):
    # proposals ratio
    denom = df['unique_proposal']
    df[f'prop_ratio_last_{w}d'] = np.where(
        denom == 0,
        0,
        df[f'unique_proposals_last_{w}_days'] / denom
    )
    # quotations ratio
    denom = df['unique_quotations']
    df[f'quot_ratio_last_{w}d'] = np.where(
        denom == 0,
        0,
        df[f'unique_quotations_last_{w}_days'] / denom
    )
    # customers ratio
    denom = df['unique_customers']
    df[f'cust_ratio_last_{w}d'] = np.where(
        denom == 0,
        0,
        df[f'unique_customers_last_{w}_days'] / denom
    )

# 4) Momentum with zero-denominator guard
den = df['unique_proposals_last_15_days']
df['prop_momentum'] = np.where(
    den == 0,
    0,
    (df['unique_proposals_last_7_days'] - df['unique_proposals_last_15_days']) / den
)

den = df['unique_quotations_last_15_days']
df['quot_momentum'] = np.where(
    den == 0,
    0,
    (df['unique_quotations_last_7_days'] - df['unique_quotations_last_15_days']) / den
)

den = df['unique_customers_last_15_days']
df['cust_momentum'] = np.where(
    den == 0,
    0,
    (df['unique_customers_last_7_days'] - df['unique_customers_last_15_days']) / den
)

# 5) Value-per-unit with zero-denominator guard
den = df['unique_proposal']
df['anbp_per_prop'] = np.where(
    den == 0,
    0,
    df['ANBP_value'] / den
)

den = df['new_policy_count']
df['income_per_policy'] = np.where(
    den == 0,
    0,
    df['net_income'] / den
)

# Behavioral flags and drop as before
df['no_props_last_7d'] = (df['unique_proposals_last_7_days'] == 0).astype(int)
df['no_quot_last_7d']  = (df['unique_quotations_last_7_days']   == 0).astype(int)
df['no_cust_last_7d']  = (df['unique_customers_last_7_days']     == 0).astype(int)

df.drop(columns=[
    'year_month_period',
    'agent_join_month_period',
    'first_policy_sold_month_period'
], inplace=True)

# print(df[['tenure_months','months_to_first_sale']].head())
# print(df.filter(regex='(ratio_last|momentum|per_)').head())


df2 = df.drop(["agent_join_month", "first_policy_sold_month", "year_month", "agent_code", "row_id", "anbp_per_prop"], axis=1)

  pd.to_datetime(df[c],
  pd.to_datetime(df[c],
  pd.to_datetime(df[c],


In [40]:
scaler = RobustScaler()
np_scaled = scaler.fit_transform(df2)

In [41]:
df2_scaled = pd.DataFrame(
    np_scaled,
    columns=df2.columns, 
    index=df2.index
)
df2_scaled.head()

Unnamed: 0,agent_age,unique_proposals_last_7_days,unique_proposals_last_15_days,unique_proposals_last_21_days,unique_proposal,unique_quotations_last_7_days,unique_quotations_last_15_days,unique_quotations_last_21_days,unique_quotations,unique_customers_last_7_days,...,prop_ratio_last_21d,quot_ratio_last_21d,cust_ratio_last_21d,prop_momentum,quot_momentum,cust_momentum,income_per_policy,no_props_last_7d,no_quot_last_7d,no_cust_last_7d
0,0.2,1.0,0.75,-0.8,-0.454545,0.5,-0.75,0.0,-0.833333,0.0,...,-1.381773,0.355336,0.256,0.0,0.444444,0.225,-0.771323,0.0,0.0,0.0
1,0.35,0.0,0.25,0.2,0.363636,0.5,-0.5,0.8,0.0,0.5,...,0.0,0.761433,0.519529,-0.3,3.111111,1.1,-0.148987,0.0,0.0,0.0
2,0.6,1.0,0.5,-0.5,0.0,-0.5,0.75,0.2,0.0,0.0,...,-0.946682,0.076143,0.210286,0.12,-0.666667,-0.257143,0.777981,0.0,0.0,0.0
3,0.15,0.0,-0.75,0.2,0.0,-0.5,-0.5,0.2,-0.5,-0.5,...,0.458997,0.387639,1.019077,0.6,0.444444,0.6,-0.520661,0.0,0.0,0.0
4,-1.05,-0.5,0.75,0.0,0.0,0.5,0.75,-0.4,-0.166667,0.0,...,0.057375,-0.574004,-0.788211,-0.6,-0.222222,-0.45,1.882601,1.0,0.0,0.0


In [42]:
X = df2_scaled.drop(columns=['target'])
y = df2_scaled['target']

In [43]:
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.01)
selector.fit(X)
X_filtered = X[X.columns[selector.get_support()]]


In [44]:
corr = X_filtered.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if any(upper[col] > 0.9)]
X_uncorr = X_filtered.drop(columns=to_drop)

In [45]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def compute_vif(df):
    vif_data = pd.DataFrame({
        'feature': df.columns,
        'VIF': [variance_inflation_factor(df.values, i)
                for i in range(df.shape[1])]
    })
    return vif_data

vif_df = compute_vif(X_uncorr)
X_vif = X_uncorr.drop(columns=vif_df[vif_df['VIF'] > 10]['feature'].tolist())

In [46]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, f1_score, make_scorer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb

X = X_vif
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

lr = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=42
)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict_proba(X_test)[:,1]
print("LR AUC:", roc_auc_score(y_test, y_pred_lr))



LR AUC: 1.0


In [47]:
#randomForest Learner
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_param_dist = {
    'n_estimators': [100, 300, 500, 800],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.5]
}
rf_search = RandomizedSearchCV(
    rf,
    rf_param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
rf_search.fit(X_train, y_train)
best_rf = rf_search.best_estimator_
y_pred_rf = best_rf.predict_proba(X_test)[:,1]
print("RF best AUC:", roc_auc_score(y_test, y_pred_rf))


Fitting 3 folds for each of 30 candidates, totalling 90 fits
RF best AUC: 1.0


In [48]:
#LightBGM Learner
lgbm = lgb.LGBMClassifier(class_weight='balanced', random_state=42)
lgb_param_dist = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 300, 500],
    'min_child_samples': [5, 10, 20],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
lgb_search = RandomizedSearchCV(
    lgbm,
    lgb_param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
lgb_search.fit(X_train, y_train)
best_lgb = lgb_search.best_estimator_
y_pred_lgb = best_lgb.predict_proba(X_test)[:,1]
print("LGBM best AUC:", roc_auc_score(y_test, y_pred_lgb))


Fitting 3 folds for each of 30 candidates, totalling 90 fits
[LightGBM] [Info] Number of positive: 7348, number of negative: 816
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.143967 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 8164, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7b9aa2308ea0>
Traceback (most recent call last):
  File "/home/lakmina/Downloads/Cache/data-storm-6-bsb/venv/lib/python3.12/site-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <function _log_callback at 0x739afc114d60>
Traceback (most recent call last):
  File "/home/lakmina/Downloads/Cache/data-storm-6-bsb/venv/lib/python3.12/site-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <function _log_callback at 0x7f1e1d564d60>
Traceback (most recent call last):
  File "/home/lakmina/Downloads/Cache/data-storm-6-bsb/venv/lib/python3.12/site-packages/lightgbm/basic.py", line 287, in _log_callback
    def _log_callback(msg: bytes) -> None:
    
KeyboardI

[LightGBM] [Info] Number of positive: 7348, number of negative: 816
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.264509 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1670
[LightGBM] [Info] Number of data points in the train set: 8164, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 7348, number of negative: 816
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001258 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1669
[LightGBM] [Info] Number of data points in the train set: 8164, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


KeyboardInterrupt: 

In [None]:


# f1_scorer = make_scorer(f1_score, average='weighted')
# rf_search_f1 = RandomizedSearchCV(
#     rf,
#     rf_param_dist,
#     n_iter=20,
#     scoring=f1_scorer,
#     cv=3,
#     random_state=42,
#     n_jobs=-1
# )
# rf_search_f1.fit(X_train, y_train)
# print("RF best weighted F1:", rf_search_f1.best_score_)


In [None]:

#Stacking the Ensembled learners

estimators = [
    ('lr', lr),
    ('rf', best_rf),
    ('lgb', best_lgb)
]
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=3,
    n_jobs=-1,
    passthrough=True
)
stack.fit(X_train, y_train)
y_pred_stack = stack.predict_proba(X_test)[:,1]
print("Stacking AUC:", roc_auc_score(y_test, y_pred_stack))

results = {
    'LogisticRegression': roc_auc_score(y_test, y_pred_lr),
    'RandomForest':      roc_auc_score(y_test, y_pred_rf),
    'LightGBM':          roc_auc_score(y_test, y_pred_lgb),
    'Stacking':          roc_auc_score(y_test, y_pred_stack)
}
print("AUC comparison:", results)

In [None]:
#Hyperparameter Search

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, f1_score

param_dist = {
    'n_estimators': [100, 200, 500],
    'max_depth':    [None, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5]
}

search_auc = RandomizedSearchCV(
    clf,
    param_dist,
    n_iter=30,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1
)
search_auc.fit(X_train, y_train)
best_auc_model = search_auc.best_estimator_

f1_scorer = make_scorer(f1_score, average='weighted')
search_f1 = RandomizedSearchCV(
    clf,
    param_dist,
    n_iter=30,
    scoring=f1_scorer,
    cv=3,
    random_state=42,
    n_jobs=-1
)
search_f1.fit(X_train, y_train)
best_f1_model = search_f1.best_estimator_

In [49]:
#Bayesian optimization


import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth    = trial.suggest_int('max_depth', 5, 20)
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        class_weight='balanced',
        random_state=42
    )
    score = cross_val_score(clf, X_train, y_train, cv=3,
                            scoring='roc_auc', n_jobs=-1).mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
best_params = study.best_params
best_rf_opt = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
best_rf_opt.fit(X_train, y_train)

ModuleNotFoundError: No module named 'optuna'