<a href="https://colab.research.google.com/github/jc890/python/blob/master/HRanalyticsCodingassessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import lightgbm as lgb

# ==========================================
# Load Data
# ==========================================
train = pd.read_csv('/content/train_LZdllcl.csv')
test = pd.read_csv('/content/test_2umaH9m.csv')
sample = pd.read_csv('/content/sample_submission_M0L0uXE.csv')



In [2]:
# ==========================================
# Fill Missing Values
# ==========================================
train['education'].fillna(train['education'].mode()[0], inplace=True)
test['education'].fillna(train['education'].mode()[0], inplace=True)

train['previous_year_rating'].fillna(train['previous_year_rating'].median(), inplace=True)
test['previous_year_rating'].fillna(train['previous_year_rating'].median(), inplace=True)

# ==========================================
# Feature Engineering
# ==========================================
for df in [train, test]:
    df['trainings_per_year'] = df['no_of_trainings'] / (df['length_of_service'] + 1)
    df['is_experienced'] = (df['length_of_service'] > 5).astype(int)
    df['is_high_perf'] = ((df['KPIs_met >80%'] == 1) & (df['previous_year_rating'] >= 4)).astype(int)
    df['region_freq'] = df['region'].map(train['region'].value_counts())
    df['region_freq'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['education'].fillna(train['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['education'].fillna(train['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inter

In [3]:

# ==========================================
# Encoding
# ==========================================
cat_features = ['department', 'region', 'education', 'gender', 'recruitment_channel']
lbl = LabelEncoder()
for col in cat_features:
    combined = pd.concat([train[col], test[col]]).astype(str)
    lbl.fit(combined)
    train[col] = lbl.transform(train[col].astype(str))
    test[col] = lbl.transform(test[col].astype(str))


In [4]:

# ==========================================
# Split Features and Target
# ==========================================
X = train.drop(['employee_id', 'is_promoted'], axis=1)
y = train['is_promoted']
X_test = test.drop(['employee_id'], axis=1)


In [14]:
# ==========================================
# Custom F1 evaluation with threshold scan
# ==========================================
fold_thresholds = []  # store best threshold per fold

def lgb_f1_score_threshold_search(y_pred, dataset):
    # Extract true labels
    y_true = dataset.get_label()

    best_f1 = 0
    best_thresh = 0
    for t in np.linspace(0.1, 0.9, 50):
        f1 = f1_score(y_true, (y_pred > t).astype(int))
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = t

    # Store threshold per fold for later use
    fold_thresholds.append(best_thresh)

    # Return in (name, value, is_higher_better) format
    return 'f1', best_f1, True


callbacks = [
    lgb.early_stopping(stopping_rounds=50, verbose=True),
    lgb.log_evaluation(period=100)  # optional, logs every 100 rounds
]



In [15]:
# ==========================================
# LightGBM parameters
# ==========================================
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'binary_logloss',  # internal loss
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42
}

# ==========================================
# Training with Stratified K-Fold
# ==========================================
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test))

for fold, (train_idx, val_idx) in enumerate(folds.split(X, y)):
    print(f"\n---- Fold {fold+1} ----")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = lgb.Dataset(X_train, y_train)
    dval = lgb.Dataset(X_val, y_val, reference=dtrain)

    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dval],
        feval=lgb_f1_score_threshold_search,
        callbacks=callbacks
    )

    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    test_pred = model.predict(X_test, num_iteration=model.best_iteration)

    oof_preds[val_idx] = val_pred
    test_preds += test_pred / folds.n_splits




---- Fold 1 ----
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.162306	valid_0's f1: 0.536765

---- Fold 2 ----
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.170079	valid_0's f1: 0.491803

---- Fold 3 ----
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.167639	valid_0's f1: 0.535588

---- Fold 4 ----
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.162473	valid_0's f1: 0.549769

---- Fold 5 ----
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.170228	valid_0's f1: 0.507726


In [16]:

# ==========================================
# Apply average best threshold across folds
# ==========================================
avg_thresh = np.mean(fold_thresholds)
print(f"\nAverage Best Threshold Across Folds: {avg_thresh:.3f}")

final_preds = (test_preds > avg_thresh).astype(int)




Average Best Threshold Across Folds: 0.227


In [17]:
# ==========================================
# Safe Submission
# ==========================================
pred_df = pd.DataFrame({'employee_id': test['employee_id'], 'is_promoted': final_preds})
submission = sample[['employee_id']].merge(pred_df, on='employee_id', how='left')
submission.to_csv('submission.csv', index=False)
print("\n✅ Submission created with automatic F1-optimized threshold")


✅ Submission created with automatic F1-optimized threshold
