In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.base import BaseEstimator, ClassifierMixin

class LGBMWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, **kwargs):
        self.model = lgb.LGBMClassifier(**kwargs)
        self.kwargs = kwargs

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

    def get_params(self, deep=True):
        return self.kwargs

    def set_params(self, **params):
        self.kwargs.update(params)
        return self


data = pd.read_csv('/content/data.csv')

interaction_features = {
    'JobSatisfaction_EnvSatisfaction': data['JobSatisfaction'] * data['EnvironmentSatisfaction'],
    'YearsInCurrentRole_JobInvolvement': data['YearsInCurrentRole'] * data['JobInvolvement'],
    'TotalWorkingYears_Education': data['TotalWorkingYears'] * data['Education'],
    'PerformanceRating_Incentive': data['PerformanceRating'] * data['Incentive'],
    'YearsAtCompany_Incentive': data['YearsAtCompany'] * data['Incentive'],
    'DistanceFromHome_WorkLifeBalance': data['DistanceFromHome'] * data['WorkLifeBalance'],
    'WorkLifeBalance_JobSatisfaction': data['WorkLifeBalance'] * data['JobSatisfaction'],
    'Age_to_JobSatisfaction': data['Age'] / (data['JobSatisfaction'] + 1),
    'MonthlyIncome_to_JobLevel': data['MonthlyIncome'] / (data['JobLevel'] + 1),
}
data = data.assign(**interaction_features)

data['Attrition'] = data['Attrition'].apply(lambda x: 1 if x == 'Yes' else 0)

X = data.drop(columns=['Attrition', 'EmployeeCount', 'EmployeeNumber'])
y = data['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)

class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weight_dict = dict(zip(np.unique(y), class_weights))

base_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight=class_weight_dict),
    'Decision Tree': DecisionTreeClassifier(class_weight=class_weight_dict),
    'Random Forest': RandomForestClassifier(class_weight=class_weight_dict),
    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=class_weights[1]/class_weights[0]
    )
}

try:
    base_models['LightGBM'] = LGBMWrapper(class_weight='balanced')
except Exception as e:
    print(f"Error initializing LightGBM: {str(e)}")

param_grid_rf = {
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [3, 5, 10, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

results = {}
for name, model in base_models.items():
    try:
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('clf', model)
        ])

        if name == 'Random Forest':
            grid = RandomizedSearchCV(
                estimator=pipeline,
                param_distributions=param_grid_rf,
                cv=5,
                n_jobs=-1,
                scoring='accuracy',
                random_state=42
            )
            grid.fit(X_train, y_train)
            y_pred = grid.predict(X_test)
            y_pred_proba = grid.predict_proba(X_test)

            results[name] = {
                'Best Params': grid.best_params_,
                'Model': grid.best_estimator_
            }
        else:
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            y_pred_proba = pipeline.predict_proba(X_test)
            results[name] = {'Model': pipeline}

        results[name].update({
            'Accuracy': accuracy_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred),
            'AUC': roc_auc_score(y_test, y_pred_proba[:, 1]),
            'Confusion Matrix': confusion_matrix(y_test, y_pred),
            'Classification Report': classification_report(y_test, y_pred)
        })

    except Exception as e:
        print(f"Error training {name}: {str(e)}")
        continue

for model_name, metrics in results.items():
    print(f"{model_name} Results:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"F1 Score: {metrics['F1 Score']:.4f}")
    print(f"AUC: {metrics['AUC']:.4f}")
    print("\nConfusion Matrix:")
    print(metrics['Confusion Matrix'])
    print("\nClassification Report:")
    print(metrics['Classification Report'])

    if 'Best Params' in metrics:
        print("\nBest Parameters:")
        print(metrics['Best Params'])

Parameters: { "use_label_encoder" } are not used.



Error training XGBoost: 'super' object has no attribute '__sklearn_tags__'
[LightGBM] [Info] Number of positive: 172, number of negative: 857
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000362 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2498
[LightGBM] [Info] Number of data points in the train set: 1029, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Logistic Regression Results:
Accuracy: 0.7664
F1 Score: 0.5072
AUC: 0.8383

Confusion Matrix:
[[285  91]
 [ 12  53]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.76      0.85       376
           1       0.37      0.82      0.51        65

    accuracy                           0.77       441
   macro avg 

