# In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    PolynomialFeatures
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression


# 1. Load Data
#
# Both CSV files are read from the current working directory with pandas'
# default parsing options.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("BinaryTrain.csv", "TestBinary.csv")
)


# Drop exact duplicates
#
# A row counts as a duplicate when every column matches an earlier row;
# the first occurrence is kept.
dup_mask = train.duplicated()
dup_count = dup_mask.sum()
print("Duplicate rows found:", dup_count)

if dup_count > 0:
    train = train.drop_duplicates()
    print("Duplicates removed. New shape:", train.shape)

# 2. Split Target + Features
TARGET = "retention_status"
ID_COL = "founder_id"

# Features are every training column except the target and the identifier;
# the test frame only has the identifier removed.
X = train.drop([TARGET, ID_COL], axis="columns")
y = train[TARGET]
X_test = test.drop([ID_COL], axis="columns")

# Encode target: LabelEncoder maps the sorted class labels to 0..n_classes-1,
# and `le` is kept so predictions can be mapped back to the original labels.
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)


# 3. Handle Rare Categories in Categorical Columns
#
# Any label seen fewer than `min_count` times in the training features is
# collapsed into a single "Other" bucket. The rare labels are determined from
# the training data only and the same mapping is then applied to the test set.

cat_raw_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

min_count = 50  # you can tweak this

for col in cat_raw_cols:
    label_counts = X[col].value_counts()  # NaN is not counted
    rare_labels = label_counts.index[(label_counts < min_count).to_numpy()]
    if rare_labels.empty:
        continue

    X[col] = X[col].replace(rare_labels, "Other")
    # apply same mapping to test set
    X_test[col] = X_test[col].mask(X_test[col].isin(rare_labels), "Other")


# 4. Preprocessing with Feature Engineering
#
# Columns are routed by dtype: int64/float64 columns go through the numeric
# pipeline, object/category/bool columns through the categorical pipeline.
# ColumnTransformer drops any column that is not listed in either group.

num_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X.select_dtypes(include=["object", "category", "bool"]).columns)

# Numeric: median imputation -> standardization -> pairwise interaction terms.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    # polynomial features only on numeric columns;
    # interaction_only=True keeps linear terms + pairwise products (no squares)
    ("poly", PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)),
])

# Categorical: mode imputation -> one-hot encoding.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(
        drop="first",            # avoid perfect multicollinearity
        handle_unknown="ignore",  # unseen test categories encode as all zeros
        sparse_output=False,     # if using sklearn >= 1.2; else use sparse=False
    )),
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# 5. Train/Validation Split (for hyperparameter tuning)
#
# The raw rows are split BEFORE the preprocessor is fitted, so the imputation
# values, scaling statistics and one-hot categories applied to the validation
# fold are learned from the training fold only. Fitting on all rows first
# would leak validation information into the tuning step.
X_train_raw, X_val_raw, y_train_s, y_val_s = train_test_split(
    X, y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42
)

X_train_s = preprocessor.fit_transform(X_train_raw)
X_val_s = preprocessor.transform(X_val_raw)

# 6. Fit & transform full training and test data
#
# fit_transform re-fits every step, so the statistics learned from the
# training fold above are replaced by ones learned from all training rows.
# These matrices are the ones used for the final model and the test
# predictions.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

print("Processed train shape:", X_processed.shape)
print("Processed test shape:", X_test_processed.shape)

# 7. Hyperparameter Tuning: C + Threshold
#
# Grid-search the inverse regularization strength C and the probability
# threshold on the validation split, keeping the pair with the highest
# weighted F1. The candidates use the same penalty and solver
# (L1 + liblinear) as the models that are refit with best_C further down, so
# the selected values are tuned for the model that actually produces the
# predictions.
C_values = [0.01, 0.03, 0.1, 0.3, 1.0, 3.0, 5.0, 10.0]
thresholds = np.linspace(0.15, 0.85, 71)  # 0.15, 0.16, ..., 0.85

best_f1 = -1
best_C = None
best_thresh = None

for C in C_values:
    # n_jobs is not passed: the liblinear solver ignores it and scikit-learn
    # may emit a warning when it is set.
    logreg = LogisticRegression(
        penalty="l1",
        C=C,
        solver="liblinear",
        max_iter=1000,
        class_weight="balanced"
    )

    logreg.fit(X_train_s, y_train_s)
    # Probability of the class encoded as 1.
    val_probs = logreg.predict_proba(X_val_s)[:, 1]

    # threshold search
    for thr in thresholds:
        preds = (val_probs >= thr).astype(int)
        score = f1_score(y_val_s, preds, average="weighted")

        # Strict ">" keeps the first (C, threshold) pair when scores tie.
        if score > best_f1:
            best_f1 = score
            best_C = C
            best_thresh = thr

print("\n===== BEST HYPERPARAMETERS FOUND =====")
print(f"Best C: {best_C}")
print(f"Best Threshold: {best_thresh:.4f}")
print(f"Best Validation F1: {best_f1:.4f}")

# Train model with best C to inspect val performance once
#
# Fitted on the training split only, so the validation report below is
# computed on rows this model has not been trained on.
# n_jobs is not passed: the liblinear solver ignores it and scikit-learn may
# emit a warning when it is set.
best_model = LogisticRegression(
    penalty="l1",
    C=best_C,
    solver="liblinear",
    max_iter=1000,
    class_weight="balanced"
)
best_model.fit(X_train_s, y_train_s)

# Probability of the class encoded as 1, cut at the tuned threshold.
val_probs = best_model.predict_proba(X_val_s)[:, 1]
val_preds = (val_probs >= best_thresh).astype(int)

print("\nValidation Classification Report (with tuned C & threshold):")
print(classification_report(y_val_s, val_preds, target_names=le.classes_))

# 8. Train Final Logistic Regression on Full Data
#
# Same configuration as the validation model, but fitted on every processed
# training row (training + validation splits together).
# n_jobs is not passed: the liblinear solver ignores it and scikit-learn may
# emit a warning when it is set.

logreg_final = LogisticRegression(
    penalty="l1",
    C=best_C,
    solver="liblinear",
    max_iter=1000,
    class_weight="balanced"
)

print("\nTraining FINAL Logistic Regression on full data...")
logreg_final.fit(X_processed, y_encoded)

# 9. Test Predictions with Best Threshold
#
# Column 1 of predict_proba is the probability of the class encoded as 1.
# Rows at or above the tuned threshold are assigned 1, the rest 0, and the
# encoded predictions are mapped back to the original target labels.
positive_col = 1
test_probs = logreg_final.predict_proba(X_test_processed)[:, positive_col]
meets_threshold = test_probs >= best_thresh
test_preds_encoded = meets_threshold.astype(int)
test_preds_labels = le.inverse_transform(test_preds_encoded)


# 10. Save Submission File
#
# One row per test record: the identifier column copied from the test frame
# next to the predicted label. Column names come from ID_COL / TARGET so they
# stay in sync with the names used when the features were split off.

submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET: test_preds_labels
})

submission.to_csv("submission_logreg_optimized.csv", index=False)
print("\nLogistic Regression submission saved as submission_logreg_optimized.csv")
print(submission.head())