In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.mixture import GaussianMixture
from sklearn.metrics import classification_report, f1_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import f1_score, classification_report


# 1. Load data
# Names of the label column and of the row-identifier column.
TARGET = 'retention_status'
ID_COL = 'founder_id'

train = pd.read_csv('BinaryTrain.csv')
test = pd.read_csv('TestBinary.csv')

# Training labels, and training features with label and identifier removed.
y = train[TARGET]
X = train.drop([TARGET, ID_COL], axis=1)

# Only the identifier is removed from the test frame; errors='ignore'
# tolerates a test file that does not carry that column.
X_test = test.drop(ID_COL, axis=1, errors='ignore')

# Encode the raw label values as integer codes. The fitted encoder is kept so
# that integer predictions can be mapped back to the original labels later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 2. Preprocessing

# Partition the feature columns by dtype. Columns that match neither list are
# discarded by the ColumnTransformer below (remainder='drop').
num_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)
cat_cols = list(X.select_dtypes(include=['object', 'category', 'bool']).columns)

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array. Categories unseen during fit do not raise at
# transform time (handle_unknown='ignore').
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='drop',
)

# Fit on the training features only; the test features reuse the fitted
# imputation values, scaling statistics and category vocabularies.
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# 3. Balance the dataset (CRUCIAL FOR GMM)

# Seeded random under-sampling of the preprocessed training matrix.
rus = RandomUnderSampler(random_state=42)
X_bal, y_bal = rus.fit_resample(X_processed, y_encoded)

# Per-class row counts after resampling, as a quick sanity check.
print("Balanced class counts:", np.bincount(y_bal))

# 4. Validation split

# Seeded 80/20 hold-out taken from the balanced data, stratified on the label
# so both folds keep the same class proportions.
X_train_s, X_val_s, y_train_s, y_val_s = train_test_split(
    X_bal,
    y_bal,
    test_size=0.2,
    stratify=y_bal,
    random_state=42,
)


# Split the training fold by class: each class gets its own density model.
# ---------------------------------------------------------
X0 = X_train_s[y_train_s == 0]
X1 = X_train_s[y_train_s == 1]

# Log class priors depend only on the split, so compute them once instead of
# once per hyper-parameter combination.
log_prior0 = np.log(len(X0) / len(X_train_s))
log_prior1 = np.log(len(X1) / len(X_train_s))

# ---------------------------------------------------------
# Hyperparameters to try
# ---------------------------------------------------------
n_components_list = [2, 3, 4, 5, 6, 8]
cov_types = ["full", "diag", "tied"]
thresholds = np.linspace(0.1, 0.9, 81)

# Start below any attainable F1 so the first candidate is always recorded and
# best_params can never still be None when it is unpacked below.
best_f1 = -np.inf
best_params = None
best_threshold = 0.5

for k in n_components_list:
    for cov in cov_types:
        # Fit two GMMs (one per class). The fixed seed makes the search
        # reproducible, matching the seeded sampler and split earlier on;
        # without it the n_init restarts differ from run to run.
        gmm0 = GaussianMixture(n_components=k, covariance_type=cov,
                               reg_covar=1e-5, max_iter=300, n_init=3,
                               random_state=42)
        gmm1 = GaussianMixture(n_components=k, covariance_type=cov,
                               reg_covar=1e-5, max_iter=300, n_init=3,
                               random_state=42)

        gmm0.fit(X0)
        gmm1.fit(X1)

        # Per-sample joint log score: log p(x | class) + log p(class).
        log_p0 = gmm0.score_samples(X_val_s) + log_prior0
        log_p1 = gmm1.score_samples(X_val_s) + log_prior1

        # Posterior of class 1 = sigmoid(log_p1 - log_p0). It is evaluated via
        # logaddexp so that a large likelihood gap cannot overflow np.exp.
        posterior = np.exp(log_p1 - np.logaddexp(log_p0, log_p1))

        # Tune the decision threshold on the validation fold.
        for thr in thresholds:
            preds = (posterior >= thr).astype(int)
            f1 = f1_score(y_val_s, preds, average="weighted")

            if f1 > best_f1:
                best_f1 = f1
                best_params = (k, cov)
                best_threshold = thr

print("\n=== BEST GMM FOUND ===")
print("Components:", best_params[0])
print("Covariance:", best_params[1])
print("Threshold:", best_threshold)
print("Val Weighted F1:", best_f1)

# Train FINAL GMMs on full training data: the whole balanced set, i.e. the
# training and validation folds together, using the selected hyper-parameters.
k, cov = best_params

# Seeded so that the submitted predictions are reproducible across runs.
gmm0_final = GaussianMixture(n_components=k, covariance_type=cov,
                             reg_covar=1e-5, max_iter=500, n_init=5,
                             random_state=42)
gmm1_final = GaussianMixture(n_components=k, covariance_type=cov,
                             reg_covar=1e-5, max_iter=500, n_init=5,
                             random_state=42)

gmm0_final.fit(X_bal[y_bal == 0])
gmm1_final.fit(X_bal[y_bal == 1])

# Predict on test set

# Class priors are the class fractions of the balanced set; np.mean over the
# boolean mask replaces a Python-level sum() over a NumPy array.
log_prior0_final = np.log(np.mean(y_bal == 0))
log_prior1_final = np.log(np.mean(y_bal == 1))

log_p0_test = gmm0_final.score_samples(X_test_processed) + log_prior0_final
log_p1_test = gmm1_final.score_samples(X_test_processed) + log_prior1_final

# Posterior of class 1 = sigmoid(log_p1 - log_p0), computed via logaddexp so
# that a large likelihood gap cannot overflow np.exp.
posterior_test = np.exp(log_p1_test - np.logaddexp(log_p0_test, log_p1_test))

# Apply the threshold chosen on the validation fold, then map the integer
# codes back to the original label values.
final_preds = (posterior_test >= best_threshold).astype(int)
final_preds_labels = le.inverse_transform(final_preds)

# Column names come from the shared constants rather than repeated literals.
submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET: final_preds_labels,
})

submission.to_csv("submission_gmm_optimised.csv", index=False)
print("Saved final GMM submission!")