In [1]:
import sys
import subprocess

# Function to install a package
def install_package(package_name):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

# Install TensorFlow
try:
    import tensorflow
    print(f"TensorFlow is already installed. Version: {tensorflow.__version__}")
except ModuleNotFoundError:
    print("TensorFlow not found. Installing TensorFlow...")
    install_package("tensorflow")
    print("TensorFlow installation complete.")


# Install hyperopt
try:
    import hyperopt
    print(f"Hyperopt is already installed. Version: {hyperopt.__version__}")
except ModuleNotFoundError:
    print("Hyperopt not found. Installing Hyperopt...")
    install_package("hyperopt")
    print("Hyperopt installation complete.")

# Import necessary libraries
import pandas as pd
import numpy as np

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score

# Gradient Boosting Libraries
from lightgbm import LGBMClassifier

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Set seaborn style
sns.set(style="whitegrid")

# Load the CSV file
file_path = r'C:\Users\Hubert N\Downloads\8.31.22 STAR Data Spreadsheet (2).csv'
df = pd.read_csv(file_path)

# Define columns of interest
columns_of_interest = [
    "Sex (0 = female; 1 = male)",
    "Neuter status (0 = castrated; 1 = not castrated)",
    "Age",
    "Duration of U-cath placement (days)",
    "U-cath placement (no = 0; yes = 1)",
    "Number of vet visits",
    'Collection Type (Cysto = 0; other =1; unknown = 2)',
    'Color (0 = yellow/dark yellow/amber; 1 = straw; 2 = red/pink; 3 = brown/light & dark brown; 4 = other)',
    'Clarity (0 = clear; 1 = opaque/cloudy; 2 = SLhazy; 3 = hazy)',
    'USG', 'PH', 'Protein', 'Glucose', 'Ketones',
    'Bilirubin', 'Hemoprotein', 'Sediment Vol', 'Volume',
    'Lipid Layer', 'WBC (simplified)', 'Pyuria (1 if >/=4)',
    'RBC (simplified)', 'Crystals (0 = none 1 = rare; 2 = few; 3 = mod; 4 = many)',
    'Triple Phos', 'Bilirubin.1',
    'Epithelial Cells-Transitional', 'Epithelial Cells-Squamous',
    'Epithelial Cells-Renal', 'Epithelial Cells-Caudate',
    'Casts-Hyaline', 'Casts-Granular',
    'Bacteria quantity', 'Bac Type', 'Lipid Droplets', 'Sperm'
]

# Ensure all columns are present
missing_columns = [col for col in columns_of_interest if col not in df.columns]
if missing_columns:
    print("Missing columns:", missing_columns)
else:
    features = df[columns_of_interest]

    # Filter and encode the target variable
    df = df[df["Ampicillin"].isin(["R", "S"])]  # Keep only 'R' and 'S' labels
    df['Ampicillin'] = df['Ampicillin'].map({'S': 0, 'R': 1})
    target = df['Ampicillin']

    # Ensure features and target are aligned
    features = features.loc[target.index]

    # Manually define categorical columns
    categorical_cols = [
        "Sex (0 = female; 1 = male)",
        "Neuter status (0 = castrated; 1 = not castrated)",
        "U-cath placement (no = 0; yes = 1)",
        'Collection Type (Cysto = 0; other =1; unknown = 2)',
        'Color (0 = yellow/dark yellow/amber; 1 = straw; 2 = red/pink; 3 = brown/light & dark brown; 4 = other)',
        'Clarity (0 = clear; 1 = opaque/cloudy; 2 = SLhazy; 3 = hazy)',
        'Lipid Layer', 'Pyuria (1 if >/=4)',
        'Crystals (0 = none 1 = rare; 2 = few; 3 = mod; 4 = many)',
        'Triple Phos', 'Bilirubin.1',
        'Epithelial Cells-Transitional', 'Epithelial Cells-Squamous',
        'Epithelial Cells-Renal', 'Epithelial Cells-Caudate',
        'Casts-Hyaline', 'Casts-Granular',
        'Bacteria quantity', 'Bac Type', 'Lipid Droplets', 'Sperm'
    ]

    # Manually define numerical columns
    numerical_cols = [col for col in features.columns if col not in categorical_cols]

    # Convert categorical columns to 'object' dtype
    features[categorical_cols] = features[categorical_cols].astype('object')

    # Impute missing values for numerical columns with median
    numerical_imputer = SimpleImputer(strategy='median')
    features[numerical_cols] = numerical_imputer.fit_transform(features[numerical_cols])

    # Impute missing values for categorical columns with mode
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    features[categorical_cols] = categorical_imputer.fit_transform(features[categorical_cols])

    # One-hot encode categorical variables
    features = pd.get_dummies(features, columns=categorical_cols, drop_first=True)

    # Standardize numerical features
    scaler = StandardScaler()
    features[numerical_cols] = scaler.fit_transform(features[numerical_cols])

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # -------------------- LightGBM with LGBMClassifier --------------------

    # Split the dataset into training and validation sets
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # Define parameters
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9
    }

    # Initialize LGBMClassifier
    lgb_clf = LGBMClassifier(**lgb_params, n_estimators=100)

    # Fit the model
    lgb_clf.fit(
        X_train_split, y_train_split,
        eval_set=[(X_val_split, y_val_split)],
        eval_metric='logloss'
    )

    # Predict on the test set
    y_pred_lgb = lgb_clf.predict(X_test)

    # Evaluate the model
    accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
    print(f"LightGBM Model Accuracy: {accuracy_lgb:.4f}")

    print("LightGBM Classification Report:")
    print(classification_report(y_test, y_pred_lgb))


TensorFlow is already installed. Version: 2.18.0
Hyperopt is already installed. Version: 0.2.7
LightGBM Model Accuracy: 0.7222
LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.96      0.84        80
           1       0.25      0.04      0.06        28

    accuracy                           0.72       108
   macro avg       0.50      0.50      0.45       108
weighted avg       0.61      0.72      0.64       108



In [25]:
import sys
import subprocess

# Function to install a package
def install_package(package_name):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

# Install necessary libraries
try:
    import lightgbm as lgb
except ModuleNotFoundError:
    install_package("lightgbm")
    import lightgbm as lgb

try:
    from catboost import CatBoostClassifier
except ModuleNotFoundError:
    install_package("catboost")
    from catboost import CatBoostClassifier

try:
    import xgboost as xgb
except ModuleNotFoundError:
    install_package("xgboost")
    import xgboost as xgb

try:
    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.combine import SMOTEENN
except ModuleNotFoundError:
    install_package("imbalanced-learn")
    from imblearn.over_sampling import SMOTE, ADASYN
    from imblearn.combine import SMOTEENN

try:
    import optuna
except ModuleNotFoundError:
    install_package("optuna")
    import optuna

# Import necessary libraries
import pandas as pd
import numpy as np

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Load the CSV file
file_path = r'C:\Users\Hubert N\Downloads\8.31.22 STAR Data Spreadsheet (2).csv'
df = pd.read_csv(file_path)

# Define columns of interest
columns_of_interest = [
    "Sex (0 = female; 1 = male)",
    "Neuter status (0 = castrated; 1 = not castrated)",
    "Age",
    "Duration of U-cath placement (days)",
    "U-cath placement (no = 0; yes = 1)",
    "Number of vet visits",
    'Collection Type (Cysto = 0; other =1; unknown = 2)',
    'Color (0 = yellow/dark yellow/amber; 1 = straw; 2 = red/pink; 3 = brown/light & dark brown; 4 = other)',
    'Clarity (0 = clear; 1 = opaque/cloudy; 2 = SLhazy; 3 = hazy)',
    'USG', 'PH', 'Protein', 'Glucose', 'Ketones',
    'Bilirubin', 'Hemoprotein', 'Sediment Vol', 'Volume',
    'Lipid Layer', 'WBC (simplified)', 'Pyuria (1 if >/=4)',
    'RBC (simplified)', 'Crystals (0 = none 1 = rare; 2 = few; 3 = mod; 4 = many)',
    'Triple Phos', 'Bilirubin.1',
    'Epithelial Cells-Transitional', 'Epithelial Cells-Squamous',
    'Epithelial Cells-Renal', 'Epithelial Cells-Caudate',
    'Casts-Hyaline', 'Casts-Granular',
    'Bacteria quantity', 'Bac Type', 'Lipid Droplets', 'Sperm'
]

# Ensure all columns are present
missing_columns = [col for col in columns_of_interest if col not in df.columns]
if missing_columns:
    print("Missing columns:", missing_columns)
    sys.exit()

# Filter and encode the target variable
df = df[df["Ampicillin"].isin(["R", "S"])]  # Keep only 'R' and 'S' labels
df['Ampicillin'] = df['Ampicillin'].map({'S': 0, 'R': 1})
target = df['Ampicillin']

# Select features
features = df[columns_of_interest]

# Manually define categorical columns
categorical_cols = [
    "Sex (0 = female; 1 = male)",
    "Neuter status (0 = castrated; 1 = not castrated)",
    "U-cath placement (no = 0; yes = 1)",
    'Collection Type (Cysto = 0; other =1; unknown = 2)',
    'Color (0 = yellow/dark yellow/amber; 1 = straw; 2 = red/pink; 3 = brown/light & dark brown; 4 = other)',
    'Clarity (0 = clear; 1 = opaque/cloudy; 2 = SLhazy; 3 = hazy)',
    'Lipid Layer', 'Pyuria (1 if >/=4)',
    'Crystals (0 = none 1 = rare; 2 = few; 3 = mod; 4 = many)',
    'Triple Phos', 'Bilirubin.1',
    'Epithelial Cells-Transitional', 'Epithelial Cells-Squamous',
    'Epithelial Cells-Renal', 'Epithelial Cells-Caudate',
    'Casts-Hyaline', 'Casts-Granular',
    'Bacteria quantity', 'Bac Type', 'Lipid Droplets', 'Sperm'
]

# Update categorical_cols based on available columns
categorical_cols = [col for col in categorical_cols if col in features.columns]
numerical_cols = [col for col in features.columns if col not in categorical_cols]

# Data Preprocessing Enhancements

# Impute missing values
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')

features[numerical_cols] = imputer_num.fit_transform(features[numerical_cols])
features[categorical_cols] = imputer_cat.fit_transform(features[categorical_cols])

# Scale numerical features
scaler = StandardScaler()
features[numerical_cols] = scaler.fit_transform(features[numerical_cols])

# One-hot encode categorical variables
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
encoded_cat = pd.DataFrame(encoder.fit_transform(features[categorical_cols]), index=features.index)
encoded_cat.columns = encoder.get_feature_names_out(categorical_cols)

# Combine scaled numerical and encoded categorical features
features_preprocessed = pd.concat([features[numerical_cols], encoded_cat], axis=1)

# Feature Engineering

# Generate polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(features_preprocessed)
poly_feature_names = poly.get_feature_names_out(features_preprocessed.columns)
features_poly = pd.DataFrame(poly_features, columns=poly_feature_names, index=features.index)

# Feature Selection using RandomForest
from sklearn.feature_selection import SelectFromModel

# Initial feature selection using RandomForest
rf = RandomForestClassifier(random_state=42)
rf.fit(features_poly, target)

selector = SelectFromModel(rf, threshold='median', prefit=True)
features_selected = selector.transform(features_poly)
selected_feature_names = features_poly.columns[selector.get_support()]

# Update features
features_final = pd.DataFrame(features_selected, columns=selected_feature_names, index=features.index)

# Handling Class Imbalance

# Try different oversampling techniques and select the best one based on AUC score
oversampling_methods = {
    'SMOTE': SMOTE(random_state=42),
    'ADASYN': ADASYN(random_state=42),
    'SMOTEENN': SMOTEENN(random_state=42)
}

best_auc = 0
best_method = None
X_best_resampled = None
y_best_resampled = None

for method_name, sampler in oversampling_methods.items():
    if method_name == 'SMOTEENN':
        X_resampled, y_resampled = sampler.fit_resample(features_final, target)
    else:
        X_resampled, y_resampled = sampler.fit_resample(features_final, target)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
    )
    
    # Simple model for evaluation
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    
    if auc > best_auc:
        best_auc = auc
        best_method = method_name
        X_best_resampled = X_resampled
        y_best_resampled = y_resampled

# Use the best oversampling method
print(f"Best oversampling method: {best_method} with AUC: {best_auc:.4f}")

# Use the resampled data from the best method
X_resampled = X_best_resampled
y_resampled = y_best_resampled

# Model Selection and Hyperparameter Tuning

# Define models to try
models = {
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='auc', use_label_encoder=False),
    'RandomForest': RandomForestClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(random_state=42, eval_metric='AUC', silent=True),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# Split the data for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

model_aucs = {}

# Hyperparameter tuning for LightGBM using Optuna
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-5, 10),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-5, 10)
    }
    model = lgb.LGBMClassifier(**param, random_state=42)
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    return auc

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
best_lgb = lgb.LGBMClassifier(**best_params, random_state=42)
best_lgb.fit(X_train, y_train)
y_pred_proba = best_lgb.predict_proba(X_test)[:, 1]
auc_lgb = roc_auc_score(y_test, y_pred_proba)
model_aucs['LightGBM'] = auc_lgb
print(f"LightGBM AUC: {auc_lgb:.4f}")

# Evaluate other models
for model_name, model in models.items():
    if model_name == 'LightGBM':
        continue  # Already evaluated
    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    model_aucs[model_name] = auc
    print(f"{model_name} AUC: {auc:.4f}")

# Cross-Validation and Evaluation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_auc_scores = cross_val_score(best_lgb, X_resampled, y_resampled, cv=cv, scoring='roc_auc', n_jobs=-1)
print(f"Cross-Validated AUC Scores for LightGBM: {cv_auc_scores}")
print(f"Mean Cross-Validated AUC Score for LightGBM: {cv_auc_scores.mean():.4f}")

# Ensemble Methods
voting_clf = VotingClassifier(
    estimators=[
        ('lgb', best_lgb),
        ('xgb', models['XGBoost']),
        ('rf', models['RandomForest']),
        ('cat', models['CatBoost'])
    ],
    voting='soft',
    n_jobs=-1
)

voting_clf.fit(X_train, y_train)
y_pred_proba_voting = voting_clf.predict_proba(X_test)[:, 1]
auc_voting = roc_auc_score(y_test, y_pred_proba_voting)
model_aucs['VotingClassifier'] = auc_voting
print(f"VotingClassifier AUC: {auc_voting:.4f}")

# Print the highest AUC score achieved
highest_auc_model = max(model_aucs, key=model_aucs.get)
highest_auc = model_aucs[highest_auc_model]
print(f"\nHighest AUC Score Achieved: {highest_auc:.4f} with {highest_auc_model}")



[I 2024-11-24 04:31:32,424] A new study created in memory with name: no-name-e59ed1db-9d8c-4518-bc14-bcb95ab992d3


Best oversampling method: SMOTE with AUC: 0.9468


[I 2024-11-24 04:31:33,826] Trial 0 finished with value: 0.9054687499999999 and parameters: {'max_depth': 4, 'num_leaves': 24, 'learning_rate': 0.009897060728725215, 'n_estimators': 611, 'min_child_samples': 13, 'subsample': 0.716047607256282, 'colsample_bytree': 0.5480697263274124, 'reg_alpha': 3.151981407188883, 'reg_lambda': 0.0021012713122172157}. Best is trial 0 with value: 0.9054687499999999.
[I 2024-11-24 04:31:34,310] Trial 1 finished with value: 0.89171875 and parameters: {'max_depth': 5, 'num_leaves': 40, 'learning_rate': 0.014485704892427365, 'n_estimators': 739, 'min_child_samples': 79, 'subsample': 0.6752690562404828, 'colsample_bytree': 0.6877012762729982, 'reg_alpha': 0.00367966087309512, 'reg_lambda': 9.385358133702345e-05}. Best is trial 0 with value: 0.9054687499999999.
[I 2024-11-24 04:31:34,753] Trial 2 finished with value: 0.8729687499999998 and parameters: {'max_depth': 15, 'num_leaves': 21, 'learning_rate': 0.005815149937267976, 'n_estimators': 640, 'min_child_sa

LightGBM AUC: 0.9266
XGBoost AUC: 0.9031
RandomForest AUC: 0.9468
CatBoost AUC: 0.9261
LogisticRegression AUC: 0.8575


BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.